diff --git a/catalog/dags/common/extensions.py b/catalog/dags/common/extensions.py index 0a82e1263d..0a1e749551 100644 --- a/catalog/dags/common/extensions.py +++ b/catalog/dags/common/extensions.py @@ -1,12 +1,78 @@ -EXTENSIONS = { - "image": {"jpg", "jpeg", "png", "gif", "bmp", "webp", "tiff", "tif", "svg"}, - "audio": {"mp3", "ogg", "wav", "aiff", "flac", "wma", "mp4", "aac", "m4a", "m4b"}, +import mimetypes + +from common.constants import MEDIA_TYPES as SUPPORTED_TYPES + + +FILETYPE_EQUIVALENTS = { + # Image extensions + "jpeg": "jpg", + "tif": "tiff", + # Audio extensions + "midi": "mid", } +# Partially taken from Wikimedia aliases +# https://doc.wikimedia.org/mediawiki-core/master/php/MimeMap_8php_source.html +MIME_TYPE_ALIASES = { + # Image aliases + "image/x-bmp": "image/bmp", + "image/x-ms-bmp": "image/bmp", + "image/x-png": "image/png", + "image/pjpeg": "image/jpeg", + "image/x-ico": "image/vnd.microsoft.icon", + "image/x-icon": "image/vnd.microsoft.icon", + "image/svg": "image/svg+xml", + "image/x.djvu": "image/vnd.djvu", + "image/x-djvu": "image/vnd.djvu", + "image/jpeg2000": "image/jp2", + "image/jpeg200-image": "image/jp2", + "image/x-jpeg200-image": "image/jp2", + # Audio aliases + "audio/mp3": "audio/mpeg", + "audio/mpeg3": "audio/mpeg", + "audio/x-flac": "audio/flac", + "audio/mid": "audio/midi", + "audio/wav": "audio/x-wav", + "audio/wave": "audio/x-wav", +} + + +mimetypes.add_type("audio/midi", ".mid") +mimetypes.add_type("audio/midi", ".midi") +mimetypes.add_type("audio/x-matroska", ".mka") +mimetypes.add_type("audio/wav", ".wav") + + +class InvalidFiletypeError(Exception): + def __init__(self, actual_filetype: str, expected_filetype: str, message: str = ""): + self.actual_filetype = actual_filetype + self.expected_filetype = expected_filetype + if not message: + message = ( + f"Extracted media type `{self.actual_filetype}` does not match " + f"expected media type `{self.expected_filetype}`." + ) + super().__init__(message) + + +def get_extension_from_mimetype(mime_type: str | None) -> str | None: + if not mime_type: + return + mime_type = MIME_TYPE_ALIASES.get(mime_type, mime_type) + ext = mimetypes.guess_extension(mime_type) + # Removes the leading dot if there is an extension + return ext[1:] if ext else None + + +def extract_filetype(url: str) -> tuple[str | None, str | None]: + """ + Extract the filetype from a media url extension. -def extract_filetype(url: str, media_type: str) -> str | None: - """Extract the filetype from a media url extension.""" - possible_filetype = url.split(".")[-1] - if possible_filetype in EXTENSIONS.get(media_type, {}): - return possible_filetype - return None + Returns only if the media type guessed is a supported type. + """ + if mime_type := mimetypes.guess_type(url)[0]: + media_type, _ = mime_type.split("/") + if media_type in SUPPORTED_TYPES: + filetype = get_extension_from_mimetype(mime_type) + return filetype, media_type + return None, None diff --git a/catalog/dags/common/storage/media.py b/catalog/dags/common/storage/media.py index e977adde3f..fcf8113a71 100644 --- a/catalog/dags/common/storage/media.py +++ b/catalog/dags/common/storage/media.py @@ -4,7 +4,11 @@ from datetime import datetime from common import urls -from common.extensions import extract_filetype +from common.extensions import ( + FILETYPE_EQUIVALENTS, + InvalidFiletypeError, + extract_filetype, +) from common.loader import provider_details as prov from common.storage.tsv_columns import CURRENT_VERSION @@ -39,7 +43,6 @@ COMMON_CRAWL = "commoncrawl" PROVIDER_API = "provider_api" -FILETYPE_EQUIVALENTS = {"jpeg": "jpg", "tif": "tiff"} PG_INTEGER_MAXIMUM = 2147483647 @@ -113,8 +116,10 @@ def clean_media_metadata(self, **media_data) -> dict | None: - add `provider`, - add default `category`, if available. - Raises an error if missing any of the required fields: - `license_info`, `foreign_identifier`, `foreign_landing_url`, or `url`. + Raises an error if missing any of the required fields: `license_info`, + `foreign_identifier`, `foreign_landing_url`, or `url`. Or if an extracted + media type (guessed from extension in the URL) does not match the media type + of the class when the API does not return the filetype. """ for field in [ "license_info", @@ -304,17 +309,22 @@ def _format_raw_tag(self, tag: str) -> dict: def _validate_filetype(self, filetype: str | None, url: str) -> str | None: """ - Extract filetype from the media URL if filetype is None. + Extract filetype from the media URL if filetype is None, check that it + corresponds to the media type of the class, and normalize filetypes + that have variants such as jpg/jpeg and tiff/tif. - Unifies filetypes that have variants such as jpg/jpeg and tiff/tif. :param filetype: Optional filetype string. + :param url: The direct URL to the media. :return: filetype string or None """ - if filetype is None: - filetype = extract_filetype(url, self.media_type) - if self.media_type != "image": - return filetype - return FILETYPE_EQUIVALENTS.get(filetype, filetype) + if filetype is not None: + filetype = filetype.lower() + return FILETYPE_EQUIVALENTS.get(filetype, filetype) + + filetype, extracted_media_type = extract_filetype(url) + if extracted_media_type is not None and extracted_media_type != self.media_type: + raise InvalidFiletypeError(extracted_media_type, self.media_type) + return filetype @staticmethod def _validate_integer(value: int | None) -> int | None: diff --git a/catalog/dags/providers/provider_api_scripts/wikimedia_commons.py b/catalog/dags/providers/provider_api_scripts/wikimedia_commons.py index 3e41c1f5dc..d398d271be 100644 --- a/catalog/dags/providers/provider_api_scripts/wikimedia_commons.py +++ b/catalog/dags/providers/provider_api_scripts/wikimedia_commons.py @@ -115,7 +115,7 @@ import lxml.html as html from common.constants import AUDIO, IMAGE -from common.extensions import EXTENSIONS +from common.extensions import get_extension_from_mimetype from common.licenses import LicenseInfo, get_license_info from common.loader import provider_details as prov from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester @@ -162,7 +162,7 @@ class ReturnProps: query_no_popularity = "imageinfo" # All media info we care about - media_all = "url|user|dimensions|extmetadata|mediatype|size|metadata" + media_all = "url|user|dimensions|extmetadata|mediatype|mime|size|metadata" # Everything minus the metadata, which is only necessary for audio and can # balloon for PDFs which are considered images by Wikimedia media_no_metadata = "url|user|dimensions|extmetadata|mediatype|size" @@ -320,7 +320,7 @@ def get_record_data(self, record): creator, creator_url = self.extract_creator_info(media_info) title = self.extract_title(media_info) filesize = media_info.get("size", 0) # in bytes - filetype = self.extract_file_type(url, valid_media_type) + filetype = get_extension_from_mimetype(media_info.get("mime")) meta_data = self.create_meta_data_dict(record) record_data = { @@ -533,28 +533,6 @@ def extract_category_info(media_info): categories_list = categories_string.split("|") return categories_list - @staticmethod - def extract_file_type(url, media_type): - """ - Extract the filetype from extension in the media url. - - In case of images, we check if the filetype is in the list of valid image - types, so we can ignore other media types considered as videos (eg: .ogv). - """ - image_extensions = EXTENSIONS.get(IMAGE, {}) - if filetype := url.split(".")[-1]: - filetype = filetype.lower() - if ( - media_type == IMAGE and filetype in image_extensions - ) or media_type == AUDIO: - return filetype - - logger.warning( - f"Invalid filetype for `{media_type}` media type: {filetype}" - ) - - return None - @staticmethod def extract_license_info(media_info) -> LicenseInfo | None: license_url = ( diff --git a/catalog/tests/dags/common/storage/test_audio.py b/catalog/tests/dags/common/storage/test_audio.py index 936b8c8460..31eb446195 100644 --- a/catalog/tests/dags/common/storage/test_audio.py +++ b/catalog/tests/dags/common/storage/test_audio.py @@ -2,6 +2,7 @@ import pytest +from common.extensions import InvalidFiletypeError from common.licenses import LicenseInfo from common.storage import audio from tests.dags.common.storage import test_media @@ -61,7 +62,7 @@ def test_AudioStore_add_item_adds_realistic_audio_to_buffer(): audio_store.add_item( foreign_identifier="01", foreign_landing_url="https://audios.org/audio01", - url="https://audios.org/audio01.jpg", + url="https://audios.org/audio01.mp3", license_info=license_info, ingestion_type="provider_api", ) @@ -73,25 +74,25 @@ def test_AudioStore_add_item_adds_multiple_audios_to_buffer(): audio_store.add_item( foreign_identifier="01", foreign_landing_url="https://audios.org/audio01", - url="https://audios.org/audio01.jpg", + url="https://audios.org/audio01.mp3", license_info=PD_LICENSE_INFO, ) audio_store.add_item( foreign_identifier="02", foreign_landing_url="https://audios.org/audio02", - url="https://audios.org/audio02.jpg", + url="https://audios.org/audio02.mp3", license_info=PD_LICENSE_INFO, ) audio_store.add_item( foreign_identifier="03", foreign_landing_url="https://audios.org/audio03", - url="https://audios.org/audio03.jpg", + url="https://audios.org/audio03.mp3", license_info=PD_LICENSE_INFO, ) audio_store.add_item( foreign_identifier="04", foreign_landing_url="https://audios.org/audio04", - url="https://audios.org/audio04.jpg", + url="https://audios.org/audio04.mp3", license_info=PD_LICENSE_INFO, ) assert len(audio_store._media_buffer) == 4 @@ -105,25 +106,25 @@ def test_AudioStore_add_item_flushes_buffer(tmpdir): audio_store.add_item( foreign_identifier="01", foreign_landing_url="https://audios.org/audio01", - url="https://audios.org/audio01.jpg", + url="https://audios.org/audio01.mp3", license_info=PD_LICENSE_INFO, ) audio_store.add_item( foreign_identifier="02", foreign_landing_url="https://audios.org/audio02", - url="https://audios.org/audio02.jpg", + url="https://audios.org/audio02.mp3", license_info=PD_LICENSE_INFO, ) audio_store.add_item( foreign_identifier="03", foreign_landing_url="https://audios.org/audio03", - url="https://audios.org/audio03.jpg", + url="https://audios.org/audio03.mp3", license_info=PD_LICENSE_INFO, ) audio_store.add_item( foreign_identifier="04", foreign_landing_url="https://audios.org/audio04", - url="https://audios.org/audio04.jpg", + url="https://audios.org/audio04.mp3", license_info=PD_LICENSE_INFO, ) assert len(audio_store._media_buffer) == 1 @@ -142,24 +143,69 @@ def test_AudioStore_produces_correct_total_audios(): audio_store.add_item( foreign_identifier="01", foreign_landing_url="https://audios.org/audio01", - url="https://audios.org/audio01.jpg", + url="https://audios.org/audio01.mp3", license_info=PD_LICENSE_INFO, ) audio_store.add_item( foreign_identifier="02", foreign_landing_url="https://audios.org/audio02", - url="https://audios.org/audio02.jpg", + url="https://audios.org/audio02.mp3", license_info=PD_LICENSE_INFO, ) audio_store.add_item( foreign_identifier="03", foreign_landing_url="https://audios.org/audio03", - url="https://audios.org/audio03.jpg", + url="https://audios.org/audio03.mp3", license_info=PD_LICENSE_INFO, ) assert audio_store.total_items == 3 +@pytest.mark.parametrize( + "filetype, url, expected_filetype", + [ + # The value provided prevails over the url extension + ("ogg", "http://example.com/audio", "ogg"), + ("ogg", "http://example.com/audio.wav", "ogg"), + # The filetype is guessed from the URL extension + (None, "http://example.com/audio.mp3", "mp3"), + (None, "http://example.com/audio.WAV", "wav"), + (None, "http://example.com/audio.mid", "mid"), + # Unifies filetypes + ("midi", "http://example.com/audio.mid", "mid"), + (None, "http://example.com/audio.midi", "mid"), + ], +) +def test_AudioStore_validate_filetype(filetype, url, expected_filetype): + audio_store = audio.MockAudioStore("test_provider") + actual_filetype = audio_store._validate_filetype(filetype, url) + assert actual_filetype == expected_filetype + + +@pytest.mark.parametrize( + "url", + [ + "https://example.com/non-audio.apng", + "https://example.com/non-audio.avif", + "https://example.com/non-audio.bmp", + "https://example.com/non-audio.djvu", + "https://example.com/non-audio.gif", + "https://example.com/non-audio.ICO", + "https://example.com/non-audio.jpg", + "https://example.com/non-audio.Jpeg", + "https://example.com/non-audio.png", + "https://example.com/non-audio.svg", + "https://example.com/non-audio.tif", + "https://example.com/non-audio.tiFF", + "https://example.com/non-audio.webp", + ], +) +def test_AudioStore_validate_filetype_raises_error_on_invalid_filetype(url): + image_store = audio.MockAudioStore("test_provider") + with pytest.raises(InvalidFiletypeError): + image_store._validate_filetype(filetype=None, url=url) + + @test_media.INT_MAX_PARAMETERIZATION def test_AudioStore_get_audio_validates_duration(value, expected): audio_store = audio.AudioStore("test_provider") diff --git a/catalog/tests/dags/common/storage/test_media.py b/catalog/tests/dags/common/storage/test_media.py index 6969c657c1..82d7a6d195 100644 --- a/catalog/tests/dags/common/storage/test_media.py +++ b/catalog/tests/dags/common/storage/test_media.py @@ -9,6 +9,7 @@ import pytest from common import urls +from common.extensions import InvalidFiletypeError from common.licenses import LicenseInfo from common.loader import provider_details as prov from common.storage import image, media @@ -528,7 +529,6 @@ def test_MediaStore_get_image_enriches_singleton_tags(): (None, "https://example.com/image.jpg", "jpg"), (None, "https://example.com/image.jpeg", "jpg"), (None, "https://example.com/image.tif", "tiff"), - (None, "https://example.com/image.mp3", None), ("jpeg", "https://example.com/image.gif", "jpg"), ], ) @@ -606,6 +606,53 @@ def test_MediaStore_validate_integer(value, expected): assert actual == expected +@pytest.mark.parametrize( + "filetype, url, expected_filetype", + [ + # The value provided prevails over the url extension + ("jpg", "http://example.com/image.bmp", "jpg"), + # Lowercase the filetype + ("JPG", "http://example.com/image.bmp", "jpg"), + # The filetype is guessed from the URL extension + (None, "http://example.com/image.jpg", "jpg"), + (None, "http://example.com/image.PNG", "png"), + # Unifies filetypes + ("jpeg", "http://example.com/image.jpeg", "jpg"), + (None, "http://example.com/image.jpeg", "jpg"), + ("tif", "http://example.com/image.tif", "tiff"), + (None, "http://example.com/image.tif", "tiff"), + ], +) +def test_MediaStore_validate_filetype(filetype, url, expected_filetype): + image_store = image.MockImageStore("test_provider") + actual = image_store._validate_filetype(filetype=filetype, url=url) + assert actual == expected_filetype + + +@pytest.mark.parametrize( + "url", + [ + "https://example.com/non-image.aif", + "https://example.com/non-image.aiff", + "https://example.com/non-image.flac", + "https://example.com/non-image.m4a", + "https://example.com/non-image.mid", + "https://example.com/non-image.midi", + "https://example.com/non-image.mka", + "https://example.com/non-image.mp3", + "https://example.com/non-image.ogg", + "https://example.com/non-image.opus", + "https://example.com/non-image.wav", + ], +) +def test_MediaStore_validate_filetype_raises_error_on_invalid_filetype(url): + image_store = image.MockImageStore("test_provider") + msg = "Extracted media type `audio` does not match expected media type `image`." + + with pytest.raises(InvalidFiletypeError, match=msg): + image_store._validate_filetype(filetype=None, url=url) + + @INT_MAX_PARAMETERIZATION def test_MediaStore_validates_filesize(value, expected): image_store = image.MockImageStore("test_provider") diff --git a/catalog/tests/dags/common/test_extensions.py b/catalog/tests/dags/common/test_extensions.py index e1a0213818..f55a923fe4 100644 --- a/catalog/tests/dags/common/test_extensions.py +++ b/catalog/tests/dags/common/test_extensions.py @@ -4,57 +4,75 @@ @pytest.mark.parametrize( - "url, media_type, filetype", + "url, expected_media_type, expected_filetype", [ - ("https://example.com/test.jpg", "image", "jpg"), - ("https://example.com/test.m4b", "audio", "m4b"), + # Valid images + ("https://example.com/image.apng", "image", "apng"), + ("https://example.com/image.avif", "image", "avif"), + ("https://example.com/image.bmp", "image", "bmp"), + ("https://example.com/image.djvu", "image", "djvu"), + ("https://example.com/image.gif", "image", "gif"), + ("https://example.com/image.ICO", "image", "ico"), + ("https://example.com/image.jpg", "image", "jpg"), + ("https://example.com/image.Jpeg", "image", "jpg"), + ("https://example.com/image.png", "image", "png"), + ("https://example.com/image.svg", "image", "svg"), + ("https://example.com/image.tif", "image", "tiff"), + ("https://example.com/image.tiFF", "image", "tiff"), + ("https://example.com/image.webp", "image", "webp"), + # Valid audio + ("https://example.com/audio.aif", "audio", "aif"), + ("https://example.com/audio.aiff", "audio", "aif"), + ("https://example.com/audio.flac", "audio", "flac"), + ("https://example.com/audio.m4a", "audio", "m4a"), + ("https://example.com/audio.m3u", "audio", "m3u"), + ("https://example.com/audio.mid", "audio", "mid"), + ("https://example.com/audio.midi", "audio", "mid"), + ("https://example.com/audio.mka", "audio", "mka"), + ("https://example.com/audio.mp3", "audio", "mp3"), + ("https://example.com/audio.ogg", "audio", "oga"), + ("https://example.com/audio.opus", "audio", "oga"), + ("https://example.com/audio.wav", "audio", "wav"), + # Invalid cases + ("https://example.com/test.jpg.image", None, None), + ("https://example.com/video.ogv", None, None), + ("https://example.com/doc.pdf", None, None), + ("https://example.com/test.stl", None, None), + ("https://example.com/test.xyz", None, None), + ("https://example.com/test123", None, None), ], ) -def test_extract_filetype_returns_filetype_for_media_type(url, media_type, filetype): - expected_extension = extensions.extract_filetype(url, media_type) - assert expected_extension == filetype - - -@pytest.mark.parametrize( - "url, media_type", - [ - ("https://example.com/test.jpg.image", "image"), - ("https://example.com/test123", "audio"), - ], -) -def test_extract_filetype_returns_None_if_no_extension_in_url(url, media_type): - expected_extension = extensions.extract_filetype(url, media_type) - assert expected_extension is None +def test_extract_filetype_returns_for_supported_media_type( + url, expected_media_type, expected_filetype +): + actual_filetype, actual_media_type = extensions.extract_filetype(url) + assert actual_filetype == expected_filetype + assert actual_media_type == expected_media_type @pytest.mark.parametrize( - "url, wrong_media_type, correct_media_type, filetype", + "input_mime, expected_value", [ - ("https://example.com/test.jpg", "audio", "image", "jpg"), - ("https://example.com/test.mp3", "image", "audio", "mp3"), + (None, None), + # Image file types + ("image/gif", "gif"), + ("image/jpeg", "jpg"), + ("image/svg+xml", "svg"), + ("image/x-ico", "ico"), + ("image/x.djvu", "djvu"), + ("image/x-djvu", "djvu"), + # Audio file types + ("audio/flac", "flac"), + ("audio/x-flac", "flac"), + ("audio/midi", "mid"), + ("audio/mp3", "mp3"), + ("audio/mpeg3", "mp3"), + ("audio/ogg", "oga"), + ("audio/opus", "opus"), + ("audio/wav", "wav"), + ("audio/x-wav", "wav"), + ("audio/x-matroska", "mka"), ], ) -def test_extract_filetype_returns_only_corresponding_mediatype_filetype( - url, wrong_media_type, correct_media_type, filetype -): - """ - We check that the filetype exists for other media types, but returns None - for the specific media type we are testing. - """ - expected_extension = extensions.extract_filetype(url, wrong_media_type) - assert expected_extension is None - assert extensions.extract_filetype(url, correct_media_type) == filetype - - -def test_extract_filetype_returns_None_for_invalid_media_type(): - """ - This test specifically adds valid extensions for the media types we plan to add. - It is expected that this test will fail if we add more media types. - """ - assert extensions.extract_filetype("https://example.com/test.mp4", "video") is None - assert ( - extensions.extract_filetype("https://example.com/test.stl", "model_3D") is None - ) - assert ( - extensions.extract_filetype("https://example.com/test.sdd", "nomedia") is None - ) +def test_get_extension_from_mimetype(input_mime, expected_value): + assert extensions.get_extension_from_mimetype(input_mime) == expected_value diff --git a/catalog/tests/dags/providers/provider_api_scripts/resources/provider_data_ingester/mock_provider_data_ingester.py b/catalog/tests/dags/providers/provider_api_scripts/resources/provider_data_ingester/mock_provider_data_ingester.py index c9fd76bfe5..501f86f7ba 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/resources/provider_data_ingester/mock_provider_data_ingester.py +++ b/catalog/tests/dags/providers/provider_api_scripts/resources/provider_data_ingester/mock_provider_data_ingester.py @@ -133,16 +133,16 @@ def get_timestamp_query_params(self, start, end, **kwargs): MOCK_RECORD_DATA_LIST = [ { "foreign_identifier": 101, - "foreign_landing_url": "https://clevelandart.org/art/1335.1917", + "foreign_landing_url": "https://commons.wikimedia.org/w/index.php?curid=120786580", "media_type": "audio", "license_info": LICENSE_INFO, - "url": "https://openaccess-cdn.clevelandart.org/1335.1917/1335.1917_web.jpg", # noqa: E501 + "url": "https://upload.wikimedia.org/wikipedia/commons/4/4e/Nl-Javaanse_herten.ogg", # noqa: E501 }, { "foreign_identifier": 100, - "foreign_landing_url": "https://clevelandart.org/art/1916.586.a", + "foreign_landing_url": "https://commons.wikimedia.org/w/index.php?curid=81754323", "media_type": "image", "license_info": LICENSE_INFO, - "url": "https://openaccess-cdn.clevelandart.org/1916.586.a/1916.586.a_web.jpg", # noqa: E501 + "url": "https://upload.wikimedia.org/wikipedia/commons/2/25/20120925_PlozevetBretagne_LoneTree_DSC07971_PtrQs.jpg", # noqa: E501 }, ] diff --git a/catalog/tests/dags/providers/provider_api_scripts/resources/wikimedia/image_data_example.json b/catalog/tests/dags/providers/provider_api_scripts/resources/wikimedia/image_data_example.json index f3afb53a0a..dc3ec8145e 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/resources/wikimedia/image_data_example.json +++ b/catalog/tests/dags/providers/provider_api_scripts/resources/wikimedia/image_data_example.json @@ -89,6 +89,7 @@ }, "height": 3102, "mediatype": "BITMAP", + "mime": "image/jpeg", "size": 11863148, "url": "https://upload.wikimedia.org/wikipedia/commons/2/25/20120925_PlozevetBretagne_LoneTree_DSC07971_PtrQs.jpg", "user": "PtrQs", diff --git a/catalog/tests/dags/providers/provider_api_scripts/resources/wikimedia/image_info_from_example_data.json b/catalog/tests/dags/providers/provider_api_scripts/resources/wikimedia/image_info_from_example_data.json index acb35ee7a0..604f701bec 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/resources/wikimedia/image_info_from_example_data.json +++ b/catalog/tests/dags/providers/provider_api_scripts/resources/wikimedia/image_info_from_example_data.json @@ -87,6 +87,7 @@ }, "height": 3102, "mediatype": "BITMAP", + "mime": "image/jpeg", "size": 11863148, "url": "https://upload.wikimedia.org/wikipedia/commons/2/25/20120925_PlozevetBretagne_LoneTree_DSC07971_PtrQs.jpg", "user": "PtrQs", diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_wikimedia_commons.py b/catalog/tests/dags/providers/provider_api_scripts/test_wikimedia_commons.py index 6ac554fa68..32bf43761a 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_wikimedia_commons.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_wikimedia_commons.py @@ -399,27 +399,6 @@ def test_extract_creator_info_handles_link_as_partial_text(wmc): assert expect_creator_url == actual_creator_url -@pytest.mark.parametrize( - "url, media_type, expected", - [ - # Valid images - ("https://example.com/image.jpg", "image", "jpg"), - ("https://example.com/image.JpeG", "image", "jpeg"), - ("https://example.com/image.Png", "image", "png"), - ("https://example.com/image.GIF", "image", "gif"), - # Invalid (for our sake) images - ("https://example.com/image.ogv", "image", None), - ("https://example.com/image.xyz", "image", None), - # Valid audio - ("https://example.com/audio.mp3", "audio", "mp3"), - ("https://example.com/audio.ogg", "audio", "ogg"), - ("https://example.com/audio.WAV", "audio", "wav"), - ], -) -def test_extract_file_type(wmc, url, media_type, expected): - assert wmc.extract_file_type(url, media_type) == expected - - def test_extract_license_info_finds_license_url(wmc): image_info = _get_resource_json("image_info_from_example_data.json") expect_license_url = "https://creativecommons.org/licenses/by-sa/4.0/"