Skip to content

Commit

Permalink
Raise exception for misclassified items in MediaStore and get filetyp…
Browse files Browse the repository at this point in the history
…e from Wikimedia's API (#4785)
  • Loading branch information
krysal authored Sep 12, 2024
1 parent b6fa7b9 commit 6c7e0a6
Show file tree
Hide file tree
Showing 10 changed files with 274 additions and 128 deletions.
84 changes: 75 additions & 9 deletions catalog/dags/common/extensions.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,78 @@
EXTENSIONS = {
"image": {"jpg", "jpeg", "png", "gif", "bmp", "webp", "tiff", "tif", "svg"},
"audio": {"mp3", "ogg", "wav", "aiff", "flac", "wma", "mp4", "aac", "m4a", "m4b"},
import mimetypes

from common.constants import MEDIA_TYPES as SUPPORTED_TYPES


FILETYPE_EQUIVALENTS = {
# Image extensions
"jpeg": "jpg",
"tif": "tiff",
# Audio extensions
"midi": "mid",
}

# Partially taken from Wikimedia aliases
# https://doc.wikimedia.org/mediawiki-core/master/php/MimeMap_8php_source.html
MIME_TYPE_ALIASES = {
# Image aliases
"image/x-bmp": "image/bmp",
"image/x-ms-bmp": "image/bmp",
"image/x-png": "image/png",
"image/pjpeg": "image/jpeg",
"image/x-ico": "image/vnd.microsoft.icon",
"image/x-icon": "image/vnd.microsoft.icon",
"image/svg": "image/svg+xml",
"image/x.djvu": "image/vnd.djvu",
"image/x-djvu": "image/vnd.djvu",
"image/jpeg2000": "image/jp2",
"image/jpeg200-image": "image/jp2",
"image/x-jpeg200-image": "image/jp2",
# Audio aliases
"audio/mp3": "audio/mpeg",
"audio/mpeg3": "audio/mpeg",
"audio/x-flac": "audio/flac",
"audio/mid": "audio/midi",
"audio/wav": "audio/x-wav",
"audio/wave": "audio/x-wav",
}


mimetypes.add_type("audio/midi", ".mid")
mimetypes.add_type("audio/midi", ".midi")
mimetypes.add_type("audio/x-matroska", ".mka")
mimetypes.add_type("audio/wav", ".wav")


class InvalidFiletypeError(Exception):
def __init__(self, actual_filetype: str, expected_filetype: str, message: str = ""):
self.actual_filetype = actual_filetype
self.expected_filetype = expected_filetype
if not message:
message = (
f"Extracted media type `{self.actual_filetype}` does not match "
f"expected media type `{self.expected_filetype}`."
)
super().__init__(message)


def get_extension_from_mimetype(mime_type: str | None) -> str | None:
if not mime_type:
return
mime_type = MIME_TYPE_ALIASES.get(mime_type, mime_type)
ext = mimetypes.guess_extension(mime_type)
# Removes the leading dot if there is an extension
return ext[1:] if ext else None


def extract_filetype(url: str) -> tuple[str | None, str | None]:
"""
Extract the filetype from a media url extension.
def extract_filetype(url: str, media_type: str) -> str | None:
"""Extract the filetype from a media url extension."""
possible_filetype = url.split(".")[-1]
if possible_filetype in EXTENSIONS.get(media_type, {}):
return possible_filetype
return None
Returns only if the media type guessed is a supported type.
"""
if mime_type := mimetypes.guess_type(url)[0]:
media_type, _ = mime_type.split("/")
if media_type in SUPPORTED_TYPES:
filetype = get_extension_from_mimetype(mime_type)
return filetype, media_type
return None, None
32 changes: 21 additions & 11 deletions catalog/dags/common/storage/media.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@
from datetime import datetime

from common import urls
from common.extensions import extract_filetype
from common.extensions import (
FILETYPE_EQUIVALENTS,
InvalidFiletypeError,
extract_filetype,
)
from common.loader import provider_details as prov
from common.storage.tsv_columns import CURRENT_VERSION

Expand Down Expand Up @@ -39,7 +43,6 @@
COMMON_CRAWL = "commoncrawl"
PROVIDER_API = "provider_api"

FILETYPE_EQUIVALENTS = {"jpeg": "jpg", "tif": "tiff"}
PG_INTEGER_MAXIMUM = 2147483647


Expand Down Expand Up @@ -113,8 +116,10 @@ def clean_media_metadata(self, **media_data) -> dict | None:
- add `provider`,
- add default `category`, if available.
Raises an error if missing any of the required fields:
`license_info`, `foreign_identifier`, `foreign_landing_url`, or `url`.
Raises an error if missing any of the required fields: `license_info`,
`foreign_identifier`, `foreign_landing_url`, or `url`. Or if an extracted
media type (guessed from extension in the URL) does not match the media type
of the class when the API does not return the filetype.
"""
for field in [
"license_info",
Expand Down Expand Up @@ -304,17 +309,22 @@ def _format_raw_tag(self, tag: str) -> dict:

def _validate_filetype(self, filetype: str | None, url: str) -> str | None:
"""
Extract filetype from the media URL if filetype is None.
Extract filetype from the media URL if filetype is None, check that it
corresponds to the media type of the class, and normalize filetypes
that have variants such as jpg/jpeg and tiff/tif.
Unifies filetypes that have variants such as jpg/jpeg and tiff/tif.
:param filetype: Optional filetype string.
:param url: The direct URL to the media.
:return: filetype string or None
"""
if filetype is None:
filetype = extract_filetype(url, self.media_type)
if self.media_type != "image":
return filetype
return FILETYPE_EQUIVALENTS.get(filetype, filetype)
if filetype is not None:
filetype = filetype.lower()
return FILETYPE_EQUIVALENTS.get(filetype, filetype)

filetype, extracted_media_type = extract_filetype(url)
if extracted_media_type is not None and extracted_media_type != self.media_type:
raise InvalidFiletypeError(extracted_media_type, self.media_type)
return filetype

@staticmethod
def _validate_integer(value: int | None) -> int | None:
Expand Down
28 changes: 3 additions & 25 deletions catalog/dags/providers/provider_api_scripts/wikimedia_commons.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@
import lxml.html as html

from common.constants import AUDIO, IMAGE
from common.extensions import EXTENSIONS
from common.extensions import get_extension_from_mimetype
from common.licenses import LicenseInfo, get_license_info
from common.loader import provider_details as prov
from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester
Expand Down Expand Up @@ -162,7 +162,7 @@ class ReturnProps:
query_no_popularity = "imageinfo"

# All media info we care about
media_all = "url|user|dimensions|extmetadata|mediatype|size|metadata"
media_all = "url|user|dimensions|extmetadata|mediatype|mime|size|metadata"
# Everything minus the metadata, which is only necessary for audio and can
# balloon for PDFs which are considered images by Wikimedia
media_no_metadata = "url|user|dimensions|extmetadata|mediatype|size"
Expand Down Expand Up @@ -320,7 +320,7 @@ def get_record_data(self, record):
creator, creator_url = self.extract_creator_info(media_info)
title = self.extract_title(media_info)
filesize = media_info.get("size", 0) # in bytes
filetype = self.extract_file_type(url, valid_media_type)
filetype = get_extension_from_mimetype(media_info.get("mime"))
meta_data = self.create_meta_data_dict(record)

record_data = {
Expand Down Expand Up @@ -533,28 +533,6 @@ def extract_category_info(media_info):
categories_list = categories_string.split("|")
return categories_list

@staticmethod
def extract_file_type(url, media_type):
"""
Extract the filetype from extension in the media url.
In case of images, we check if the filetype is in the list of valid image
types, so we can ignore other media types considered as videos (eg: .ogv).
"""
image_extensions = EXTENSIONS.get(IMAGE, {})
if filetype := url.split(".")[-1]:
filetype = filetype.lower()
if (
media_type == IMAGE and filetype in image_extensions
) or media_type == AUDIO:
return filetype

logger.warning(
f"Invalid filetype for `{media_type}` media type: {filetype}"
)

return None

@staticmethod
def extract_license_info(media_info) -> LicenseInfo | None:
license_url = (
Expand Down
70 changes: 58 additions & 12 deletions catalog/tests/dags/common/storage/test_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pytest

from common.extensions import InvalidFiletypeError
from common.licenses import LicenseInfo
from common.storage import audio
from tests.dags.common.storage import test_media
Expand Down Expand Up @@ -61,7 +62,7 @@ def test_AudioStore_add_item_adds_realistic_audio_to_buffer():
audio_store.add_item(
foreign_identifier="01",
foreign_landing_url="https://audios.org/audio01",
url="https://audios.org/audio01.jpg",
url="https://audios.org/audio01.mp3",
license_info=license_info,
ingestion_type="provider_api",
)
Expand All @@ -73,25 +74,25 @@ def test_AudioStore_add_item_adds_multiple_audios_to_buffer():
audio_store.add_item(
foreign_identifier="01",
foreign_landing_url="https://audios.org/audio01",
url="https://audios.org/audio01.jpg",
url="https://audios.org/audio01.mp3",
license_info=PD_LICENSE_INFO,
)
audio_store.add_item(
foreign_identifier="02",
foreign_landing_url="https://audios.org/audio02",
url="https://audios.org/audio02.jpg",
url="https://audios.org/audio02.mp3",
license_info=PD_LICENSE_INFO,
)
audio_store.add_item(
foreign_identifier="03",
foreign_landing_url="https://audios.org/audio03",
url="https://audios.org/audio03.jpg",
url="https://audios.org/audio03.mp3",
license_info=PD_LICENSE_INFO,
)
audio_store.add_item(
foreign_identifier="04",
foreign_landing_url="https://audios.org/audio04",
url="https://audios.org/audio04.jpg",
url="https://audios.org/audio04.mp3",
license_info=PD_LICENSE_INFO,
)
assert len(audio_store._media_buffer) == 4
Expand All @@ -105,25 +106,25 @@ def test_AudioStore_add_item_flushes_buffer(tmpdir):
audio_store.add_item(
foreign_identifier="01",
foreign_landing_url="https://audios.org/audio01",
url="https://audios.org/audio01.jpg",
url="https://audios.org/audio01.mp3",
license_info=PD_LICENSE_INFO,
)
audio_store.add_item(
foreign_identifier="02",
foreign_landing_url="https://audios.org/audio02",
url="https://audios.org/audio02.jpg",
url="https://audios.org/audio02.mp3",
license_info=PD_LICENSE_INFO,
)
audio_store.add_item(
foreign_identifier="03",
foreign_landing_url="https://audios.org/audio03",
url="https://audios.org/audio03.jpg",
url="https://audios.org/audio03.mp3",
license_info=PD_LICENSE_INFO,
)
audio_store.add_item(
foreign_identifier="04",
foreign_landing_url="https://audios.org/audio04",
url="https://audios.org/audio04.jpg",
url="https://audios.org/audio04.mp3",
license_info=PD_LICENSE_INFO,
)
assert len(audio_store._media_buffer) == 1
Expand All @@ -142,24 +143,69 @@ def test_AudioStore_produces_correct_total_audios():
audio_store.add_item(
foreign_identifier="01",
foreign_landing_url="https://audios.org/audio01",
url="https://audios.org/audio01.jpg",
url="https://audios.org/audio01.mp3",
license_info=PD_LICENSE_INFO,
)
audio_store.add_item(
foreign_identifier="02",
foreign_landing_url="https://audios.org/audio02",
url="https://audios.org/audio02.jpg",
url="https://audios.org/audio02.mp3",
license_info=PD_LICENSE_INFO,
)
audio_store.add_item(
foreign_identifier="03",
foreign_landing_url="https://audios.org/audio03",
url="https://audios.org/audio03.jpg",
url="https://audios.org/audio03.mp3",
license_info=PD_LICENSE_INFO,
)
assert audio_store.total_items == 3


@pytest.mark.parametrize(
"filetype, url, expected_filetype",
[
# The value provided prevails over the url extension
("ogg", "http://example.com/audio", "ogg"),
("ogg", "http://example.com/audio.wav", "ogg"),
# The filetype is guessed from the URL extension
(None, "http://example.com/audio.mp3", "mp3"),
(None, "http://example.com/audio.WAV", "wav"),
(None, "http://example.com/audio.mid", "mid"),
# Unifies filetypes
("midi", "http://example.com/audio.mid", "mid"),
(None, "http://example.com/audio.midi", "mid"),
],
)
def test_AudioStore_validate_filetype(filetype, url, expected_filetype):
audio_store = audio.MockAudioStore("test_provider")
actual_filetype = audio_store._validate_filetype(filetype, url)
assert actual_filetype == expected_filetype


@pytest.mark.parametrize(
"url",
[
"https://example.com/non-audio.apng",
"https://example.com/non-audio.avif",
"https://example.com/non-audio.bmp",
"https://example.com/non-audio.djvu",
"https://example.com/non-audio.gif",
"https://example.com/non-audio.ICO",
"https://example.com/non-audio.jpg",
"https://example.com/non-audio.Jpeg",
"https://example.com/non-audio.png",
"https://example.com/non-audio.svg",
"https://example.com/non-audio.tif",
"https://example.com/non-audio.tiFF",
"https://example.com/non-audio.webp",
],
)
def test_AudioStore_validate_filetype_raises_error_on_invalid_filetype(url):
image_store = audio.MockAudioStore("test_provider")
with pytest.raises(InvalidFiletypeError):
image_store._validate_filetype(filetype=None, url=url)


@test_media.INT_MAX_PARAMETERIZATION
def test_AudioStore_get_audio_validates_duration(value, expected):
audio_store = audio.AudioStore("test_provider")
Expand Down
Loading

0 comments on commit 6c7e0a6

Please sign in to comment.