Skip to content

Commit

Permalink
Add soundcloud metadata index (#2929)
Browse files Browse the repository at this point in the history
* Soundcloud Metadata Cache Improvements

Use endpoints that provide a richer set of attributes for tracks.

* Add release_year, release_month and release_day columns

* Add soundcloud metadata index

Create soundcloud metadata index considering that it does not have albums.
  • Loading branch information
amCap1712 authored Jul 8, 2024
1 parent 62e14e4 commit eefd1ad
Show file tree
Hide file tree
Showing 8 changed files with 248 additions and 79 deletions.
Original file line number Diff line number Diff line change
@@ -1,14 +1,9 @@
import uuid

import psycopg2
from psycopg2.extras import execute_values
from datasethoster import Query
from flask import current_app
from pydantic import BaseModel

from listenbrainz.labs_api.labs.api.spotify import SpotifyIdFromMBIDOutput
from listenbrainz.labs_api.labs.api.utils import lookup_using_metadata
from listenbrainz.db.recording import resolve_redirect_mbids, resolve_canonical_mbids
from listenbrainz.labs_api.labs.api.utils import lookup_using_metadata, lookup_recording_canonical_metadata


class MetadataIdFromMBIDInput(BaseModel):
Expand All @@ -25,49 +20,7 @@ def __init__(self, name):
def inputs(self):
return MetadataIdFromMBIDInput

def fetch_metadata_from_mbids(self, curs, mbids):
""" Retrieve metadata from canonical tables for given mbids. Note that all mbids should be canonical mbids
otherwise metadata may not be found. """
query = """
WITH mbids(gid) AS (VALUES %s)
SELECT recording_mbid::TEXT
, COALESCE(recording_name, '')
, COALESCE(artist_credit_name, '')
, COALESCE(release_name, '')
FROM mapping.canonical_musicbrainz_data
RIGHT JOIN mbids
ON recording_mbid = gid::UUID
"""
execute_values(curs, query, [(mbid,) for mbid in mbids], page_size=len(mbids))

metadata = {}
for row in curs.fetchall():
metadata[row[0]] = {
"track_name": row[1],
"artist_name": row[2],
"release_name": row[3]
}
return metadata

def fetch(self, params, source, offset=-1, count=-1):
mbids = [str(p.recording_mbid) for p in params]

with psycopg2.connect(current_app.config["MB_DATABASE_URI"]) as conn, conn.cursor() as curs:
redirected_mbids, redirect_index, _ = resolve_redirect_mbids(curs, "recording", mbids)

with psycopg2.connect(current_app.config["SQLALCHEMY_TIMESCALE_URI"]) as conn, conn.cursor() as curs:
canonical_mbids, canonical_index, _ = resolve_canonical_mbids(curs, redirected_mbids)
metadata = self.fetch_metadata_from_mbids(curs, canonical_mbids)

ordered_metadata = []
for mbid in mbids:
# check whether mbid was redirected before looking up metadata
redirected_mbid = redirect_index.get(mbid, mbid)
canonical_mbid = canonical_index.get(redirected_mbid, redirected_mbid)

mbid_metadata = metadata.get(canonical_mbid, {})
# regardless of whether we redirected the mbid, add the original mbid in the response returned to user
mbid_metadata["recording_mbid"] = mbid
ordered_metadata.append(mbid_metadata)

return lookup_using_metadata(ordered_metadata, self.name)
metadata = lookup_recording_canonical_metadata(mbids)
return lookup_using_metadata(metadata, self.name, self.outputs(), f"{self.name}_track_ids")
7 changes: 7 additions & 0 deletions listenbrainz/labs_api/labs/api/soundcloud/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from typing import Optional

from listenbrainz.labs_api.labs.api.metadata_index import BaseMetadataIndexOutput


class SoundCloudIdFromMBIDOutput(BaseMetadataIndexOutput):
    """ Result row of the SoundCloud track id lookup queries.

    Extends BaseMetadataIndexOutput with the SoundCloud specific track ids field.
    """

    # SoundCloud track ids matched for the looked up item; declared optional, so the value may be None
    soundcloud_track_ids: Optional[list[str]]
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from listenbrainz.labs_api.labs.api.metadata_index.metadata_index_from_mbid_lookup import MetadataIndexFromMBIDQuery
from listenbrainz.labs_api.labs.api.soundcloud import SoundCloudIdFromMBIDOutput


class SoundCloudIdFromMBIDQuery(MetadataIndexFromMBIDQuery):
    """ Labs API query that looks up SoundCloud track ids for recording mbids. """

    def __init__(self):
        # "soundcloud" is the service name handed to the shared metadata index query base class
        super().__init__("soundcloud")

    def names(self):
        """ Return the short identifier of this query followed by its descriptive title. """
        identifier = "soundcloud-id-from-mbid"
        title = "SoundCloud Track ID Lookup using recording mbid"
        return identifier, title

    def introduction(self):
        """ Return the text describing what this query does. """
        return (
            "Given a recording mbid, lookup its metadata using canonical metadata\n"
            "tables and using that attempt to find a suitable match in SoundCloud."
        )

    def outputs(self):
        """ Return the model class used for the rows produced by this query. """
        return SoundCloudIdFromMBIDOutput
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from listenbrainz.labs_api.labs.api.metadata_index.metadata_index_from_metadata_lookup import \
MetadataIndexFromMetadataQuery
from listenbrainz.labs_api.labs.api.soundcloud import SoundCloudIdFromMBIDOutput


class SoundCloudIdFromMetadataQuery(MetadataIndexFromMetadataQuery):
    """ Labs API query that looks up SoundCloud track ids from artist name, release name and track name. """

    def __init__(self):
        # "soundcloud" is the service name handed to the shared metadata index query base class
        super().__init__("soundcloud")

    def names(self):
        """ Return the short identifier of this query followed by its descriptive title. """
        identifier = "soundcloud-id-from-metadata"
        title = "SoundCloud Track ID Lookup using metadata"
        return identifier, title

    def introduction(self):
        """ Return the text describing what this query does. """
        return (
            "Given the name of an artist, the name of a release and the name of a recording (track)\n"
            "this query will attempt to find a suitable match in SoundCloud."
        )

    def outputs(self):
        """ Return the model class used for the rows produced by this query. """
        return SoundCloudIdFromMBIDOutput
120 changes: 91 additions & 29 deletions listenbrainz/labs_api/labs/api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
import psycopg2

from flask import current_app
from pydantic import BaseModel
from unidecode import unidecode
from psycopg2.extras import execute_values
from psycopg2.sql import SQL, Identifier

from listenbrainz.labs_api.labs.api.spotify import SpotifyIdFromMBIDOutput
from listenbrainz.labs_api.labs.api.apple import AppleMusicIdFromMBIDOutput
from listenbrainz.db.recording import resolve_redirect_mbids, resolve_canonical_mbids


class LookupType(Enum):
Expand All @@ -29,30 +29,30 @@ def query_combined_lookup(column: LookupType, lookups: list[tuple], service):
""" Lookup track ids for the given lookups in the metadata index using the specified lookup type"""
if service == 'spotify':
table = 'mapping.spotify_metadata_index'
track_ids = 'spotify_track_ids'
elif service == 'apple_music':
table = 'mapping.apple_metadata_index'
track_ids = 'apple_music_track_ids'
elif service == 'soundcloud':
table = 'mapping.soundcloud_metadata_index'
else:
raise ValueError("Service must be either 'spotify' or 'apple_music'")
raise ValueError("Service must be either 'spotify', 'apple_music' or 'soundcloud'")

query = SQL("""
WITH lookups (idx, value) AS (VALUES %s)
SELECT DISTINCT ON ({column})
idx, array_agg(track_id ORDER BY score DESC) AS {track_ids}
idx, array_agg(track_id ORDER BY score DESC) AS track_ids
FROM lookups
JOIN {table}
ON {column} = value
GROUP BY {column}, idx
""").format(column=Identifier(column.value), table=SQL(table), track_ids=Identifier(track_ids))
""").format(column=Identifier(column.value), table=SQL(table))

with psycopg2.connect(current_app.config["SQLALCHEMY_TIMESCALE_URI"]) as conn, conn.cursor() as curs:
execute_values(curs, query, lookups, page_size=len(lookups))
result = curs.fetchall()
return {row[0]: row[1] for row in result}


def perform_lookup(column, metadata, generate_lookup, service):
def perform_lookup(column, metadata, generate_lookup, service, track_id_field):
""" Given the lookup type and a function to generate to the lookup text, query database for external service track ids """
if not metadata:
return metadata, {}
Expand All @@ -69,10 +69,7 @@ def perform_lookup(column, metadata, generate_lookup, service):
for idx, item in metadata.items():
track_ids = index.get(idx)
if track_ids:
if service == 'spotify':
metadata[idx]["spotify_track_ids"] = track_ids
else:
metadata[idx]["apple_music_track_ids"] = track_ids
metadata[idx][track_id_field] = track_ids
else:
remaining_items[idx] = item

Expand All @@ -99,29 +96,94 @@ def combined_without_album_detuned(item) -> str:
return detune(item["artist_name"]) + item["track_name"]


def lookup_using_metadata(params: list[dict], service):
""" Given a list of dicts each having artist name, release name and track name, attempt to find spotify track
id for each. """
def lookup_using_metadata(params: list[dict], service, model: type[BaseModel], track_id_field: str):
""" Given a list of dicts each having artist name, release name and track name, attempt to find external service
track id for each. """
all_metadata, metadata = {}, {}
for idx, item in enumerate(params):
all_metadata[idx] = item
if "artist_name" in item and "track_name" in item:
metadata[idx] = item

# first attempt matching on artist, track and release followed by trying various detunings for unmatched recordings
_, remaining_items = perform_lookup(LookupType.ALL, metadata, combined_all, service)
_, remaining_items = perform_lookup(LookupType.ALL, remaining_items, combined_all_detuned, service)
_, remaining_items = perform_lookup(LookupType.WITHOUT_ALBUM, remaining_items, combined_without_album, service)
_, remaining_items = perform_lookup(LookupType.WITHOUT_ALBUM, remaining_items, combined_without_album_detuned, service)

# to the still unmatched recordings, add null value so that each item has in the response has spotify_track_id key
remaining_items = metadata
# soundcloud doesn't support albums
if service != "soundcloud":
_, remaining_items = perform_lookup(
LookupType.ALL,
remaining_items,
combined_all,
service,
track_id_field
)
_, remaining_items = perform_lookup(
LookupType.ALL,
remaining_items,
combined_all_detuned,
service,
track_id_field
)

_, remaining_items = perform_lookup(
LookupType.WITHOUT_ALBUM,
remaining_items,
combined_without_album,
service,
track_id_field
)
_, remaining_items = perform_lookup(
LookupType.WITHOUT_ALBUM,
remaining_items,
combined_without_album_detuned,
service,
track_id_field
)

# to the still unmatched recordings, add an empty list so that each item in the response has the appropriate
# external service track ids key
for item in all_metadata.values():
if service == "spotify" and "spotify_track_ids" not in item:
item["spotify_track_ids"] = []
elif service == "apple_music" and "apple_music_track_ids" not in item:
item["apple_music_track_ids"] = []
if track_id_field not in item:
item[track_id_field] = []
return [model(**row) for row in metadata.values()]

if service == "spotify":
return [SpotifyIdFromMBIDOutput(**row) for row in metadata.values()]
else:
return [AppleMusicIdFromMBIDOutput(**row) for row in metadata.values()]

def lookup_recording_canonical_metadata(mbids: list[str]) -> list[dict]:
    """ Retrieve metadata from canonical tables for given mbids. All mbids are first looked up in MB redirects
    and then resolved to canonical mbids. Finally, the metadata for canonical mbids is retrieved.

    Returns a list with one dict per input mbid, in the same order as the input. Every dict has the
    "recording_mbid" key holding the mbid exactly as supplied by the caller. When metadata was found for
    the resolved canonical mbid, the dict also has the "track_name", "artist_name" and "release_name" keys.
    """
    if not mbids:
        # nothing to resolve; also avoids calling execute_values with an empty list of values
        return []

    with psycopg2.connect(current_app.config["MB_DATABASE_URI"]) as conn, conn.cursor() as curs:
        redirected_mbids, redirect_index, _ = resolve_redirect_mbids(curs, "recording", mbids)

    with psycopg2.connect(current_app.config["SQLALCHEMY_TIMESCALE_URI"]) as conn, conn.cursor() as curs:
        canonical_mbids, canonical_index, _ = resolve_canonical_mbids(curs, redirected_mbids)
        query = """
WITH mbids(gid) AS (VALUES %s)
SELECT recording_mbid::TEXT
, COALESCE(recording_name, '')
, COALESCE(artist_credit_name, '')
, COALESCE(release_name, '')
FROM mapping.canonical_musicbrainz_data
RIGHT JOIN mbids
ON recording_mbid = gid::UUID
"""
        # the result is indexed by canonical mbid further down, so the query has to be run with the
        # canonical mbids and not with the mbids originally supplied by the caller
        execute_values(curs, query, [(mbid,) for mbid in canonical_mbids], page_size=len(canonical_mbids))

        metadata = {}
        for row in curs.fetchall():
            if row[0] is None:
                # the RIGHT JOIN emits a row with a NULL recording_mbid for every mbid without metadata
                continue
            metadata[row[0]] = {
                "track_name": row[1],
                "artist_name": row[2],
                "release_name": row[3]
            }

    ordered_metadata = []
    for mbid in mbids:
        # check whether mbid was redirected before looking up metadata
        redirected_mbid = redirect_index.get(mbid, mbid)
        canonical_mbid = canonical_index.get(redirected_mbid, redirected_mbid)

        # copy the entry: several input mbids can resolve to the same canonical mbid and each of them
        # needs its own dict, otherwise setting recording_mbid below would overwrite the earlier ones
        mbid_metadata = dict(metadata.get(canonical_mbid, {}))
        # regardless of whether we redirected the mbid, add the original mbid in the response returned to user
        mbid_metadata["recording_mbid"] = mbid
        ordered_metadata.append(mbid_metadata)

    return ordered_metadata
4 changes: 4 additions & 0 deletions listenbrainz/labs_api/labs/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
from listenbrainz.labs_api.labs.api.artist_credit_recording_lookup import ArtistCreditRecordingLookupQuery
from listenbrainz.labs_api.labs.api.similar_artists import SimilarArtistsViewerQuery
from listenbrainz.labs_api.labs.api.similar_recordings import SimilarRecordingsViewerQuery
from listenbrainz.labs_api.labs.api.soundcloud.soundcloud_from_mbid_lookup import SoundCloudIdFromMBIDQuery
from listenbrainz.labs_api.labs.api.soundcloud.soundcloud_from_metadata_lookup import SoundCloudIdFromMetadataQuery
from listenbrainz.labs_api.labs.api.spotify.spotify_mbid_lookup import SpotifyIdFromMBIDQuery
from listenbrainz.labs_api.labs.api.spotify.spotify_metadata_lookup import SpotifyIdFromMetadataQuery
from listenbrainz.labs_api.labs.api.user_listen_sessions import UserListensSessionQuery
Expand All @@ -38,6 +40,8 @@
register_query(SpotifyIdFromMBIDQuery())
register_query(AppleMusicIdFromMBIDQuery())
register_query(AppleMusicIdFromMetadataQuery())
register_query(SoundCloudIdFromMBIDQuery())
register_query(SoundCloudIdFromMetadataQuery())
register_query(UserListensSessionQuery())
register_query(SimilarRecordingsViewerQuery())
register_query(SimilarArtistsViewerQuery())
Expand Down
9 changes: 9 additions & 0 deletions mbid_mapping/manage.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from mapping.canonical_musicbrainz_data import create_canonical_musicbrainz_data
from mapping.mb_artist_metadata_cache import create_mb_artist_metadata_cache, \
incremental_update_mb_artist_metadata_cache
from mapping.soundcloud_metadata_index import create_soundcloud_metadata_index
from mapping.typesense_index import build_all as action_build_index
from mapping.mapping_test.mapping_test import test_mapping as action_test_mapping
from mapping.utils import log, CRON_LOG_FILE
Expand Down Expand Up @@ -212,6 +213,14 @@ def build_apple_metadata_index(use_lb_conn):
create_apple_metadata_index(use_lb_conn)


# NOTE: this command needs its own function name. Reusing build_apple_metadata_index would replace the
# Apple Music command defined earlier in this module instead of adding a separate SoundCloud command.
@cli.command()
@click.option("--use-lb-conn/--use-mb-conn", default=True, help="whether to create the tables in LB or MB")
def build_soundcloud_metadata_index(use_lb_conn):
    """
    Build the SoundCloud metadata index that LB uses
    """
    create_soundcloud_metadata_index(use_lb_conn)

@cli.command()
def build_tag_similarity():
"""
Expand Down
Loading

0 comments on commit eefd1ad

Please sign in to comment.