From ae37bdd22ffb3716d98c912e3d31a0986fd14272 Mon Sep 17 00:00:00 2001 From: Ishaan Shah Date: Tue, 23 Jun 2020 22:10:46 +0530 Subject: [PATCH] LB-629: Prefer MBID over MSID if MBID is present during stats calculation. (#926) * Ignore msid if mbid is present while calculating stats * Update tests --- data/model/user_entity.py | 2 +- data/model/user_release_stat.py | 2 +- listenbrainz_spark/stats/user/artist.py | 5 +- listenbrainz_spark/stats/user/recording.py | 15 +++- listenbrainz_spark/stats/user/release.py | 10 ++- .../stats/user/tests/test_artist.py | 9 ++- .../stats/user/tests/test_recording.py | 13 +++- .../stats/user/tests/test_release.py | 12 +++- .../testdata/user_top_artists.json | 14 ++-- .../testdata/user_top_recordings.json | 24 +++---- .../testdata/user_top_releases.json | 20 +++--- .../testdata/user_top_releases_empty.json | 70 ++++++------------- 12 files changed, 101 insertions(+), 95 deletions(-) diff --git a/data/model/user_entity.py b/data/model/user_entity.py index 14e6fd6eeb..5fc1db310c 100644 --- a/data/model/user_entity.py +++ b/data/model/user_entity.py @@ -15,5 +15,5 @@ class UserEntityStatMessage(pydantic.BaseModel): stats_range: str # The range for which the stats are calculated, i.e week, month, year or all_time from_ts: int to_ts: int - data: List[Union[UserArtistRecord, UserReleaseRecord, UserRecordingRecord]] + data: List[Union[UserRecordingRecord, UserReleaseRecord, UserArtistRecord]] count: int diff --git a/data/model/user_release_stat.py b/data/model/user_release_stat.py index 477fa59e61..8205120dfe 100644 --- a/data/model/user_release_stat.py +++ b/data/model/user_release_stat.py @@ -11,7 +11,7 @@ class UserReleaseRecord(pydantic.BaseModel): artist_mbids: List[str] = [] release_mbid: Optional[str] release_msid: Optional[str] - release_name: Optional[str] # TODO: Make this required once https://tickets.metabrainz.org/browse/LB-621 is fixed + release_name: str listen_count: int artist_name: str diff --git a/listenbrainz_spark/stats/user/artist.py b/listenbrainz_spark/stats/user/artist.py index e387193665..3c36b927f5 100644 --- a/listenbrainz_spark/stats/user/artist.py +++ b/listenbrainz_spark/stats/user/artist.py @@ -25,7 +25,10 @@ def get_artists(table): result = run_query(""" SELECT user_name , artist_name - , nullif(artist_msid, '') as artist_msid + , CASE + WHEN cardinality(artist_mbids) > 0 THEN NULL + ELSE nullif(artist_msid, '') + END as artist_msid , artist_mbids , count(artist_name) as listen_count FROM {table} diff --git a/listenbrainz_spark/stats/user/recording.py b/listenbrainz_spark/stats/user/recording.py index 88ee0a7ff7..4630b5a6c5 100644 --- a/listenbrainz_spark/stats/user/recording.py +++ b/listenbrainz_spark/stats/user/recording.py @@ -31,13 +31,22 @@ def get_recordings(table): result = run_query(""" SELECT user_name , track_name - , nullif(recording_msid, '') as recording_msid + , CASE + WHEN recording_mbid IS NOT NULL AND recording_mbid != '' THEN NULL + ELSE nullif(recording_msid, '') + END as recording_msid , nullif(recording_mbid, '') as recording_mbid , artist_name - , nullif(artist_msid, '') as artist_msid + , CASE + WHEN cardinality(artist_mbids) > 0 THEN NULL + ELSE nullif(artist_msid, '') + END as artist_msid , artist_mbids , nullif(release_name, '') as release_name - , nullif(release_msid, '') as release_msid + , CASE + WHEN release_mbid IS NOT NULL AND release_mbid != '' THEN NULL + ELSE nullif(release_msid, '') + END as release_msid , nullif(release_mbid, '') as release_mbid , count(track_name) as listen_count FROM {} diff --git a/listenbrainz_spark/stats/user/release.py b/listenbrainz_spark/stats/user/release.py index 1cedee3b2d..8f7bf7247a 100644 --- a/listenbrainz_spark/stats/user/release.py +++ b/listenbrainz_spark/stats/user/release.py @@ -29,10 +29,16 @@ def get_releases(table): result = run_query(""" SELECT user_name , nullif(release_name, '') as release_name - , nullif(release_msid, '') as release_msid + , CASE + WHEN release_mbid IS NOT NULL AND release_mbid != '' THEN NULL + ELSE nullif(release_msid, '') + END as release_msid , nullif(release_mbid, '') as release_mbid , artist_name - , nullif(artist_msid, '') as artist_msid + , CASE + WHEN cardinality(artist_mbids) > 0 THEN NULL + ELSE nullif(artist_msid, '') + END as artist_msid , artist_mbids , count(release_name) as listen_count FROM {} diff --git a/listenbrainz_spark/stats/user/tests/test_artist.py b/listenbrainz_spark/stats/user/tests/test_artist.py index 5ee94974dd..7ae5640140 100644 --- a/listenbrainz_spark/stats/user/tests/test_artist.py +++ b/listenbrainz_spark/stats/user/tests/test_artist.py @@ -4,10 +4,13 @@ from datetime import datetime import listenbrainz_spark.stats.user.artist as artist_stats +from data.model.user_artist_stat import UserArtistRecord from listenbrainz_spark import utils from listenbrainz_spark.path import LISTENBRAINZ_DATA_DIRECTORY from listenbrainz_spark.tests import SparkTestCase from pyspark.sql import Row +from pyspark.sql.types import (ArrayType, StringType, StructField, + StructType) class ArtistTestCase(SparkTestCase): @@ -25,13 +28,15 @@ def save_dataframe(self): with open(self.path_to_data_file('user_top_artists.json')) as f: data = json.load(f) + schema = StructType((StructField('user_name', StringType()), StructField('artist_name', StringType()), + StructField('artist_msid', StringType()), StructField('artist_mbids', ArrayType(StringType())))) df = None for entry in data: for idx in range(0, entry['count']): # Assign listened_at to each listen row = utils.create_dataframe(Row(user_name=entry['user_name'], artist_name=entry['artist_name'], artist_msid=entry['artist_msid'], artist_mbids=entry['artist_mbids']), - schema=None) + schema=schema) df = df.union(row) if df else row utils.save_parquet(df, os.path.join(self.path_, '{}/{}.parquet'.format(now.year, now.month))) @@ -48,7 +53,7 @@ def test_get_artists(self): for entry in data: expected[entry['user_name']].append({ 'artist_name': entry['artist_name'], - 'artist_msid': entry['artist_msid'] or None, + 'artist_msid': entry['artist_msid'] or None if len(entry['artist_mbids']) == 0 else None, 'artist_mbids': entry['artist_mbids'], 'listen_count': entry['count'] }) diff --git a/listenbrainz_spark/stats/user/tests/test_recording.py b/listenbrainz_spark/stats/user/tests/test_recording.py index 2d365c469b..baed4bc0a8 100644 --- a/listenbrainz_spark/stats/user/tests/test_recording.py +++ b/listenbrainz_spark/stats/user/tests/test_recording.py @@ -8,6 +8,8 @@ from listenbrainz_spark.path import LISTENBRAINZ_DATA_DIRECTORY from listenbrainz_spark.tests import SparkTestCase from pyspark.sql import Row +from pyspark.sql.types import (ArrayType, StringType, StructField, + StructType) class RecordingTestCase(SparkTestCase): @@ -25,6 +27,11 @@ def save_dataframe(self, filename): with open(self.path_to_data_file(filename)) as f: data = json.load(f) + schema = StructType((StructField('user_name', StringType()), StructField('artist_name', StringType()), + StructField('artist_msid', StringType()), StructField('artist_mbids', ArrayType(StringType())), + StructField('release_name', StringType()), StructField('release_msid', StringType()), + StructField('release_mbid', StringType()), StructField('track_name', StringType()), + StructField('recording_mbid', StringType()), StructField('recording_msid', StringType()))) df = None for entry in data: for idx in range(0, entry['count']): @@ -51,13 +58,13 @@ def test_get_recordings(self): for entry in data: expected[entry['user_name']].append({ 'track_name': entry['track_name'], - 'recording_msid': entry['recording_msid'] or None, + 'recording_msid': entry['recording_msid'] or None if entry['recording_mbid'] == "" else None, 'recording_mbid': entry['recording_mbid'] or None, 'release_name': entry['release_name'] or None, - 'release_msid': entry['release_msid'] or None, + 'release_msid': entry['release_msid'] or None if entry['release_mbid'] == "" else None, 'release_mbid': entry['release_mbid'] or None, 'artist_name': entry['artist_name'], - 'artist_msid': entry['artist_msid'] or None, + 'artist_msid': entry['artist_msid'] or None if len(entry['artist_mbids']) == 0 else None, 'artist_mbids': entry['artist_mbids'], 'listen_count': entry['count'] }) diff --git a/listenbrainz_spark/stats/user/tests/test_release.py b/listenbrainz_spark/stats/user/tests/test_release.py index f1cb709d4a..7f7c1754bf 100644 --- a/listenbrainz_spark/stats/user/tests/test_release.py +++ b/listenbrainz_spark/stats/user/tests/test_release.py @@ -8,6 +8,8 @@ from listenbrainz_spark.path import LISTENBRAINZ_DATA_DIRECTORY from listenbrainz_spark.tests import SparkTestCase from pyspark.sql import Row +from pyspark.sql.types import (ArrayType, StringType, StructField, + StructType) class releaseTestCase(SparkTestCase): @@ -25,6 +27,10 @@ def save_dataframe(self, filename): with open(self.path_to_data_file(filename)) as f: data = json.load(f) + schema = StructType((StructField('user_name', StringType()), StructField('artist_name', StringType()), + StructField('artist_msid', StringType()), StructField('artist_mbids', ArrayType(StringType())), + StructField('release_name', StringType()), StructField('release_msid', StringType()), + StructField('release_mbid', StringType()))) df = None for entry in data: for idx in range(0, entry['count']): @@ -33,7 +39,7 @@ def save_dataframe(self, filename): release_msid=entry['release_msid'], release_mbid=entry['release_mbid'], artist_name=entry['artist_name'], artist_msid=entry['artist_msid'], artist_mbids=entry['artist_mbids']), - schema=None) + schema=schema) df = df.union(row) if df else row utils.save_parquet(df, os.path.join(self.path_, '{}/{}.parquet'.format(now.year, now.month))) @@ -51,10 +57,10 @@ def test_get_releases(self): if entry['release_name'] != '': expected[entry['user_name']].append({ 'release_name': entry['release_name'], - 'release_msid': entry['release_msid'] or None, + 'release_msid': entry['release_msid'] or None if entry['release_mbid'] == "" else None, 'release_mbid': entry['release_mbid'] or None, 'artist_name': entry['artist_name'], - 'artist_msid': entry['artist_msid'] or None, + 'artist_msid': entry['artist_msid'] or None if len(entry['artist_mbids']) == 0 else None, 'artist_mbids': entry['artist_mbids'], 'listen_count': entry['count'] }) diff --git a/listenbrainz_spark/testdata/user_top_artists.json b/listenbrainz_spark/testdata/user_top_artists.json index eb7c447d5a..d84c557093 100644 --- a/listenbrainz_spark/testdata/user_top_artists.json +++ b/listenbrainz_spark/testdata/user_top_artists.json @@ -3,49 +3,49 @@ "user_name": "user1", "artist_name": "artist1", "artist_msid": "1", - "artist_mbids": "1", + "artist_mbids": [], "count": 8 }, { "user_name": "user1", "artist_name": "artist2", "artist_msid": "", - "artist_mbids": "2", + "artist_mbids": ["2"], "count": 3 }, { "user_name": "user1", "artist_name": "artist3", "artist_msid": "3", - "artist_mbids": "3", + "artist_mbids": ["3"], "count": 4 }, { "user_name": "user2", "artist_name": "artist1", "artist_msid": "", - "artist_mbids": "1", + "artist_mbids": [""], "count": 4 }, { "user_name": "user2", "artist_name": "artist3", "artist_msid": "3", - "artist_mbids": "3", + "artist_mbids": ["3"], "count": 7 }, { "user_name": "user2", "artist_name": "artist4", "artist_msid": "", - "artist_mbids": "4", + "artist_mbids": ["4"], "count": 2 }, { "user_name": "user3", "artist_name": "artist1", "artist_msid": "1", - "artist_mbids": "1", + "artist_mbids": [], "count": 2 } ] diff --git a/listenbrainz_spark/testdata/user_top_recordings.json b/listenbrainz_spark/testdata/user_top_recordings.json index 91fcc3b80e..1fb2332892 100644 --- a/listenbrainz_spark/testdata/user_top_recordings.json +++ b/listenbrainz_spark/testdata/user_top_recordings.json @@ -6,10 +6,10 @@ "recording_mbid": "1", "release_name": "release1", "release_msid": "1", - "release_mbid": "1", + "release_mbid": "", "artist_name": "artist1", "artist_msid": "1", - "artist_mbids": "1", + "artist_mbids": ["1"], "count": 8 }, { @@ -22,20 +22,20 @@ "release_mbid": "", "artist_name": "artist1", "artist_msid": "", - "artist_mbids": "1", + "artist_mbids": ["1"], "count": 2 }, { "user_name": "user1", "track_name": "track2", "recording_msid": "2", - "recording_mbid": "2", + "recording_mbid": "", "release_name": "release2", "release_msid": "2", "release_mbid": "2", "artist_name": "artist2", "artist_msid": "2", - "artist_mbids": "2", + "artist_mbids": ["2"], "count": 3 }, { @@ -48,7 +48,7 @@ "release_mbid": "", "artist_name": "artist3", "artist_msid": "", - "artist_mbids": "3", + "artist_mbids": ["3"], "count": 4 }, { @@ -61,7 +61,7 @@ "release_mbid": "1", "artist_name": "artist1", "artist_msid": "", - "artist_mbids": "1", + "artist_mbids": ["1"], "count": 4 }, { @@ -74,7 +74,7 @@ "release_mbid": "", "artist_name": "artist3", "artist_msid": "", - "artist_mbids": "3", + "artist_mbids": ["3"], "count": 7 }, { @@ -84,10 +84,10 @@ "recording_mbid": "4", "release_name": "release4", "release_msid": "4", - "release_mbid": "4", + "release_mbid": "", "artist_name": "artist4", "artist_msid": "4", - "artist_mbids": "4", + "artist_mbids": ["4"], "count": 2 }, { @@ -100,7 +100,7 @@ "release_mbid": "", "artist_name": "artist1", "artist_msid": "1", - "artist_mbids": "1", + "artist_mbids": [""], "count": 2 }, { @@ -113,7 +113,7 @@ "release_mbid": "1", "artist_name": "artist1", "artist_msid": "", - "artist_mbids": "1", + "artist_mbids": ["1"], "count": 1 } ] diff --git a/listenbrainz_spark/testdata/user_top_releases.json b/listenbrainz_spark/testdata/user_top_releases.json index f51cbafbb0..d54a32709a 100644 --- a/listenbrainz_spark/testdata/user_top_releases.json +++ b/listenbrainz_spark/testdata/user_top_releases.json @@ -6,7 +6,7 @@ "release_mbid": "1", "artist_name": "artist1", "artist_msid": "1", - "artist_mbids": "1", + "artist_mbids": ["1"], "count": 8 }, { @@ -16,7 +16,7 @@ "release_mbid": "2", "artist_name": "artist2", "artist_msid": "", - "artist_mbids": "2", + "artist_mbids": [], "count": 3 }, { @@ -26,17 +26,17 @@ "release_mbid": "", "artist_name": "artist3", "artist_msid": "", - "artist_mbids": "3", + "artist_mbids": ["3"], "count": 4 }, { "user_name": "user2", "release_name": "release1", "release_msid": "1", - "release_mbid": "1", + "release_mbid": "", "artist_name": "artist1", "artist_msid": "", - "artist_mbids": "1", + "artist_mbids": ["1"], "count": 4 }, { @@ -46,7 +46,7 @@ "release_mbid": "3", "artist_name": "artist3", "artist_msid": "3", - "artist_mbids": "3", + "artist_mbids": [], "count": 7 }, { @@ -56,7 +56,7 @@ "release_mbid": "", "artist_name": "artist4", "artist_msid": "", - "artist_mbids": "", + "artist_mbids": [], "count": 2 }, { @@ -66,7 +66,7 @@ "release_mbid": "1", "artist_name": "artist1", "artist_msid": "1", - "artist_mbids": "1", + "artist_mbids": ["1"], "count": 2 }, { @@ -75,8 +75,8 @@ "release_msid": "", "release_mbid": "", "artist_name": "artist1", - "artist_msid": "1", - "artist_mbids": "1", + "artist_msid": "", + "artist_mbids": ["1"], "count": 2 } ] diff --git a/listenbrainz_spark/testdata/user_top_releases_empty.json b/listenbrainz_spark/testdata/user_top_releases_empty.json index f1b5d65163..c57048d21c 100644 --- a/listenbrainz_spark/testdata/user_top_releases_empty.json +++ b/listenbrainz_spark/testdata/user_top_releases_empty.json @@ -2,81 +2,51 @@ { "user_name": "user1", "release_name": "", - "release_msid": 1, - "release_mbid": 1, + "release_msid": "1", + "release_mbid": "1", "artist_name": "artist1", - "artist_msid": 1, - "artist_mbids": 1, + "artist_msid": "1", + "artist_mbids": ["1"], "count": 8 }, { "user_name": "user1", "release_name": "", - "release_msid": 2, - "release_mbid": 2, + "release_msid": "2", + "release_mbid": "2", "artist_name": "artist2", - "artist_msid": 2, - "artist_mbids": 2, + "artist_msid": "2", + "artist_mbids": ["2"], "count": 3 }, { "user_name": "user1", "release_name": "", - "release_msid": 3, - "release_mbid": 3, + "release_msid": "3", + "release_mbid": "3", "artist_name": "artist3", - "artist_msid": 3, - "artist_mbids": 3, + "artist_msid": "3", + "artist_mbids": ["3"], "count": 4 }, { "user_name": "user2", "release_name": "", - "release_msid": 1, - "release_mbid": 1, - "artist_name": "artist1", - "artist_msid": 1, - "artist_mbids": 1, - "count": 4 - }, - { - "user_name": "user2", - "release_name": "", - "release_msid": 3, - "release_mbid": 3, + "release_msid": "3", + "release_mbid": "3", "artist_name": "artist3", - "artist_msid": 3, - "artist_mbids": 3, + "artist_msid": "3", + "artist_mbids": ["3"], "count": 7 }, - { - "user_name": "user2", - "release_name": "", - "release_msid": 4, - "release_mbid": 4, - "artist_name": "artist4", - "artist_msid": 4, - "artist_mbids": 4, - "count": 2 - }, - { - "user_name": "user3", - "release_name": "", - "release_msid": 1, - "release_mbid": 1, - "artist_name": "artist1", - "artist_msid": 1, - "artist_mbids": 1, - "count": 2 - }, { "user_name": "user3", "release_name": "", - "release_msid": 1, - "release_mbid": 1, + "release_msid": "1", + "release_mbid": "1", "artist_name": "artist1", - "artist_msid": 1, - "artist_mbids": 1, + "artist_msid": "1", + "artist_mbids": ["1"], "count": 2 } ]