Skip to content

Commit

Permalink
LB-629: Prefer MBID over MSID if MBID is present during stats calcula…
Browse files Browse the repository at this point in the history
…tion. (#926)

* Ignore msid if mbid is present while calculating stats

* Update tests
  • Loading branch information
ishaanshah authored Jun 23, 2020
1 parent 684f293 commit ae37bdd
Show file tree
Hide file tree
Showing 12 changed files with 101 additions and 95 deletions.
2 changes: 1 addition & 1 deletion data/model/user_entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ class UserEntityStatMessage(pydantic.BaseModel):
stats_range: str # The range for which the stats are calculated, i.e week, month, year or all_time
from_ts: int
to_ts: int
data: List[Union[UserArtistRecord, UserReleaseRecord, UserRecordingRecord]]
data: List[Union[UserRecordingRecord, UserReleaseRecord, UserArtistRecord]]
count: int
2 changes: 1 addition & 1 deletion data/model/user_release_stat.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class UserReleaseRecord(pydantic.BaseModel):
artist_mbids: List[str] = []
release_mbid: Optional[str]
release_msid: Optional[str]
release_name: Optional[str] # TODO: Make this required once https://tickets.metabrainz.org/browse/LB-621 is fixed
release_name: str
listen_count: int
artist_name: str

Expand Down
5 changes: 4 additions & 1 deletion listenbrainz_spark/stats/user/artist.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ def get_artists(table):
result = run_query("""
SELECT user_name
, artist_name
, nullif(artist_msid, '') as artist_msid
, CASE
WHEN cardinality(artist_mbids) > 0 THEN NULL
ELSE nullif(artist_msid, '')
END as artist_msid
, artist_mbids
, count(artist_name) as listen_count
FROM {table}
Expand Down
15 changes: 12 additions & 3 deletions listenbrainz_spark/stats/user/recording.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,22 @@ def get_recordings(table):
result = run_query("""
SELECT user_name
, track_name
, nullif(recording_msid, '') as recording_msid
, CASE
WHEN recording_mbid IS NOT NULL AND recording_mbid != '' THEN NULL
ELSE nullif(recording_msid, '')
END as recording_msid
, nullif(recording_mbid, '') as recording_mbid
, artist_name
, nullif(artist_msid, '') as artist_msid
, CASE
WHEN cardinality(artist_mbids) > 0 THEN NULL
ELSE nullif(artist_msid, '')
END as artist_msid
, artist_mbids
, nullif(release_name, '') as release_name
, nullif(release_msid, '') as release_msid
, CASE
WHEN release_mbid IS NOT NULL AND release_mbid != '' THEN NULL
ELSE nullif(release_msid, '')
END as release_msid
, nullif(release_mbid, '') as release_mbid
, count(track_name) as listen_count
FROM {}
Expand Down
10 changes: 8 additions & 2 deletions listenbrainz_spark/stats/user/release.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,16 @@ def get_releases(table):
result = run_query("""
SELECT user_name
, nullif(release_name, '') as release_name
, nullif(release_msid, '') as release_msid
, CASE
WHEN release_mbid IS NOT NULL AND release_mbid != '' THEN NULL
ELSE nullif(release_msid, '')
END as release_msid
, nullif(release_mbid, '') as release_mbid
, artist_name
, nullif(artist_msid, '') as artist_msid
, CASE
WHEN cardinality(artist_mbids) > 0 THEN NULL
ELSE nullif(artist_msid, '')
END as artist_msid
, artist_mbids
, count(release_name) as listen_count
FROM {}
Expand Down
9 changes: 7 additions & 2 deletions listenbrainz_spark/stats/user/tests/test_artist.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@
from datetime import datetime

import listenbrainz_spark.stats.user.artist as artist_stats
from data.model.user_artist_stat import UserArtistRecord
from listenbrainz_spark import utils
from listenbrainz_spark.path import LISTENBRAINZ_DATA_DIRECTORY
from listenbrainz_spark.tests import SparkTestCase
from pyspark.sql import Row
from pyspark.sql.types import (ArrayType, StringType, StructField,
StructType)


class ArtistTestCase(SparkTestCase):
Expand All @@ -25,13 +28,15 @@ def save_dataframe(self):
with open(self.path_to_data_file('user_top_artists.json')) as f:
data = json.load(f)

schema = StructType((StructField('user_name', StringType()), StructField('artist_name', StringType()),
StructField('artist_msid', StringType()), StructField('artist_mbids', ArrayType(StringType()))))
df = None
for entry in data:
for idx in range(0, entry['count']):
# Assign listened_at to each listen
row = utils.create_dataframe(Row(user_name=entry['user_name'], artist_name=entry['artist_name'],
artist_msid=entry['artist_msid'], artist_mbids=entry['artist_mbids']),
schema=None)
schema=schema)
df = df.union(row) if df else row

utils.save_parquet(df, os.path.join(self.path_, '{}/{}.parquet'.format(now.year, now.month)))
Expand All @@ -48,7 +53,7 @@ def test_get_artists(self):
for entry in data:
expected[entry['user_name']].append({
'artist_name': entry['artist_name'],
'artist_msid': entry['artist_msid'] or None,
'artist_msid': entry['artist_msid'] or None if len(entry['artist_mbids']) == 0 else None,
'artist_mbids': entry['artist_mbids'],
'listen_count': entry['count']
})
Expand Down
13 changes: 10 additions & 3 deletions listenbrainz_spark/stats/user/tests/test_recording.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from listenbrainz_spark.path import LISTENBRAINZ_DATA_DIRECTORY
from listenbrainz_spark.tests import SparkTestCase
from pyspark.sql import Row
from pyspark.sql.types import (ArrayType, StringType, StructField,
StructType)


class RecordingTestCase(SparkTestCase):
Expand All @@ -25,6 +27,11 @@ def save_dataframe(self, filename):
with open(self.path_to_data_file(filename)) as f:
data = json.load(f)

schema = StructType((StructField('user_name', StringType()), StructField('artist_name', StringType()),
StructField('artist_msid', StringType()), StructField('artist_mbids', ArrayType(StringType())),
StructField('release_name', StringType()), StructField('release_msid', StringType()),
StructField('release_mbid', StringType()), StructField('track_name', StringType()),
StructField('recording_mbid', StringType()), StructField('recording_msid', StringType())))
df = None
for entry in data:
for idx in range(0, entry['count']):
Expand All @@ -51,13 +58,13 @@ def test_get_recordings(self):
for entry in data:
expected[entry['user_name']].append({
'track_name': entry['track_name'],
'recording_msid': entry['recording_msid'] or None,
'recording_msid': entry['recording_msid'] or None if entry['recording_mbid'] == "" else None,
'recording_mbid': entry['recording_mbid'] or None,
'release_name': entry['release_name'] or None,
'release_msid': entry['release_msid'] or None,
'release_msid': entry['release_msid'] or None if entry['release_mbid'] == "" else None,
'release_mbid': entry['release_mbid'] or None,
'artist_name': entry['artist_name'],
'artist_msid': entry['artist_msid'] or None,
'artist_msid': entry['artist_msid'] or None if len(entry['artist_mbids']) == 0 else None,
'artist_mbids': entry['artist_mbids'],
'listen_count': entry['count']
})
Expand Down
12 changes: 9 additions & 3 deletions listenbrainz_spark/stats/user/tests/test_release.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from listenbrainz_spark.path import LISTENBRAINZ_DATA_DIRECTORY
from listenbrainz_spark.tests import SparkTestCase
from pyspark.sql import Row
from pyspark.sql.types import (ArrayType, StringType, StructField,
StructType)


class releaseTestCase(SparkTestCase):
Expand All @@ -25,6 +27,10 @@ def save_dataframe(self, filename):
with open(self.path_to_data_file(filename)) as f:
data = json.load(f)

schema = StructType((StructField('user_name', StringType()), StructField('artist_name', StringType()),
StructField('artist_msid', StringType()), StructField('artist_mbids', ArrayType(StringType())),
StructField('release_name', StringType()), StructField('release_msid', StringType()),
StructField('release_mbid', StringType())))
df = None
for entry in data:
for idx in range(0, entry['count']):
Expand All @@ -33,7 +39,7 @@ def save_dataframe(self, filename):
release_msid=entry['release_msid'], release_mbid=entry['release_mbid'],
artist_name=entry['artist_name'], artist_msid=entry['artist_msid'],
artist_mbids=entry['artist_mbids']),
schema=None)
schema=schema)
df = df.union(row) if df else row

utils.save_parquet(df, os.path.join(self.path_, '{}/{}.parquet'.format(now.year, now.month)))
Expand All @@ -51,10 +57,10 @@ def test_get_releases(self):
if entry['release_name'] != '':
expected[entry['user_name']].append({
'release_name': entry['release_name'],
'release_msid': entry['release_msid'] or None,
'release_msid': entry['release_msid'] or None if entry['release_mbid'] == "" else None,
'release_mbid': entry['release_mbid'] or None,
'artist_name': entry['artist_name'],
'artist_msid': entry['artist_msid'] or None,
'artist_msid': entry['artist_msid'] or None if len(entry['artist_mbids']) == 0 else None,
'artist_mbids': entry['artist_mbids'],
'listen_count': entry['count']
})
Expand Down
14 changes: 7 additions & 7 deletions listenbrainz_spark/testdata/user_top_artists.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,49 +3,49 @@
"user_name": "user1",
"artist_name": "artist1",
"artist_msid": "1",
"artist_mbids": "1",
"artist_mbids": [],
"count": 8
},
{
"user_name": "user1",
"artist_name": "artist2",
"artist_msid": "",
"artist_mbids": "2",
"artist_mbids": ["2"],
"count": 3
},
{
"user_name": "user1",
"artist_name": "artist3",
"artist_msid": "3",
"artist_mbids": "3",
"artist_mbids": ["3"],
"count": 4
},
{
"user_name": "user2",
"artist_name": "artist1",
"artist_msid": "",
"artist_mbids": "1",
"artist_mbids": [""],
"count": 4
},
{
"user_name": "user2",
"artist_name": "artist3",
"artist_msid": "3",
"artist_mbids": "3",
"artist_mbids": ["3"],
"count": 7
},
{
"user_name": "user2",
"artist_name": "artist4",
"artist_msid": "",
"artist_mbids": "4",
"artist_mbids": ["4"],
"count": 2
},
{
"user_name": "user3",
"artist_name": "artist1",
"artist_msid": "1",
"artist_mbids": "1",
"artist_mbids": [],
"count": 2
}
]
24 changes: 12 additions & 12 deletions listenbrainz_spark/testdata/user_top_recordings.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
"recording_mbid": "1",
"release_name": "release1",
"release_msid": "1",
"release_mbid": "1",
"release_mbid": "",
"artist_name": "artist1",
"artist_msid": "1",
"artist_mbids": "1",
"artist_mbids": ["1"],
"count": 8
},
{
Expand All @@ -22,20 +22,20 @@
"release_mbid": "",
"artist_name": "artist1",
"artist_msid": "",
"artist_mbids": "1",
"artist_mbids": ["1"],
"count": 2
},
{
"user_name": "user1",
"track_name": "track2",
"recording_msid": "2",
"recording_mbid": "2",
"recording_mbid": "",
"release_name": "release2",
"release_msid": "2",
"release_mbid": "2",
"artist_name": "artist2",
"artist_msid": "2",
"artist_mbids": "2",
"artist_mbids": ["2"],
"count": 3
},
{
Expand All @@ -48,7 +48,7 @@
"release_mbid": "",
"artist_name": "artist3",
"artist_msid": "",
"artist_mbids": "3",
"artist_mbids": ["3"],
"count": 4
},
{
Expand All @@ -61,7 +61,7 @@
"release_mbid": "1",
"artist_name": "artist1",
"artist_msid": "",
"artist_mbids": "1",
"artist_mbids": ["1"],
"count": 4
},
{
Expand All @@ -74,7 +74,7 @@
"release_mbid": "",
"artist_name": "artist3",
"artist_msid": "",
"artist_mbids": "3",
"artist_mbids": ["3"],
"count": 7
},
{
Expand All @@ -84,10 +84,10 @@
"recording_mbid": "4",
"release_name": "release4",
"release_msid": "4",
"release_mbid": "4",
"release_mbid": "",
"artist_name": "artist4",
"artist_msid": "4",
"artist_mbids": "4",
"artist_mbids": ["4"],
"count": 2
},
{
Expand All @@ -100,7 +100,7 @@
"release_mbid": "",
"artist_name": "artist1",
"artist_msid": "1",
"artist_mbids": "1",
"artist_mbids": [""],
"count": 2
},
{
Expand All @@ -113,7 +113,7 @@
"release_mbid": "1",
"artist_name": "artist1",
"artist_msid": "",
"artist_mbids": "1",
"artist_mbids": ["1"],
"count": 1
}
]
Loading

0 comments on commit ae37bdd

Please sign in to comment.