add discovery mix
l-moamen committed Sep 26, 2023
1 parent baf4d84 commit 8ab2573
Showing 42 changed files with 1,682 additions and 2 deletions.
37 changes: 37 additions & 0 deletions .github/workflows/publish.yml
@@ -0,0 +1,37 @@
name: Pypi publish

on:
push:
branches:
- main

jobs:
build-n-publish:
name: Build and publish Python distributions to PyPI
runs-on: ubuntu-latest

steps:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.8'

- name: Check out code
uses: actions/checkout@v2

- name: Upgrade pip
run: python -m pip install --upgrade pip

- name: Install poetry
run: pip install poetry==1.5.1

- name: Install dependencies
run: poetry install --no-root

# - name: Build package
# run: python3 -m build
#
# - name: Publish distribution to PyPI
# uses: pypa/gh-action-pypi-publish@release/v1
# with:
# password: ${{ secrets.PYPI_API_TOKEN }}
34 changes: 34 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,34 @@
name: Unit tests

on:
push:
branches:
- main
pull_request:
branches:
- main

jobs:
build:
runs-on: ubuntu-latest

steps:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.8'

- name: Check out code
uses: actions/checkout@v2

- name: Upgrade pip
run: python -m pip install --upgrade pip

- name: Install poetry
run: pip install poetry==1.5.1

- name: Install dependencies
run: poetry install --no-root

- name: Run unit tests
run: PYTHONPATH=tidal_algorithmic_mixes:test poetry run pytest test/
3 changes: 3 additions & 0 deletions .gitignore
@@ -158,3 +158,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
/.python-version
/poetry.lock
*.pyc
19 changes: 17 additions & 2 deletions README.md
@@ -1,2 +1,17 @@
# tidal-algorithmic-mixes
Tidal algorithmic mixes
# Tidal algorithmic mixes

This repository contains the logic behind how TIDAL creates its algorithmic offline mixes:
how it combines different machine learning models with business rules to create
different mixes for different use cases, including personalized mixes
(like My Mix, My New Arrivals, and Daily Discovery) and non-personalized ones
(like Track Radio and Artist Radio).

- Make sure you have [pyenv](https://github.com/pyenv/pyenv) and [pyenv-virtualenv](https://github.com/pyenv/pyenv-virtualenv) installed on your local environment.
- Install Python 3.8.16 with pyenv: `pyenv install 3.8.16`.
- Set up a new virtual environment: `pyenv virtualenv 3.8.16 mixes`.
- Set the local pyenv version: `pyenv local mixes`.
- Activate the virtual environment: `pyenv activate mixes`.
- Upgrade the pip package installer: `pip install --upgrade pip`.
- Install poetry for package management: `pip install poetry==1.5.1`.
- Install dependencies from the lock file: `poetry install --no-root`.

53 changes: 53 additions & 0 deletions pyproject.toml
@@ -0,0 +1,53 @@
[project]
requires-python = ">=3.8"
classifiers = [
"Programming Language :: Python :: 3",
"License :: Apache License V 2.0",
"Operating System :: OS Independent",
]

[project.urls]
"GitHub" = "https://github.com/tidal-music/tidal-algorithmic-mixes"

[tool.poetry]
name = "tidal_algorithmic_mixes"
version = "0.0.1"
description = "common transformers used by the tidal personalization team."
authors = [
"Loay <[email protected]>",
"Jing <[email protected]>",
"Tao <[email protected]>",
"Thomas <[email protected]>",
"Yuhua [email protected]"
]

license = "Apache License V 2.0"
readme = "README.md"

[tool.poetry.dependencies]
python = ">=3.8.0"
pyspark = "3.4.0"
numpy = ">=1.16.4"
s3fs = "2022.11.0"
boto3 = "1.24.59"
pandas = ">=1.4.2"
great-expectations = "0.16.15"
scikit-learn = "1.1.1"
alphabet-detector = "0.0.7"
pyarrow = "7.0.0"
tidal-per-transformers = "0.0.4"
torch = "1.9.1"
mlflow = "2.1.1"

[tool.poetry.group.dev.dependencies]
pytest = "6.1.2"
coverage = ">=4.5.2"
pytest-cov = ">=2.6.1"
coveralls = ">=1.6.0"
mock = ">=2.0.0"
moto = ">=3.1.11"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Empty file added test/__init__.py
Empty file.
Empty file added test/discovery_mix/__init__.py
Empty file.
43 changes: 43 additions & 0 deletions test/discovery_mix/test_daily_update_transformation.py
@@ -0,0 +1,43 @@
from datetime import date

import tidal_algorithmic_mixes.utils.constants as c

from test.pyspark_test import PySparkTest
from tidal_algorithmic_mixes.discovery_mix.daily_update_transformation import DiscoveryMixDailyUpdateTransformation, \
DiscoveryMixDailyUpdateTransformationData


class DiscoveryMixDailyUpdateTransformationTestInterface(DiscoveryMixDailyUpdateTransformation):
def extract(self, *args, **kwargs):
...

def validate(self, *args, **kwargs):
...

def load(self, *args, **kwargs):
...


class DiscoveryMixDailyUpdateTest(PySparkTest):

def test_slicer(self):
mixes = self.spark.createDataFrame([
(0, [10, 11, 12, 13, 14, 15, 16]),
(1, [10, 11, 12, 13, 14, 15, 16]),
(2, [10, 11, 12, 13, 14, 15, 16]),
(3, [10, 11, 12, 13, 14, 15, 16])
], [c.USER, c.TRACKS])

runner = DiscoveryMixDailyUpdateTransformationTestInterface(self.spark)

runner._data = DiscoveryMixDailyUpdateTransformationData(mixes)

self.assertEqual(runner.slicer(mixes, date(2021, 2, 15), 1).collect()[0][c.TRACKS][0], 10)
self.assertEqual(runner.slicer(mixes, date(2021, 2, 18), 1).collect()[0][c.TRACKS][0], 13)
self.assertEqual(runner.slicer(mixes, date(2021, 2, 21), 1).collect()[0][c.TRACKS][0], 16)

def test_offset(self):
runner = DiscoveryMixDailyUpdateTransformationTestInterface(self.spark)
self.assertEqual(runner.offset(date(2021, 2, 15), 10), 0)
self.assertEqual(runner.offset(date(2021, 2, 18), 10), 30)
self.assertEqual(runner.offset(date(2021, 2, 21), 10), 60)
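The assertions above pin down the intended behaviour: `offset` advances by one slice of tracks per weekday (Monday starts at 0), and `slicer` returns the slice of each user's track list starting at that offset. A minimal sketch of that logic, assuming PySpark's `slice` function and that the constant `c.TRACKS` resolves to the column name `tracks`; the real `DiscoveryMixDailyUpdateTransformation` may be implemented differently:

```python
# Hypothetical reconstruction of the logic the assertions above imply;
# the real transformation may differ.
from datetime import date

import pyspark.sql.functions as F
from pyspark.sql import DataFrame


def offset(at_date: date, slice_size: int) -> int:
    # Monday (weekday 0) starts at 0, Thursday at 3 * slice_size, Sunday at 6 * slice_size.
    return at_date.weekday() * slice_size


def slicer(mixes: DataFrame, at_date: date, slice_size: int) -> DataFrame:
    # Spark's slice() is 1-indexed, so shift the computed offset by one.
    start = offset(at_date, slice_size) + 1
    return mixes.withColumn("tracks", F.slice("tracks", start, slice_size))
```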
55 changes: 55 additions & 0 deletions test/discovery_mix/test_observed_tracks_aggregator.py
@@ -0,0 +1,55 @@
import tidal_algorithmic_mixes.utils.constants as c
from pyspark_test import PySparkTest
from tidal_algorithmic_mixes.discovery_mix.observed_tracks_aggregator_transformation import \
ObservedDiscoveryMixTracksAggregatorTransformation, ObservedDiscoveryMixTracksAggregatorTransformationData


class ObservedDiscoveryMixTracksAggregatorTransformationTestInterface(
ObservedDiscoveryMixTracksAggregatorTransformation):
def extract(self, *args, **kwargs):
...

def validate(self, *args, **kwargs):
...

def load(self, *args, **kwargs):
...


class ObservedDiscoveryMixTracksAggregatorTest(PySparkTest):

def setUp(self):
super().setUp()

def test_transform(self):
user_1 = 26129743
user_2 = 43727840

user_1_mix = "5b5b0f74b66cbecf46de5f00297"
user_2_mix = "c71e7c0b5f8daeaff1bdea48f9f"

tracks_user_1 = [1, 2, 3, 4, 5, 6]
tracks_user_2 = [3, 4, 5, 6, 7, 9]

mixes = self.spark.createDataFrame([
(user_1_mix,),
(user_2_mix,),
], [c.MIX_ID])

observed_mixes = self.spark.createDataFrame([
(user_1_mix, user_1, tracks_user_1),
(user_2_mix, user_2, tracks_user_2),
("xvxfewfwsdf34r3sf3jfaae4tgs", 1664, [11, 22, 33, 44]),
("a71e7xffw4rzdzdf34zsz23ead3", 1984, [55, 66, 77, 11, 22]),
], [c.MIX_ID, c.USER, c.TRACKS])

runner = ObservedDiscoveryMixTracksAggregatorTransformationTestInterface(self.spark)
runner._data = ObservedDiscoveryMixTracksAggregatorTransformationData(observed_mixes=observed_mixes,
mixes=mixes)
runner.transform()
res = runner.output.output

self.assertEqual(res.columns, [c.USER, c.TRACK_GROUP])
self.assertEqual(res.count(), len(tracks_user_1) + len(tracks_user_2))

self.assertEqual([user_1, user_2], ([x[c.USER] for x in res.select(c.USER).distinct().collect()]))
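The expected output implies the aggregator keeps only observed mixes whose ID appears in the discovery-mix table, explodes each user's observed track list, and returns one `(user, trackGroup)` row per observed track. A rough sketch under the assumption that `c.MIX_ID`, `c.USER`, `c.TRACKS`, and `c.TRACK_GROUP` map to the column names used below; the actual `ObservedDiscoveryMixTracksAggregatorTransformation` may differ:

```python
# Rough sketch of the aggregation the test exercises; column-name strings
# stand in for the constants module values and may not match the real code.
import pyspark.sql.functions as F
from pyspark.sql import DataFrame


def aggregate_observed_tracks(observed_mixes: DataFrame, mixes: DataFrame) -> DataFrame:
    return (
        observed_mixes
        .join(mixes, on="mixId", how="inner")           # keep only known discovery mixes
        .withColumn("trackGroup", F.explode("tracks"))  # one row per observed track
        .select("user", "trackGroup")
    )
```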
141 changes: 141 additions & 0 deletions test/discovery_mix/test_post_processor.py
@@ -0,0 +1,141 @@
from datetime import datetime
from pyspark.sql.types import Row
from pyspark_test import PySparkTest
from tidal_algorithmic_mixes.discovery_mix.post_processor_transformation import\
DiscoveryMixPostProcessorTransformation, DiscoveryMixPostProcessorTransformationData


class DiscoveryMixPostProcessorTransformationTestInterface(DiscoveryMixPostProcessorTransformation):
def extract(self, *args, **kwargs):
...

def validate(self, *args, **kwargs):
...

def load(self, *args, **kwargs):
...


class DiscoveryMixPostProcessorTest(PySparkTest):

def setUp(self):
tracks_metadata = self.spark.createDataFrame([Row(id=1,
title='Chime again',
popularityWW=0,
trackNumber=16,
volumeNumber=1,
numAlbums=3,
explicit=False,
generatedFromVideo=False,
trackGroup='xxx',
audioQuality='LOSSLESS',
available=True,
version='x',
duration=192,
mixes={'x': 'y'},
mainArtistsIds=[1],
mainArtistsNames=['Me'],
mainArtistId=1,
mainArtistPicture='xxx',
featuringArtistsIds=[''],
albumId=1,
masterBundleId='x',
albumTitle='Victorian',
albumCover='be7c307bc938',
releaseDate=datetime(2010, 6, 8, 0, 0),
albumReleaseDate=datetime(2010, 6, 8, 0, 0),
creditsArtistId=[1],
creditsName=['La La'],
creditsRole=['Main Artist'],
creditsRoleCategory=['HIDDEN'],
numTrackStreams=0,
numTrackStreamers=0,
voicenessScore=0,
voice=1,
genre='Christmas',
originalGenre='Christmas',
AvailableCountryCodes=['AD', 'AE'])])
track_groups_metadata = self.spark.createDataFrame([Row(trackGroup='xxx',
AvailableCountryCodes=['AD', 'AE'])])

precomputed_recs = self.spark.createDataFrame([Row(user=1,
recommendations=['xxx', 'xyz'])])

user_history_tracks = self.spark.createDataFrame([Row(userId=1,
productId=2,
artistId=2,
trackGroup='xyz',
title="Don't Let The Sun Go Down On Me",
cleanedTitle="dd",
count=2,
source='UserTracksHistory',
dt=datetime(2020, 12, 21, 13, 3, 36, 534000))])
user_history_artists = self.spark.createDataFrame([Row(userId=1,
artistId=3,
count=10,
source='UserArtistsHistory',
dt=datetime(2022, 5, 2, 20, 28, 23, 516000))])
user_fav_tracks = self.spark.createDataFrame([Row(userId=1,
productId=5,
artistId=7,
trackGroup='aaa',
title='Breathing Underwater',
cleanedTitle='aa',
count=1,
source='UserTracksFavourite',
dt=datetime(2020, 10, 23, 6, 49, 33))])
user_fav_artists = self.spark.createDataFrame([Row(userId=1,
artistId=111,
count=1,
source='UserArtistsFavourite',
dt=datetime(2019, 11, 21, 13, 31, 11))])

artist_clusters = self.spark.createDataFrame([Row(artistId=1, cluster=42)])

user_observed_tracks = self.spark.createDataFrame([Row(userId=1,
productId=5,
artistId=7,
trackGroup='aaa',
title='Breathing Underwater',
cleanedTitle='aa',
count=1,
source='UserTracksDiscoveryObserved',
dt=datetime(2020, 10, 23, 6, 49, 33))])

user_table = self.spark.createDataFrame([Row(id=1, countrycode='AD')])

user_blacklist_table = self.spark.createDataFrame([Row(artifactId='111',
artifactType='TRACK',
created=1568546619349,
userId='3')])

artist_compound_mapping_table = self.spark.createDataFrame([Row(id=4,
artistid=5,
artistcompoundid=6,
priority=1,
mainartist=False)])

self.data = DiscoveryMixPostProcessorTransformationData(tracks_metadata,
track_groups_metadata,
precomputed_recs,
user_history_tracks,
user_history_artists,
user_fav_tracks,
user_fav_artists,
artist_clusters,
user_observed_tracks,
user_table,
user_blacklist_table,
artist_compound_mapping_table
)

def test_transform(self):
post_processor = DiscoveryMixPostProcessorTransformationTestInterface(self.spark,
threshold_known_artists=1,
mix_size=1,
min_mix_size=0)
post_processor._data = self.data
post_processor.transform()
res = post_processor.output.output.collect()[0]
self.assertEqual(Row(user=1, tracks=['xxx'], mixId='1f1451b3b417516e9e4b4423958', atDate=res.atDate),
res)
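The expected result suggests the post-processor expands each user's precomputed recommendations, drops track groups the user has already streamed, favourited, or observed (here `'xyz'` disappears because it occurs in `user_history_tracks`), and caps the mix at `mix_size`. A simplified, hypothetical outline of that filtering step only; the helper name and column names are assumptions, and the real transformation also handles country availability, blacklists, compound artists, and the `mixId`/`atDate` columns:

```python
# Hypothetical outline of the core filtering step; names are illustrative,
# not the library's actual API.
import pyspark.sql.functions as F
from pyspark.sql import DataFrame


def filter_known_tracks(precomputed_recs: DataFrame,
                        known_track_groups: DataFrame,
                        mix_size: int) -> DataFrame:
    """known_track_groups: (user, trackGroup) rows built from history, favourites and observed tracks."""
    return (
        precomputed_recs
        .withColumn("trackGroup", F.explode("recommendations"))
        # drop anything the user has already streamed, favourited, or observed
        .join(known_track_groups, ["user", "trackGroup"], "left_anti")
        .groupBy("user")
        .agg(F.slice(F.collect_list("trackGroup"), 1, mix_size).alias("tracks"))
    )
```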