add discovery mix
l-moamen committed Sep 26, 2023
1 parent baf4d84 commit 8ab2573
Showing 42 changed files with 1,682 additions and 2 deletions.
37 changes: 37 additions & 0 deletions .github/workflows/publish.yml
@@ -0,0 +1,37 @@
name: Pypi publish

on:
push:
branches:
- main

jobs:
build-n-publish:
name: Build and publish Python distributions to PyPI
runs-on: ubuntu-latest

steps:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.8'

- name: Check out code
uses: actions/checkout@v2

- name: Upgrade pip
run: python -m pip install --upgrade pip

- name: Install poetry
run: pip install poetry==1.5.1

- name: Install dependencies
run: poetry install --no-root

# - name: Build package
# run: python3 -m build
#
# - name: Publish distribution to PyPI
# uses: pypa/gh-action-pypi-publish@release/v1
# with:
# password: ${{ secrets.PYPI_API_TOKEN }}
34 changes: 34 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,34 @@
name: Unit tests

on:
push:
branches:
- main
pull_request:
branches:
- main

jobs:
build:
runs-on: ubuntu-latest

steps:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.8'

- name: Check out code
uses: actions/checkout@v2

- name: Upgrade pip
run: python -m pip install --upgrade pip

- name: Install poetry
run: pip install poetry==1.5.1

- name: Install dependencies
run: poetry install --no-root

- name: Run unit tests
run: PYTHONPATH=tidal_algorithmic_mixes:test poetry run pytest test/
3 changes: 3 additions & 0 deletions .gitignore
@@ -158,3 +158,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
/.python-version
/poetry.lock
*.pyc
19 changes: 17 additions & 2 deletions README.md
@@ -1,2 +1,17 @@
# tidal-algorithmic-mixes
Tidal algorithmic mixes
# Tidal algorithmic mixes

This repository contains the logic behind how TIDAL creates its algorithmic offline mixes:
how it combines different machine learning models with business rules to create
different mixes for different use cases, including personalized mixes
(like My Mix, My New Arrivals, and Daily Discovery) and non-personalized ones
(like Track Radio and Artist Radio).

- Make sure you have [pyenv](https://github.com/pyenv/pyenv) and [pyenv-virtualenv](https://github.com/pyenv/pyenv-virtualenv) installed on your local environment.
- Install Python 3.8.16 with pyenv: `pyenv install 3.8.16`.
- Set up a new virtual environment: `pyenv virtualenv 3.8.16 mixes`.
- Set the local pyenv version: `pyenv local mixes`.
- Activate the virtual environment: `pyenv activate mixes`.
- Upgrade the pip package installer: `pip install --upgrade pip`.
- Install poetry for package management: `pip install poetry==1.5.1`.
- Install dependencies from the lock file: `poetry install --no-root`.

53 changes: 53 additions & 0 deletions pyproject.toml
@@ -0,0 +1,53 @@
[project]
requires-python = ">=3.8"
classifiers = [
"Programming Language :: Python :: 3",
"License :: Apache License V 2.0",
"Operating System :: OS Independent",
]

[project.urls]
"GitHub" = "https://github.com/tidal-music/tidal-algorithmic-mixes"

[tool.poetry]
name = "tidal_algorithmic_mixes"
version = "0.0.1"
description = "common transformers used by the tidal personalization team."
authors = [
"Loay <[email protected]>",
"Jing <[email protected]>",
"Tao <[email protected]>",
"Thomas <[email protected]>",
"Yuhua [email protected]"
]

license = "Apache License V 2.0"
readme = "README.md"

[tool.poetry.dependencies]
python = ">=3.8.0"
pyspark = "3.4.0"
numpy = ">=1.16.4"
s3fs = "2022.11.0"
boto3 = "1.24.59"
pandas = ">=1.4.2"
great-expectations = "0.16.15"
scikit-learn = "1.1.1"
alphabet-detector = "0.0.7"
pyarrow = "7.0.0"
tidal-per-transformers = "0.0.4"
torch = "1.9.1"
mlflow = "2.1.1"

[tool.poetry.group.dev.dependencies]
pytest = "6.1.2"
coverage = ">=4.5.2"
pytest-cov = ">=2.6.1"
coveralls = ">=1.6.0"
mock = ">=2.0.0"
moto = ">=3.1.11"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Empty file added test/__init__.py
Empty file.
Empty file added test/discovery_mix/__init__.py
Empty file.
43 changes: 43 additions & 0 deletions test/discovery_mix/test_daily_update_transformation.py
@@ -0,0 +1,43 @@
from datetime import date

import tidal_algorithmic_mixes.utils.constants as c

from test.pyspark_test import PySparkTest
from tidal_algorithmic_mixes.discovery_mix.daily_update_transformation import DiscoveryMixDailyUpdateTransformation, \
DiscoveryMixDailyUpdateTransformationData


class DiscoveryMixDailyUpdateTransformationTestInterface(DiscoveryMixDailyUpdateTransformation):
def extract(self, *args, **kwargs):
...

def validate(self, *args, **kwargs):
...

def load(self, *args, **kwargs):
...


class DiscoveryMixDailyUpdateTest(PySparkTest):

def test_slicer(self):
mixes = self.spark.createDataFrame([
(0, [10, 11, 12, 13, 14, 15, 16]),
(1, [10, 11, 12, 13, 14, 15, 16]),
(2, [10, 11, 12, 13, 14, 15, 16]),
(3, [10, 11, 12, 13, 14, 15, 16])
], [c.USER, c.TRACKS])

runner = DiscoveryMixDailyUpdateTransformationTestInterface(self.spark)

runner._data = DiscoveryMixDailyUpdateTransformationData(mixes)

self.assertEqual(runner.slicer(mixes, date(2021, 2, 15), 1).collect()[0][c.TRACKS][0], 10)
self.assertEqual(runner.slicer(mixes, date(2021, 2, 18), 1).collect()[0][c.TRACKS][0], 13)
self.assertEqual(runner.slicer(mixes, date(2021, 2, 21), 1).collect()[0][c.TRACKS][0], 16)

def test_offset(self):
runner = DiscoveryMixDailyUpdateTransformationTestInterface(self.spark)
self.assertEqual(runner.offset(date(2021, 2, 15), 10), 0)
self.assertEqual(runner.offset(date(2021, 2, 18), 10), 30)
self.assertEqual(runner.offset(date(2021, 2, 21), 10), 60)
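The assertions above pin down the intended behaviour: `offset` advances by one slice of tracks per weekday (Monday starts at 0), and `slicer` returns the slice of each user's track list starting at that offset. A minimal sketch of that logic, assuming PySpark's `slice` function and that the constant `c.TRACKS` resolves to the column name `tracks`; the real `DiscoveryMixDailyUpdateTransformation` may be implemented differently:

```python
# Hypothetical reconstruction of the logic the assertions above imply;
# the real transformation may differ.
from datetime import date

import pyspark.sql.functions as F
from pyspark.sql import DataFrame


def offset(at_date: date, slice_size: int) -> int:
    # Monday (weekday 0) starts at 0, Thursday at 3 * slice_size, Sunday at 6 * slice_size.
    return at_date.weekday() * slice_size


def slicer(mixes: DataFrame, at_date: date, slice_size: int) -> DataFrame:
    # Spark's slice() is 1-indexed, so shift the computed offset by one.
    start = offset(at_date, slice_size) + 1
    return mixes.withColumn("tracks", F.slice("tracks", start, slice_size))
```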
55 changes: 55 additions & 0 deletions test/discovery_mix/test_observed_tracks_aggregator.py
@@ -0,0 +1,55 @@
import tidal_algorithmic_mixes.utils.constants as c
from pyspark_test import PySparkTest
from tidal_algorithmic_mixes.discovery_mix.observed_tracks_aggregator_transformation import \
ObservedDiscoveryMixTracksAggregatorTransformation, ObservedDiscoveryMixTracksAggregatorTransformationData


class ObservedDiscoveryMixTracksAggregatorTransformationTestInterface(
ObservedDiscoveryMixTracksAggregatorTransformation):
def extract(self, *args, **kwargs):
...

def validate(self, *args, **kwargs):
...

def load(self, *args, **kwargs):
...


class ObservedDiscoveryMixTracksAggregatorTest(PySparkTest):

def setUp(self):
super().setUp()

def test_transform(self):
user_1 = 26129743
user_2 = 43727840

user_1_mix = "5b5b0f74b66cbecf46de5f00297"
user_2_mix = "c71e7c0b5f8daeaff1bdea48f9f"

tracks_user_1 = [1, 2, 3, 4, 5, 6]
tracks_user_2 = [3, 4, 5, 6, 7, 9]

mixes = self.spark.createDataFrame([
(user_1_mix,),
(user_2_mix,),
], [c.MIX_ID])

observed_mixes = self.spark.createDataFrame([
(user_1_mix, user_1, tracks_user_1),
(user_2_mix, user_2, tracks_user_2),
("xvxfewfwsdf34r3sf3jfaae4tgs", 1664, [11, 22, 33, 44]),
("a71e7xffw4rzdzdf34zsz23ead3", 1984, [55, 66, 77, 11, 22]),
], [c.MIX_ID, c.USER, c.TRACKS])

runner = ObservedDiscoveryMixTracksAggregatorTransformationTestInterface(self.spark)
runner._data = ObservedDiscoveryMixTracksAggregatorTransformationData(observed_mixes=observed_mixes,
mixes=mixes)
runner.transform()
res = runner.output.output

self.assertEqual(res.columns, [c.USER, c.TRACK_GROUP])
self.assertEqual(res.count(), len(tracks_user_1) + len(tracks_user_2))

self.assertEqual([user_1, user_2], ([x[c.USER] for x in res.select(c.USER).distinct().collect()]))
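The expected output implies the aggregator keeps only observed mixes whose ID appears in the discovery-mix table, explodes each user's observed track list, and returns one `(user, trackGroup)` row per observed track. A rough sketch under the assumption that `c.MIX_ID`, `c.USER`, `c.TRACKS`, and `c.TRACK_GROUP` map to the column names used below; the actual `ObservedDiscoveryMixTracksAggregatorTransformation` may differ:

```python
# Rough sketch of the aggregation the test exercises; column-name strings
# stand in for the constants module values and may not match the real code.
import pyspark.sql.functions as F
from pyspark.sql import DataFrame


def aggregate_observed_tracks(observed_mixes: DataFrame, mixes: DataFrame) -> DataFrame:
    return (
        observed_mixes
        .join(mixes, on="mixId", how="inner")           # keep only known discovery mixes
        .withColumn("trackGroup", F.explode("tracks"))  # one row per observed track
        .select("user", "trackGroup")
    )
```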
141 changes: 141 additions & 0 deletions test/discovery_mix/test_post_processor.py
@@ -0,0 +1,141 @@
from datetime import datetime
from pyspark.sql.types import Row
from pyspark_test import PySparkTest
from tidal_algorithmic_mixes.discovery_mix.post_processor_transformation import\
DiscoveryMixPostProcessorTransformation, DiscoveryMixPostProcessorTransformationData


class DiscoveryMixPostProcessorTransformationTestInterface(DiscoveryMixPostProcessorTransformation):
def extract(self, *args, **kwargs):
...

def validate(self, *args, **kwargs):
...

def load(self, *args, **kwargs):
...


class DiscoveryMixPostProcessorTest(PySparkTest):

def setUp(self):
tracks_metadata = self.spark.createDataFrame([Row(id=1,
title='Chime again',
popularityWW=0,
trackNumber=16,
volumeNumber=1,
numAlbums=3,
explicit=False,
generatedFromVideo=False,
trackGroup='xxx',
audioQuality='LOSSLESS',
available=True,
version='x',
duration=192,
mixes={'x': 'y'},
mainArtistsIds=[1],
mainArtistsNames=['Me'],
mainArtistId=1,
mainArtistPicture='xxx',
featuringArtistsIds=[''],
albumId=1,
masterBundleId='x',
albumTitle='Victorian',
albumCover='be7c307bc938',
releaseDate=datetime(2010, 6, 8, 0, 0),
albumReleaseDate=datetime(2010, 6, 8, 0, 0),
creditsArtistId=[1],
creditsName=['La La'],
creditsRole=['Main Artist'],
creditsRoleCategory=['HIDDEN'],
numTrackStreams=0,
numTrackStreamers=0,
voicenessScore=0,
voice=1,
genre='Christmas',
originalGenre='Christmas',
AvailableCountryCodes=['AD', 'AE'])])
track_groups_metadata = self.spark.createDataFrame([Row(trackGroup='xxx',
AvailableCountryCodes=['AD', 'AE'])])

precomputed_recs = self.spark.createDataFrame([Row(user=1,
recommendations=['xxx', 'xyz'])])

user_history_tracks = self.spark.createDataFrame([Row(userId=1,
productId=2,
artistId=2,
trackGroup='xyz',
title="Don't Let The Sun Go Down On Me",
cleanedTitle="dd",
count=2,
source='UserTracksHistory',
dt=datetime(2020, 12, 21, 13, 3, 36, 534000))])
user_history_artists = self.spark.createDataFrame([Row(userId=1,
artistId=3,
count=10,
source='UserArtistsHistory',
dt=datetime(2022, 5, 2, 20, 28, 23, 516000))])
user_fav_tracks = self.spark.createDataFrame([Row(userId=1,
productId=5,
artistId=7,
trackGroup='aaa',
title='Breathing Underwater',
cleanedTitle='aa',
count=1,
source='UserTracksFavourite',
dt=datetime(2020, 10, 23, 6, 49, 33))])
user_fav_artists = self.spark.createDataFrame([Row(userId=1,
artistId=111,
count=1,
source='UserArtistsFavourite',
dt=datetime(2019, 11, 21, 13, 31, 11))])

artist_clusters = self.spark.createDataFrame([Row(artistId=1, cluster=42)])

user_observed_tracks = self.spark.createDataFrame([Row(userId=1,
productId=5,
artistId=7,
trackGroup='aaa',
title='Breathing Underwater',
cleanedTitle='aa',
count=1,
source='UserTracksDiscoveryObserved',
dt=datetime(2020, 10, 23, 6, 49, 33))])

user_table = self.spark.createDataFrame([Row(id=1, countrycode='AD')])

user_blacklist_table = self.spark.createDataFrame([Row(artifactId='111',
artifactType='TRACK',
created=1568546619349,
userId='3')])

artist_compound_mapping_table = self.spark.createDataFrame([Row(id=4,
artistid=5,
artistcompoundid=6,
priority=1,
mainartist=False)])

self.data = DiscoveryMixPostProcessorTransformationData(tracks_metadata,
track_groups_metadata,
precomputed_recs,
user_history_tracks,
user_history_artists,
user_fav_tracks,
user_fav_artists,
artist_clusters,
user_observed_tracks,
user_table,
user_blacklist_table,
artist_compound_mapping_table
)

def test_transform(self):
post_processor = DiscoveryMixPostProcessorTransformationTestInterface(self.spark,
threshold_known_artists=1,
mix_size=1,
min_mix_size=0)
post_processor._data = self.data
post_processor.transform()
res = post_processor.output.output.collect()[0]
self.assertEqual(Row(user=1, tracks=['xxx'], mixId='1f1451b3b417516e9e4b4423958', atDate=res.atDate),
res)
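The expected result suggests the post-processor expands each user's precomputed recommendations, drops track groups the user has already streamed, favourited, or observed (here `'xyz'` disappears because it occurs in `user_history_tracks`), and caps the mix at `mix_size`. A simplified, hypothetical outline of that filtering step only; the helper name and column names are assumptions, and the real transformation also handles country availability, blacklists, compound artists, and the `mixId`/`atDate` columns:

```python
# Hypothetical outline of the core filtering step; names are illustrative,
# not the library's actual API.
import pyspark.sql.functions as F
from pyspark.sql import DataFrame


def filter_known_tracks(precomputed_recs: DataFrame,
                        known_track_groups: DataFrame,
                        mix_size: int) -> DataFrame:
    """known_track_groups: (user, trackGroup) rows built from history, favourites and observed tracks."""
    return (
        precomputed_recs
        .withColumn("trackGroup", F.explode("recommendations"))
        # drop anything the user has already streamed, favourited, or observed
        .join(known_track_groups, ["user", "trackGroup"], "left_anti")
        .groupBy("user")
        .agg(F.slice(F.collect_list("trackGroup"), 1, mix_size).alias("tracks"))
    )
```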