Split ArticleSet into CandidateSet and RecommendationList (#156)
Differentiating these two types allows us to add attributes and methods
to each that are specific to either manipulating pools of items or
creating ordered lists of items to be recommended, rather than relying
on Python primitives and built-ins to create and manipulate lists of
articles.

Depends on CCRI-POPROX/poprox-concepts#39
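
For orientation, here is a minimal sketch of what the two domain types might look like after the split. The field names (articles, scores, embeddings) come from how the components in this diff use them; the real models are defined in poprox-concepts (see the dependency above), so treat the class bodies, defaults, and the Article fields below as illustrative assumptions rather than the actual definitions.

    # Illustrative sketch only -- the real models live in poprox-concepts.
    from dataclasses import dataclass, field
    from uuid import UUID

    @dataclass
    class Article:
        article_id: UUID                  # assumed minimal shape; the real Article has more fields
        external_id: str | None = None

    @dataclass
    class CandidateSet:
        # Unordered pool of items under consideration; upstream components may
        # attach per-article model outputs such as scores and embeddings.
        articles: list[Article]
        scores: list[float] | None = None
        embeddings: object | None = None  # e.g. a tensor of shape (n_articles, dim)

    @dataclass
    class RecommendationList:
        # Ordered list of articles to present to a reader.
        articles: list[Article] = field(default_factory=list)

A component that consumes a pool and produces an ordered slate now reads as CandidateSet in, RecommendationList out, which is exactly the signature change repeated throughout the diff below.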
karlhigley authored Feb 10, 2025
1 parent 41788b3 commit 50abbea
Showing 30 changed files with 1,875 additions and 2,150 deletions.
3,651 changes: 1,682 additions & 1,969 deletions pixi.lock

Large diffs are not rendered by default.

@@ -1,6 +1,7 @@
 import torch as th

-from poprox_concepts import ArticleSet, InterestProfile
+from poprox_concepts import CandidateSet, InterestProfile
+from poprox_concepts.domain import RecommendationList
 from poprox_recommender.components.diversifiers.calibration import Calibrator
 from poprox_recommender.topics import extract_locality, normalized_category_count

@@ -12,7 +13,7 @@ class LocalityCalibrator(Calibrator):
     def __init__(self, theta: float = 0.1, num_slots=10):
         super().__init__(theta, num_slots)

-    def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
+    def __call__(self, candidate_articles: CandidateSet, interest_profile: InterestProfile) -> RecommendationList:
         normalized_locality_prefs = normalized_category_count(interest_profile.click_locality_counts)

         if candidate_articles.scores is not None:
@@ -29,7 +30,7 @@ def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
             self.theta,
             topk=self.num_slots,
         )
-        return ArticleSet(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])
+        return RecommendationList(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])

     def add_article_to_categories(self, rec_categories, article):
         locality_list = extract_locality(article)
17 changes: 9 additions & 8 deletions src/poprox_recommender/components/diversifiers/mmr.py
@@ -1,7 +1,7 @@
 import torch
 from lenskit.pipeline import Component

-from poprox_concepts import ArticleSet, InterestProfile
+from poprox_concepts.domain import CandidateSet, InterestProfile, RecommendationList
 from poprox_recommender.pytorch.datachecks import assert_tensor_size
 from poprox_recommender.pytorch.decorators import torch_inference

@@ -12,16 +12,17 @@ def __init__(self, theta: float = 0.8, num_slots: int = 10):
         self.num_slots = num_slots

     @torch_inference
-    def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
+    def __call__(self, candidate_articles: CandidateSet, interest_profile: InterestProfile) -> RecommendationList:
         if candidate_articles.scores is None:
-            return candidate_articles
+            recommended = candidate_articles.articles
+        else:
+            similarity_matrix = compute_similarity_matrix(candidate_articles.embeddings)

-        similarity_matrix = compute_similarity_matrix(candidate_articles.embeddings)
+            scores = torch.as_tensor(candidate_articles.scores).to(similarity_matrix.device)
+            article_indices = mmr_diversification(scores, similarity_matrix, theta=self.theta, topk=self.num_slots)
+            recommended = [candidate_articles.articles[int(idx)] for idx in article_indices]

-        scores = torch.as_tensor(candidate_articles.scores).to(similarity_matrix.device)
-        article_indices = mmr_diversification(scores, similarity_matrix, theta=self.theta, topk=self.num_slots)
-
-        return ArticleSet(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])
+        return RecommendationList(articles=recommended)


 def compute_similarity_matrix(todays_article_vectors):
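
The mmr_diversification helper itself is unchanged and not shown in this hunk. For readers skimming the diff, here is a minimal sketch of the greedy maximal marginal relevance selection it performs, with theta trading off relevance against redundancy; the function name, exact tie-breaking, and tensor handling below are assumptions, not the repository's implementation.

    import torch

    def greedy_mmr(scores: torch.Tensor, similarity: torch.Tensor, theta: float = 0.8, topk: int = 10) -> list[int]:
        # Greedy MMR sketch: at each step pick the candidate that maximizes
        # theta * relevance - (1 - theta) * max similarity to already-selected items.
        selected: list[int] = []
        remaining = set(range(scores.shape[0]))
        while remaining and len(selected) < topk:
            best_idx, best_val = None, float("-inf")
            for i in remaining:
                redundancy = max((float(similarity[i, j]) for j in selected), default=0.0)
                value = theta * float(scores[i]) - (1 - theta) * redundancy
                if value > best_val:
                    best_idx, best_val = i, value
            selected.append(best_idx)
            remaining.remove(best_idx)
        return selected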
42 changes: 22 additions & 20 deletions src/poprox_recommender/components/diversifiers/pfar.py
@@ -3,7 +3,7 @@
 import torch as th
 from lenskit.pipeline import Component

-from poprox_concepts import Article, ArticleSet, InterestProfile
+from poprox_concepts.domain import Article, CandidateSet, InterestProfile, RecommendationList
 from poprox_recommender.pytorch.decorators import torch_inference
 from poprox_recommender.topics import GENERAL_TOPICS, extract_general_topics, normalized_category_count

@@ -15,33 +15,35 @@ def __init__(self, lambda_: float = 1.0, tau: float | None = None, num_slots: int = 10):
         self.num_slots = num_slots

     @torch_inference
-    def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
+    def __call__(self, candidate_articles: CandidateSet, interest_profile: InterestProfile) -> RecommendationList:
         if candidate_articles.scores is None:
-            return candidate_articles
+            articles = candidate_articles.articles
+        else:
+            article_scores = th.sigmoid(th.tensor(candidate_articles.scores)).cpu().detach().numpy()

-        article_scores = th.sigmoid(th.tensor(candidate_articles.scores)).cpu().detach().numpy()
+            topic_preferences: dict[str, int] = {}

-        topic_preferences: dict[str, int] = {}
+            for interest in interest_profile.onboarding_topics:
+                topic_preferences[interest.entity_name] = max(interest.preference - 1, 0)

-        for interest in interest_profile.onboarding_topics:
-            topic_preferences[interest.entity_name] = max(interest.preference - 1, 0)
+            if interest_profile.click_topic_counts:
+                for topic, click_count in interest_profile.click_topic_counts.items():
+                    topic_preferences[topic] = click_count

-        if interest_profile.click_topic_counts:
-            for topic, click_count in interest_profile.click_topic_counts.items():
-                topic_preferences[topic] = click_count
+            normalized_topic_prefs = normalized_category_count(topic_preferences)

-        normalized_topic_prefs = normalized_category_count(topic_preferences)
+            article_indices = pfar_diversification(
+                article_scores,
+                candidate_articles.articles,
+                normalized_topic_prefs,
+                self.lambda_,
+                self.tau,
+                topk=self.num_slots,
+            )

-        article_indices = pfar_diversification(
-            article_scores,
-            candidate_articles.articles,
-            normalized_topic_prefs,
-            self.lambda_,
-            self.tau,
-            topk=self.num_slots,
-        )
+            articles = [candidate_articles.articles[int(idx)] for idx in article_indices]

-        return ArticleSet(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])
+        return RecommendationList(articles=articles)


 def pfar_diversification(relevance_scores, articles, topic_preferences, lamb, tau, topk) -> list[Article]:
@@ -2,7 +2,8 @@

 import torch as th

-from poprox_concepts import ArticleSet, InterestProfile
+from poprox_concepts import CandidateSet, InterestProfile
+from poprox_concepts.domain import RecommendationList
 from poprox_recommender.components.diversifiers.calibration import Calibrator
 from poprox_recommender.topics import extract_general_topics, normalized_category_count

@@ -11,7 +12,7 @@
 # to rerank recommendations according to
 # topic calibration
 class TopicCalibrator(Calibrator):
-    def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
+    def __call__(self, candidate_articles: CandidateSet, interest_profile: InterestProfile) -> RecommendationList:
         normalized_topic_prefs = self.compute_topic_dist(interest_profile)

         if candidate_articles.scores is not None:
@@ -29,7 +30,7 @@ def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
             topk=self.num_slots,
         )

-        return ArticleSet(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])
+        return RecommendationList(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])

     def compute_topic_dist(self, interest_profile):
         topic_preferences: dict[str, int] = defaultdict(int)
12 changes: 6 additions & 6 deletions src/poprox_recommender/components/embedders/article.py
@@ -9,7 +9,7 @@
 from safetensors.torch import load_file
 from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast

-from poprox_concepts import ArticleSet
+from poprox_concepts import CandidateSet
 from poprox_recommender.model import ModelConfig
 from poprox_recommender.model.nrms.news_encoder import NewsEncoder
 from poprox_recommender.paths import model_file_path
@@ -58,7 +58,7 @@ def __init__(self, model_path: PathLike, device: str | None):
         self.embedding_cache = {}

     @torch_inference
-    def __call__(self, article_set: ArticleSet) -> ArticleSet:
+    def __call__(self, article_set: CandidateSet) -> CandidateSet:
         if not article_set.articles:
             article_set.embeddings = th.zeros((0, self.news_encoder.embedding_size))  # type: ignore
             return article_set
@@ -116,21 +116,21 @@ def __call__(self, article_set: ArticleSet) -> ArticleSet:

 class EmbeddingCopier(Component):
     @torch_inference
-    def __call__(self, candidate_set: ArticleSet, selected_set: ArticleSet) -> ArticleSet:
+    def __call__(self, candidate_set: CandidateSet, selected_set: CandidateSet) -> CandidateSet:
         """
         Copies article embeddings from a candidate set to a set of selected/recommended articles

         Parameters
         ----------
-        candidate_set : ArticleSet
+        candidate_set : CandidateSet
             A set of candidate articles with the `.embeddings` property filled in
             (e.g. with ArticleEmbedder)
-        selected_set : ArticleSet
+        selected_set : CandidateSet
             A set of selected or recommended articles chosen from `candidate_set`

         Returns
         -------
-        ArticleSet
+        CandidateSet
             selected_set with `.embeddings` set using the embeddings from `candidate_set`
         """
         candidate_article_ids = [article.article_id for article in candidate_set.articles]
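
The body of EmbeddingCopier is collapsed in this view; based on the docstring above, the copy-by-id idea is roughly the following. This is a sketch under assumptions about how embeddings are indexed, not the component's actual body.

    import torch as th

    def copy_embeddings_sketch(candidate_set, selected_set):
        # Index candidate embeddings by article_id, then gather the rows that
        # correspond to the selected articles, in the selected order.
        row_by_id = {article.article_id: i for i, article in enumerate(candidate_set.articles)}
        rows = [row_by_id[article.article_id] for article in selected_set.articles]
        selected_set.embeddings = candidate_set.embeddings[th.tensor(rows, dtype=th.long)]
        return selected_set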
14 changes: 7 additions & 7 deletions src/poprox_recommender/components/embedders/topic_wise_user.py
@@ -3,7 +3,7 @@

 import torch as th

-from poprox_concepts import Article, ArticleSet, Click, InterestProfile
+from poprox_concepts import Article, CandidateSet, Click, InterestProfile
 from poprox_recommender.components.embedders import NRMSArticleEmbedder, NRMSUserEmbedder
 from poprox_recommender.paths import model_file_path
 from poprox_recommender.pytorch.decorators import torch_inference
@@ -116,7 +116,7 @@ def virtual_clicks(onboarding_topics, topic_articles):

 class UserOnboardingEmbedder(NRMSUserEmbedder):
     article_embedder: NRMSArticleEmbedder
-    embedded_topic_articles: ArticleSet | None = None
+    embedded_topic_articles: CandidateSet | None = None

     def __init__(self, *args, embedding_source: str = "static", topic_embedding: str = "nrms", **kwargs):
         super().__init__(*args, **kwargs)
@@ -128,10 +128,10 @@ def __init__(self, *args, embedding_source: str = "static", topic_embedding: str = "nrms", **kwargs):

     @torch_inference
     def __call__(
-        self, candidate_articles: ArticleSet, clicked_articles: ArticleSet, interest_profile: InterestProfile
+        self, candidate_articles: CandidateSet, clicked_articles: CandidateSet, interest_profile: InterestProfile
     ) -> InterestProfile:
         if self.embedded_topic_articles is None:
-            self.embedded_topic_articles = self.article_embedder(ArticleSet(articles=TOPIC_ARTICLES))
+            self.embedded_topic_articles = self.article_embedder(CandidateSet(articles=TOPIC_ARTICLES))

         topic_embeddings_by_uuid = {
             article.article_id: embedding
@@ -182,15 +182,15 @@ def __call__(

         return interest_profile

-    def build_article_lookup(self, article_set: ArticleSet):
+    def build_article_lookup(self, article_set: CandidateSet):
         embedding_lookup = {}
         for article, article_vector in zip(article_set.articles, article_set.embeddings, strict=True):
             if article.article_id not in embedding_lookup:
                 embedding_lookup[article.article_id] = article_vector

         return embedding_lookup

-    def build_embeddings_from_articles(self, articles: ArticleSet, topic_articles: list[Article]):
+    def build_embeddings_from_articles(self, articles: CandidateSet, topic_articles: list[Article]):
         topic_uuids_by_name = {article.external_id: article.article_id for article in topic_articles}

         topic_embeddings_by_uuid = {}
@@ -220,7 +220,7 @@ def find_topical_articles(self, topic: str, articles: list[Article]) -> list[Article]:
         return topical_articles

     def build_embeddings_from_definitions(self):
-        topic_article_set = self.article_embedder(ArticleSet(articles=TOPIC_ARTICLES))
+        topic_article_set = self.article_embedder(CandidateSet(articles=TOPIC_ARTICLES))

         topic_embeddings_by_uuid = {
             article.article_id: embedding for article, embedding in zip(TOPIC_ARTICLES, topic_article_set.embeddings)
4 changes: 2 additions & 2 deletions src/poprox_recommender/components/embedders/user.py
@@ -4,7 +4,7 @@
 from lenskit.pipeline import Component
 from safetensors.torch import load_file

-from poprox_concepts import ArticleSet, Click, InterestProfile
+from poprox_concepts import CandidateSet, Click, InterestProfile
 from poprox_recommender.model import ModelConfig
 from poprox_recommender.model.nrms.user_encoder import UserEncoder
 from poprox_recommender.pytorch.decorators import torch_inference
@@ -23,7 +23,7 @@ def __init__(self, model_path: PathLike, device: str = "cpu", max_clicks_per_use
         self.user_encoder.to(device)

     @torch_inference
-    def __call__(self, clicked_articles: ArticleSet, interest_profile: InterestProfile) -> InterestProfile:
+    def __call__(self, clicked_articles: CandidateSet, interest_profile: InterestProfile) -> InterestProfile:
         if len(clicked_articles.articles) == 0:
             interest_profile.embedding = None
         else:
6 changes: 3 additions & 3 deletions src/poprox_recommender/components/filters/topic.py
@@ -2,13 +2,13 @@

 from lenskit.pipeline import Component

-from poprox_concepts import ArticleSet, InterestProfile
+from poprox_concepts import CandidateSet, InterestProfile

 logger = logging.getLogger(__name__)


 class TopicFilter(Component):
-    def __call__(self, candidate: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
+    def __call__(self, candidate: CandidateSet, interest_profile: InterestProfile) -> CandidateSet:
         # Preference values from onboarding are 1-indexed, where 1 means "absolutely no interest."
         # We might want to normalize them to 0-indexed somewhere upstream, but in the mean time
         # this is one of the simpler ways to filter out topics people aren't interested in from
@@ -32,4 +32,4 @@ def __call__(self, candidate: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
             len(candidate.articles),
             interest_profile.profile_id,
         )
-        return ArticleSet(articles=topical_articles)
+        return CandidateSet(articles=topical_articles)
8 changes: 4 additions & 4 deletions src/poprox_recommender/components/joiners/concat.py
@@ -1,10 +1,10 @@
 from lenskit.pipeline import Component

-from poprox_concepts import ArticleSet
+from poprox_concepts.domain import RecommendationList


 class Concatenate(Component):
-    def __call__(self, candidates1: ArticleSet, candidates2: ArticleSet) -> ArticleSet:
+    def __call__(self, recs1: RecommendationList, recs2: RecommendationList) -> RecommendationList:
         """
         Concatenates two sets of candidates, while deduplicating them, keeping the
         first occurrence of each article (by id), and maintaining their original order.
@@ -15,7 +15,7 @@ def __call__(self, candidates1: ArticleSet, candidates2: ArticleSet) -> ArticleSet:
         the dict keys can be ignored and the dict values are the deduplicated candidates
         in reverse order. Reversing them one more time returns them to the original order.
         """
-        reverse_articles = reversed(candidates1.articles + candidates2.articles)
+        reverse_articles = reversed(recs1.articles + recs2.articles)
         articles = {article.article_id: article for article in reverse_articles}

-        return ArticleSet(articles=list(reversed(articles.values())))
+        return RecommendationList(articles=list(reversed(articles.values())))
17 changes: 10 additions & 7 deletions src/poprox_recommender/components/joiners/fill.py
@@ -1,16 +1,19 @@
 from lenskit.pipeline import Component
 from lenskit.pipeline.types import Lazy

-from poprox_concepts import ArticleSet
+from poprox_concepts import CandidateSet
+from poprox_concepts.domain import RecommendationList


 class Fill(Component):
     def __init__(self, num_slots: int, deduplicate: bool = True):
         self.num_slots = num_slots
         self.deduplicate = deduplicate

-    def __call__(self, candidates1: ArticleSet, candidates2: Lazy[ArticleSet]) -> ArticleSet:
-        articles = candidates1.articles
+    def __call__(
+        self, recs1: CandidateSet | RecommendationList, recs2: Lazy[CandidateSet | RecommendationList]
+    ) -> RecommendationList:
+        articles = recs1.articles

         if self.deduplicate:
             # Track the articles by their article_id
@@ -19,7 +22,7 @@ def __call__(self, candidates1: ArticleSet, candidates2: Lazy[ArticleSet]) -> ArticleSet:
             # Add articles from candidates2 only if they are not duplicates
             if len(articles) < self.num_slots:
                 new_articles = []
-                for article in candidates2.get().articles:
+                for article in recs2.get().articles:
                     # Check if the article is a duplicate based on article_id
                     if (article.article_id) not in existing_articles:
                         new_articles.append(article)
@@ -30,7 +33,7 @@ def __call__(self, candidates1: ArticleSet, candidates2: Lazy[ArticleSet]) -> ArticleSet:

             articles = articles + new_articles
         else:
-            articles = articles + candidates2.get().articles
+            articles = articles + recs2.get().articles

-        # Return the resulting ArticleSet, limiting the size to num_slots
-        return ArticleSet(articles=articles[: self.num_slots])
+        # Return the resulting RecommendationList, limiting the size to num_slots
+        return RecommendationList(articles=articles[: self.num_slots])
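
Aside from the wider input type, Fill's top-up behavior is unchanged; stripped of the Lazy plumbing, it amounts to something like the sketch below (the function name and plain-list interface are assumptions for illustration).

    def fill_sketch(primary: list, backfill: list, num_slots: int, deduplicate: bool = True) -> list:
        # Start from the primary articles and top up with backfill articles,
        # skipping duplicates by article_id when deduplication is enabled.
        articles = list(primary)
        if deduplicate:
            seen = {a.article_id for a in articles}
            for article in backfill:
                if len(articles) >= num_slots:
                    break
                if article.article_id not in seen:
                    articles.append(article)
                    seen.add(article.article_id)
        else:
            articles = articles + backfill
        return articles[:num_slots]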
8 changes: 4 additions & 4 deletions src/poprox_recommender/components/joiners/interleave.py
@@ -2,15 +2,15 @@

 from lenskit.pipeline import Component

-from poprox_concepts import ArticleSet
+from poprox_concepts.domain import RecommendationList


 class Interleave(Component):
-    def __call__(self, candidates1: ArticleSet, candidates2: ArticleSet) -> ArticleSet:
+    def __call__(self, recs1: RecommendationList, recs2: RecommendationList) -> RecommendationList:
         articles = []
-        for pair in zip_longest(candidates1.articles, candidates2.articles):
+        for pair in zip_longest(recs1.articles, recs2.articles):
             for article in pair:
                 if article is not None:
                     articles.append(article)

-        return ArticleSet(articles=articles)
+        return RecommendationList(articles=articles)
(Diffs for the remaining changed files are not rendered here.)
