Split ArticleSet into CandidateSet and RecommendationList (#156)
Differentiating these two types allows us to add attributes and methods
to each that are specific to either manipulating pools of items or
creating ordered lists of items to be recommended, rather than relying
on Python primitives and built-ins to create and manipulate lists of
articles.

Depends on CCRI-POPROX/poprox-concepts#39
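
For orientation, here is a minimal sketch of what the two domain types might look like after the split. The field names (articles, scores, embeddings) come from how the components in this diff use them; the real models are defined in poprox-concepts (see the dependency above), so treat the class bodies, defaults, and the Article fields below as illustrative assumptions rather than the actual definitions.

    # Illustrative sketch only -- the real models live in poprox-concepts.
    from dataclasses import dataclass, field
    from uuid import UUID

    @dataclass
    class Article:
        article_id: UUID                  # assumed minimal shape; the real Article has more fields
        external_id: str | None = None

    @dataclass
    class CandidateSet:
        # Unordered pool of items under consideration; upstream components may
        # attach per-article model outputs such as scores and embeddings.
        articles: list[Article]
        scores: list[float] | None = None
        embeddings: object | None = None  # e.g. a tensor of shape (n_articles, dim)

    @dataclass
    class RecommendationList:
        # Ordered list of articles to present to a reader.
        articles: list[Article] = field(default_factory=list)

A component that consumes a pool and produces an ordered slate now reads as CandidateSet in, RecommendationList out, which is exactly the signature change repeated throughout the diff below.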
karlhigley authored Feb 10, 2025
1 parent 41788b3 commit 50abbea
Showing 30 changed files with 1,875 additions and 2,150 deletions.
3,651 changes: 1,682 additions & 1,969 deletions pixi.lock

Large diffs are not rendered by default.

@@ -1,6 +1,7 @@
 import torch as th

-from poprox_concepts import ArticleSet, InterestProfile
+from poprox_concepts import CandidateSet, InterestProfile
+from poprox_concepts.domain import RecommendationList
 from poprox_recommender.components.diversifiers.calibration import Calibrator
 from poprox_recommender.topics import extract_locality, normalized_category_count

@@ -12,7 +13,7 @@ class LocalityCalibrator(Calibrator):
     def __init__(self, theta: float = 0.1, num_slots=10):
         super().__init__(theta, num_slots)

-    def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
+    def __call__(self, candidate_articles: CandidateSet, interest_profile: InterestProfile) -> RecommendationList:
         normalized_locality_prefs = normalized_category_count(interest_profile.click_locality_counts)

         if candidate_articles.scores is not None:
@@ -29,7 +30,7 @@ def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
             self.theta,
             topk=self.num_slots,
         )
-        return ArticleSet(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])
+        return RecommendationList(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])

     def add_article_to_categories(self, rec_categories, article):
         locality_list = extract_locality(article)
17 changes: 9 additions & 8 deletions src/poprox_recommender/components/diversifiers/mmr.py
@@ -1,7 +1,7 @@
 import torch
 from lenskit.pipeline import Component

-from poprox_concepts import ArticleSet, InterestProfile
+from poprox_concepts.domain import CandidateSet, InterestProfile, RecommendationList
 from poprox_recommender.pytorch.datachecks import assert_tensor_size
 from poprox_recommender.pytorch.decorators import torch_inference

@@ -12,16 +12,17 @@ def __init__(self, theta: float = 0.8, num_slots: int = 10):
         self.num_slots = num_slots

     @torch_inference
-    def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
+    def __call__(self, candidate_articles: CandidateSet, interest_profile: InterestProfile) -> RecommendationList:
         if candidate_articles.scores is None:
-            return candidate_articles
+            recommended = candidate_articles.articles
+        else:
+            similarity_matrix = compute_similarity_matrix(candidate_articles.embeddings)

-        similarity_matrix = compute_similarity_matrix(candidate_articles.embeddings)
+            scores = torch.as_tensor(candidate_articles.scores).to(similarity_matrix.device)
+            article_indices = mmr_diversification(scores, similarity_matrix, theta=self.theta, topk=self.num_slots)
+            recommended = [candidate_articles.articles[int(idx)] for idx in article_indices]

-        scores = torch.as_tensor(candidate_articles.scores).to(similarity_matrix.device)
-        article_indices = mmr_diversification(scores, similarity_matrix, theta=self.theta, topk=self.num_slots)
-
-        return ArticleSet(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])
+        return RecommendationList(articles=recommended)


 def compute_similarity_matrix(todays_article_vectors):
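
The mmr_diversification helper itself is unchanged and not shown in this hunk. For readers skimming the diff, here is a minimal sketch of the greedy maximal marginal relevance selection it performs, with theta trading off relevance against redundancy; the function name, exact tie-breaking, and tensor handling below are assumptions, not the repository's implementation.

    import torch

    def greedy_mmr(scores: torch.Tensor, similarity: torch.Tensor, theta: float = 0.8, topk: int = 10) -> list[int]:
        # Greedy MMR sketch: at each step pick the candidate that maximizes
        # theta * relevance - (1 - theta) * max similarity to already-selected items.
        selected: list[int] = []
        remaining = set(range(scores.shape[0]))
        while remaining and len(selected) < topk:
            best_idx, best_val = None, float("-inf")
            for i in remaining:
                redundancy = max((float(similarity[i, j]) for j in selected), default=0.0)
                value = theta * float(scores[i]) - (1 - theta) * redundancy
                if value > best_val:
                    best_idx, best_val = i, value
            selected.append(best_idx)
            remaining.remove(best_idx)
        return selected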
42 changes: 22 additions & 20 deletions src/poprox_recommender/components/diversifiers/pfar.py
@@ -3,7 +3,7 @@
 import torch as th
 from lenskit.pipeline import Component

-from poprox_concepts import Article, ArticleSet, InterestProfile
+from poprox_concepts.domain import Article, CandidateSet, InterestProfile, RecommendationList
 from poprox_recommender.pytorch.decorators import torch_inference
 from poprox_recommender.topics import GENERAL_TOPICS, extract_general_topics, normalized_category_count

@@ -15,33 +15,35 @@ def __init__(self, lambda_: float = 1.0, tau: float | None = None, num_slots: int = 10):
         self.num_slots = num_slots

     @torch_inference
-    def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
+    def __call__(self, candidate_articles: CandidateSet, interest_profile: InterestProfile) -> RecommendationList:
         if candidate_articles.scores is None:
-            return candidate_articles
+            articles = candidate_articles.articles
+        else:
+            article_scores = th.sigmoid(th.tensor(candidate_articles.scores)).cpu().detach().numpy()

-        article_scores = th.sigmoid(th.tensor(candidate_articles.scores)).cpu().detach().numpy()
+            topic_preferences: dict[str, int] = {}

-        topic_preferences: dict[str, int] = {}
+            for interest in interest_profile.onboarding_topics:
+                topic_preferences[interest.entity_name] = max(interest.preference - 1, 0)

-        for interest in interest_profile.onboarding_topics:
-            topic_preferences[interest.entity_name] = max(interest.preference - 1, 0)
+            if interest_profile.click_topic_counts:
+                for topic, click_count in interest_profile.click_topic_counts.items():
+                    topic_preferences[topic] = click_count

-        if interest_profile.click_topic_counts:
-            for topic, click_count in interest_profile.click_topic_counts.items():
-                topic_preferences[topic] = click_count
+            normalized_topic_prefs = normalized_category_count(topic_preferences)

-        normalized_topic_prefs = normalized_category_count(topic_preferences)
+            article_indices = pfar_diversification(
+                article_scores,
+                candidate_articles.articles,
+                normalized_topic_prefs,
+                self.lambda_,
+                self.tau,
+                topk=self.num_slots,
+            )

-        article_indices = pfar_diversification(
-            article_scores,
-            candidate_articles.articles,
-            normalized_topic_prefs,
-            self.lambda_,
-            self.tau,
-            topk=self.num_slots,
-        )
+            articles = [candidate_articles.articles[int(idx)] for idx in article_indices]

-        return ArticleSet(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])
+        return RecommendationList(articles=articles)


 def pfar_diversification(relevance_scores, articles, topic_preferences, lamb, tau, topk) -> list[Article]:
@@ -2,7 +2,8 @@

 import torch as th

-from poprox_concepts import ArticleSet, InterestProfile
+from poprox_concepts import CandidateSet, InterestProfile
+from poprox_concepts.domain import RecommendationList
 from poprox_recommender.components.diversifiers.calibration import Calibrator
 from poprox_recommender.topics import extract_general_topics, normalized_category_count

@@ -11,7 +12,7 @@
 # to rerank recommendations according to
 # topic calibration
 class TopicCalibrator(Calibrator):
-    def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
+    def __call__(self, candidate_articles: CandidateSet, interest_profile: InterestProfile) -> RecommendationList:
         normalized_topic_prefs = self.compute_topic_dist(interest_profile)

         if candidate_articles.scores is not None:
@@ -29,7 +30,7 @@ def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
             topk=self.num_slots,
         )

-        return ArticleSet(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])
+        return RecommendationList(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])

     def compute_topic_dist(self, interest_profile):
         topic_preferences: dict[str, int] = defaultdict(int)
12 changes: 6 additions & 6 deletions src/poprox_recommender/components/embedders/article.py
@@ -9,7 +9,7 @@
 from safetensors.torch import load_file
 from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast

-from poprox_concepts import ArticleSet
+from poprox_concepts import CandidateSet
 from poprox_recommender.model import ModelConfig
 from poprox_recommender.model.nrms.news_encoder import NewsEncoder
 from poprox_recommender.paths import model_file_path
@@ -58,7 +58,7 @@ def __init__(self, model_path: PathLike, device: str | None):
         self.embedding_cache = {}

     @torch_inference
-    def __call__(self, article_set: ArticleSet) -> ArticleSet:
+    def __call__(self, article_set: CandidateSet) -> CandidateSet:
         if not article_set.articles:
             article_set.embeddings = th.zeros((0, self.news_encoder.embedding_size))  # type: ignore
             return article_set
@@ -116,21 +116,21 @@ def __call__(self, article_set: ArticleSet) -> ArticleSet:

 class EmbeddingCopier(Component):
     @torch_inference
-    def __call__(self, candidate_set: ArticleSet, selected_set: ArticleSet) -> ArticleSet:
+    def __call__(self, candidate_set: CandidateSet, selected_set: CandidateSet) -> CandidateSet:
         """
         Copies article embeddings from a candidate set to a set of selected/recommended articles

         Parameters
         ----------
-        candidate_set : ArticleSet
+        candidate_set : CandidateSet
             A set of candidate articles with the `.embeddings` property filled in
             (e.g. with ArticleEmbedder)
-        selected_set : ArticleSet
+        selected_set : CandidateSet
             A set of selected or recommended articles chosen from `candidate_set`

         Returns
         -------
-        ArticleSet
+        CandidateSet
             selected_set with `.embeddings` set using the embeddings from `candidate_set`
         """
         candidate_article_ids = [article.article_id for article in candidate_set.articles]
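
The body of EmbeddingCopier is collapsed in this view; based on the docstring above, the copy-by-id idea is roughly the following. This is a sketch under assumptions about how embeddings are indexed, not the component's actual body.

    import torch as th

    def copy_embeddings_sketch(candidate_set, selected_set):
        # Index candidate embeddings by article_id, then gather the rows that
        # correspond to the selected articles, in the selected order.
        row_by_id = {article.article_id: i for i, article in enumerate(candidate_set.articles)}
        rows = [row_by_id[article.article_id] for article in selected_set.articles]
        selected_set.embeddings = candidate_set.embeddings[th.tensor(rows, dtype=th.long)]
        return selected_set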
14 changes: 7 additions & 7 deletions src/poprox_recommender/components/embedders/topic_wise_user.py
@@ -3,7 +3,7 @@

 import torch as th

-from poprox_concepts import Article, ArticleSet, Click, InterestProfile
+from poprox_concepts import Article, CandidateSet, Click, InterestProfile
 from poprox_recommender.components.embedders import NRMSArticleEmbedder, NRMSUserEmbedder
 from poprox_recommender.paths import model_file_path
 from poprox_recommender.pytorch.decorators import torch_inference
@@ -116,7 +116,7 @@ def virtual_clicks(onboarding_topics, topic_articles):

 class UserOnboardingEmbedder(NRMSUserEmbedder):
     article_embedder: NRMSArticleEmbedder
-    embedded_topic_articles: ArticleSet | None = None
+    embedded_topic_articles: CandidateSet | None = None

     def __init__(self, *args, embedding_source: str = "static", topic_embedding: str = "nrms", **kwargs):
         super().__init__(*args, **kwargs)
@@ -128,10 +128,10 @@ def __init__(self, *args, embedding_source: str = "static", topic_embedding: str = "nrms", **kwargs):

     @torch_inference
     def __call__(
-        self, candidate_articles: ArticleSet, clicked_articles: ArticleSet, interest_profile: InterestProfile
+        self, candidate_articles: CandidateSet, clicked_articles: CandidateSet, interest_profile: InterestProfile
     ) -> InterestProfile:
         if self.embedded_topic_articles is None:
-            self.embedded_topic_articles = self.article_embedder(ArticleSet(articles=TOPIC_ARTICLES))
+            self.embedded_topic_articles = self.article_embedder(CandidateSet(articles=TOPIC_ARTICLES))

         topic_embeddings_by_uuid = {
             article.article_id: embedding
@@ -182,15 +182,15 @@ def __call__(

         return interest_profile

-    def build_article_lookup(self, article_set: ArticleSet):
+    def build_article_lookup(self, article_set: CandidateSet):
         embedding_lookup = {}
         for article, article_vector in zip(article_set.articles, article_set.embeddings, strict=True):
             if article.article_id not in embedding_lookup:
                 embedding_lookup[article.article_id] = article_vector

         return embedding_lookup

-    def build_embeddings_from_articles(self, articles: ArticleSet, topic_articles: list[Article]):
+    def build_embeddings_from_articles(self, articles: CandidateSet, topic_articles: list[Article]):
         topic_uuids_by_name = {article.external_id: article.article_id for article in topic_articles}

         topic_embeddings_by_uuid = {}
@@ -220,7 +220,7 @@ def find_topical_articles(self, topic: str, articles: list[Article]) -> list[Article]:
         return topical_articles

     def build_embeddings_from_definitions(self):
-        topic_article_set = self.article_embedder(ArticleSet(articles=TOPIC_ARTICLES))
+        topic_article_set = self.article_embedder(CandidateSet(articles=TOPIC_ARTICLES))

         topic_embeddings_by_uuid = {
             article.article_id: embedding for article, embedding in zip(TOPIC_ARTICLES, topic_article_set.embeddings)
4 changes: 2 additions & 2 deletions src/poprox_recommender/components/embedders/user.py
@@ -4,7 +4,7 @@
 from lenskit.pipeline import Component
 from safetensors.torch import load_file

-from poprox_concepts import ArticleSet, Click, InterestProfile
+from poprox_concepts import CandidateSet, Click, InterestProfile
 from poprox_recommender.model import ModelConfig
 from poprox_recommender.model.nrms.user_encoder import UserEncoder
 from poprox_recommender.pytorch.decorators import torch_inference
@@ -23,7 +23,7 @@ def __init__(self, model_path: PathLike, device: str = "cpu", max_clicks_per_use
         self.user_encoder.to(device)

     @torch_inference
-    def __call__(self, clicked_articles: ArticleSet, interest_profile: InterestProfile) -> InterestProfile:
+    def __call__(self, clicked_articles: CandidateSet, interest_profile: InterestProfile) -> InterestProfile:
         if len(clicked_articles.articles) == 0:
             interest_profile.embedding = None
         else:
6 changes: 3 additions & 3 deletions src/poprox_recommender/components/filters/topic.py
@@ -2,13 +2,13 @@

 from lenskit.pipeline import Component

-from poprox_concepts import ArticleSet, InterestProfile
+from poprox_concepts import CandidateSet, InterestProfile

 logger = logging.getLogger(__name__)


 class TopicFilter(Component):
-    def __call__(self, candidate: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
+    def __call__(self, candidate: CandidateSet, interest_profile: InterestProfile) -> CandidateSet:
         # Preference values from onboarding are 1-indexed, where 1 means "absolutely no interest."
         # We might want to normalize them to 0-indexed somewhere upstream, but in the mean time
         # this is one of the simpler ways to filter out topics people aren't interested in from
@@ -32,4 +32,4 @@ def __call__(self, candidate: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
             len(candidate.articles),
             interest_profile.profile_id,
         )
-        return ArticleSet(articles=topical_articles)
+        return CandidateSet(articles=topical_articles)
8 changes: 4 additions & 4 deletions src/poprox_recommender/components/joiners/concat.py
@@ -1,10 +1,10 @@
 from lenskit.pipeline import Component

-from poprox_concepts import ArticleSet
+from poprox_concepts.domain import RecommendationList


 class Concatenate(Component):
-    def __call__(self, candidates1: ArticleSet, candidates2: ArticleSet) -> ArticleSet:
+    def __call__(self, recs1: RecommendationList, recs2: RecommendationList) -> RecommendationList:
         """
         Concatenates two sets of candidates, while deduplicating them, keeping the
         first occurrence of each article (by id), and maintaining their original order.
@@ -15,7 +15,7 @@ def __call__(self, candidates1: ArticleSet, candidates2: ArticleSet) -> ArticleSet:
         the dict keys can be ignored and the dict values are the deduplicated candidates
         in reverse order. Reversing them one more time returns them to the original order.
         """
-        reverse_articles = reversed(candidates1.articles + candidates2.articles)
+        reverse_articles = reversed(recs1.articles + recs2.articles)
         articles = {article.article_id: article for article in reverse_articles}

-        return ArticleSet(articles=list(reversed(articles.values())))
+        return RecommendationList(articles=list(reversed(articles.values())))
17 changes: 10 additions & 7 deletions src/poprox_recommender/components/joiners/fill.py
@@ -1,16 +1,19 @@
 from lenskit.pipeline import Component
 from lenskit.pipeline.types import Lazy

-from poprox_concepts import ArticleSet
+from poprox_concepts import CandidateSet
+from poprox_concepts.domain import RecommendationList


 class Fill(Component):
     def __init__(self, num_slots: int, deduplicate: bool = True):
         self.num_slots = num_slots
         self.deduplicate = deduplicate

-    def __call__(self, candidates1: ArticleSet, candidates2: Lazy[ArticleSet]) -> ArticleSet:
-        articles = candidates1.articles
+    def __call__(
+        self, recs1: CandidateSet | RecommendationList, recs2: Lazy[CandidateSet | RecommendationList]
+    ) -> RecommendationList:
+        articles = recs1.articles

         if self.deduplicate:
             # Track the articles by their article_id
@@ -19,7 +22,7 @@ def __call__(self, candidates1: ArticleSet, candidates2: Lazy[ArticleSet]) -> ArticleSet:
             # Add articles from candidates2 only if they are not duplicates
             if len(articles) < self.num_slots:
                 new_articles = []
-                for article in candidates2.get().articles:
+                for article in recs2.get().articles:
                     # Check if the article is a duplicate based on article_id
                     if (article.article_id) not in existing_articles:
                         new_articles.append(article)
@@ -30,7 +33,7 @@ def __call__(self, candidates1: ArticleSet, candidates2: Lazy[ArticleSet]) -> ArticleSet:

             articles = articles + new_articles
         else:
-            articles = articles + candidates2.get().articles
+            articles = articles + recs2.get().articles

-        # Return the resulting ArticleSet, limiting the size to num_slots
-        return ArticleSet(articles=articles[: self.num_slots])
+        # Return the resulting RecommendationList, limiting the size to num_slots
+        return RecommendationList(articles=articles[: self.num_slots])
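
Aside from the wider input type, Fill's top-up behavior is unchanged; stripped of the Lazy plumbing, it amounts to something like the sketch below (the function name and plain-list interface are assumptions for illustration).

    def fill_sketch(primary: list, backfill: list, num_slots: int, deduplicate: bool = True) -> list:
        # Start from the primary articles and top up with backfill articles,
        # skipping duplicates by article_id when deduplication is enabled.
        articles = list(primary)
        if deduplicate:
            seen = {a.article_id for a in articles}
            for article in backfill:
                if len(articles) >= num_slots:
                    break
                if article.article_id not in seen:
                    articles.append(article)
                    seen.add(article.article_id)
        else:
            articles = articles + backfill
        return articles[:num_slots]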
8 changes: 4 additions & 4 deletions src/poprox_recommender/components/joiners/interleave.py
@@ -2,15 +2,15 @@

 from lenskit.pipeline import Component

-from poprox_concepts import ArticleSet
+from poprox_concepts.domain import RecommendationList


 class Interleave(Component):
-    def __call__(self, candidates1: ArticleSet, candidates2: ArticleSet) -> ArticleSet:
+    def __call__(self, recs1: RecommendationList, recs2: RecommendationList) -> RecommendationList:
         articles = []
-        for pair in zip_longest(candidates1.articles, candidates2.articles):
+        for pair in zip_longest(recs1.articles, recs2.articles):
             for article in pair:
                 if article is not None:
                     articles.append(article)

-        return ArticleSet(articles=articles)
+        return RecommendationList(articles=articles)
(Diffs for the remaining changed files are not rendered here.)
