Skip to content

Commit

Permalink
Fix/item (#22)
Browse files Browse the repository at this point in the history
  • Loading branch information
yinsn authored Jul 11, 2022
1 parent ee1174d commit 7ed81c3
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 17 deletions.
27 changes: 15 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,12 @@ Get the items' information:
Load the evaluator to analyse the results, say, [Gini coefficient](https://en.wikipedia.org/wiki/Gini_coefficient) metric:
```
>>> metrics = rs.DiversityMetrics()
>>> metrics.gini_coefficient(ratings['movieId'])
>>> metrics.gini_coefficient(ratings['itemId'])
>>> 0.6335616301416965
```
The nested input type (`List[List[str]]`-like) is also supported. This is especially useful for evaluating diversity at the topic scale:
```
>>> metrics.gini_coefficient(movies['genres'])
>>> metrics.gini_coefficient(items['genres'])
>>> 0.5158655846858095
```

Expand All @@ -82,7 +82,7 @@ The nested input type (`List[List[str]]`-like) is also favorable. This is especi
### Draw a Lorenz curve graph for insights
[Lorenz curve](https://en.wikipedia.org/wiki/Lorenz_curve) is a graphical representation of the distribution: the cumulative proportion of species is plotted against the cumulative proportion of individuals. **rsdiv** also supports this feature to help practitioners with their analysis.
```
metrics.get_lorenz_curve(ratings['movieId'])
metrics.get_lorenz_curve(ratings['itemId'])
```
![Lorenz](pics/Lorenz.png)

Expand All @@ -91,21 +91,24 @@ metrics.get_lorenz_curve(ratings['movieId'])
```
>>> rc = rs.FMRecommender(ratings, items, 0.3).fit()
```
30% of interactions are split for test set, the precision at top 5 can be calculated with:
30% of the interactions are held out as the test set; the precision at `top 5` can be calculated with:
```
>>> rc.precision_at_top_k(5)
>>> 0.14464477
```
the prediction scores for a given user on each item can be access with (the results with seen items removed can be calculated by `predict_for_userId_unseen`):
The `top 100` unseen recommended items for an arbitrary user, say `userId: 1024`, can be obtained with:
```
>>> rc.predict_for_userId(42)
>>> array([-3.0786333, -2.8600938, -5.5952744, ..., -5.9792733, -7.8316765, -6.2370725], dtype=float32)
```
the scores of top `5` recommended items for the `userId: 1024` are given by:
```
>>> rc.predict_top_n_unseen(1024, 5)
>>> {1296: 1.7469575, 916: 1.773555, 915: 1.63063, 2067: 1.3016684, 28: 1.2860104}
>>> rc.predict_top_n_item(1024, 100)
```

| | itemId | scores | title | genres | release_date |
|---:|------:|---------:|:-----------|:-----------|---------------:|
| 0 | 916 | 1.77356 | Roman Holiday | [\'Comedy\', \'Romance\'] | 1953 |
| 1 | 1296 | 1.74696 | Room with a View | [\'Drama\', \'Romance\'] | 1986 |
| ... | ... | ... | ... | ... | ... |
| 98 | 3079 | 0.371897 | Mansfield Park | [\'Drama\'] | 1999 |
| 99 | 2570 | 0.369199 | Walk on the Moon | [\'Drama\', \'Romance\'] | 1999 |

### Improve the diversity
TODO.

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

setup(
name="rsdiv",
version="0.1.4",
version="0.1.5",
author="Yin Cheng",
author_email="[email protected]",
long_description=LONG_DESCRIPTION,
Expand Down
3 changes: 2 additions & 1 deletion src/rsdiv/dataset/movielens_100k.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,11 @@ def read_items(self) -> pd.DataFrame:
header=None,
encoding="latin-1",
engine="python",
names=["movieId", "title", "release_date", "video_release_date", "URL"]
names=["itemId", "title", "release_date", "video_release_date", "URL"]
+ genres,
)
df_items["title"] = df_items["title"].str[:-7]
df_items["title"] = df_items["title"].apply(lambda x: x.split(",")[0])
df_items["release_date"] = pd.to_datetime(df_items.release_date)
df_items["genres"] = df_items[genres].dot(df_items[genres].columns + "|")
df_items["genres"] = df_items["genres"].apply(lambda x: x[:-1].split("|"))
Expand Down
3 changes: 2 additions & 1 deletion src/rsdiv/dataset/movielens_1m.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,11 @@ def read_items(self) -> pd.DataFrame:
header=None,
encoding="latin-1",
engine="python",
names=["movieId", "title", "genres"],
names=["itemId", "title", "genres"],
)
df_items["release_date"] = df_items["title"].str[-5:-1].astype("int")
df_items["title"] = df_items["title"].str[:-7]
df_items["title"] = df_items["title"].apply(lambda x: x.split(",")[0])
df_items["genres"] = df_items["genres"].apply(lambda x: x.split("|"))

return df_items
12 changes: 12 additions & 0 deletions src/rsdiv/recommenders/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,22 @@

class BaseRecommender(metaclass=ABCMeta):
df_interaction: pd.DataFrame
items: pd.DataFrame
user_features: Optional[pd.DataFrame]
item_features: Optional[pd.DataFrame]
test_size: Optional[float]

def __init__(
self,
df_interaction: pd.DataFrame,
items: pd.DataFrame,
user_features: Optional[pd.DataFrame] = None,
item_features: Optional[pd.DataFrame] = None,
test_size: Optional[float] = None,
) -> None:
self.n_users, self.n_items = df_interaction.max()[:2]
self.df_interaction = self.get_interaction(df_interaction)
self.items = items
self.user_features = user_features
self.item_features = item_features
self.test_size = test_size
Expand Down Expand Up @@ -90,3 +93,12 @@ def predict_top_n_unseen(self, user_id: int, top_n: int) -> Dict[int, float]:
argpartition = np.argpartition(-prediction, top_n)
result_args = argpartition[:top_n]
return {key + 1: prediction[key] for key in result_args}

def predict_top_n_item(self, user_id: int, top_n: int) -> pd.DataFrame:
prediction = self.predict_top_n_unseen(user_id, top_n)
candidates: pd.DataFrame = pd.DataFrame.from_dict(prediction.items())
candidates.columns = ["itemId", "scores"]
candidates = candidates.sort_values(
by="scores", ascending=False, ignore_index=True
)
return candidates.merge(self.items, how="left", on="itemId")
7 changes: 5 additions & 2 deletions src/rsdiv/recommenders/fm.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@

class FMRecommender(BaseRecommender):
def __init__(
self, interaction: pd.DataFrame, test_size: Optional[float] = None
self,
interaction: pd.DataFrame,
items: pd.DataFrame,
test_size: Optional[float] = None,
) -> None:
super().__init__(interaction, test_size)
super().__init__(interaction, items, test_size)
self.fm = LightFM(
no_components=10,
loss="bpr",
Expand Down

0 comments on commit 7ed81c3

Please sign in to comment.