Fix/item (#22)

smartnews · Jul 11, 2022 · 7ed81c3 · 7ed81c3
1 parent ee1174d
commit 7ed81c3
Show file tree

Hide file tree

Showing 6 changed files with 37 additions and 17 deletions.
diff --git a/README.md b/README.md
@@ -68,12 +68,12 @@ Get the items' information:
 Load the evaluator to analyse the results, say, [Gini coefficient](https://en.wikipedia.org/wiki/Gini_coefficient) metric:
 ```
 >>> metrics = rs.DiversityMetrics()
->>> metrics.gini_coefficient(ratings['movieId'])
+>>> metrics.gini_coefficient(ratings['itemId'])
 >>> 0.6335616301416965
 ```
 The nested input type (`List[List[str]]`-like) is also favorable. This is especially useful for evaluating the diversity at topic scale:
 ```
->>> metrics.gini_coefficient(movies['genres'])
+>>> metrics.gini_coefficient(items['genres'])
 >>> 0.5158655846858095
 ```
 
@@ -82,7 +82,7 @@ The nested input type (`List[List[str]]`-like) is also favorable. This is especi
 ### Draw a Lorenz curve graph for insights
 [Lorenz curve](https://en.wikipedia.org/wiki/Lorenz_curve) is a graphical representation of the distribution: the cumulative proportion of species is plotted against the cumulative proportion of individuals. This feature is also supported by **rsdiv** to help practitioners' analysis.
 ```
-metrics.get_lorenz_curve(ratings['movieId'])
+metrics.get_lorenz_curve(ratings['itemId'])
 ```
 ![Lorenz](pics/Lorenz.png)
 
@@ -91,21 +91,24 @@ metrics.get_lorenz_curve(ratings['movieId'])
 ```
 >>> rc = rs.FMRecommender(ratings, items, 0.3).fit()
 ```
-30% of interactions are split for test set, the precision at top 5 can be calculated with:
+30% of the interactions are held out as the test set; the precision at `top 5` can be calculated with:
 ```
 >>> rc.precision_at_top_k(5)
 >>> 0.14464477
 ```
-the prediction scores for a given user on each item can be access with (the results with seen items removed can be calculated by `predict_for_userId_unseen`):
+The `top 100` unseen recommended items for an arbitrary user, say `userId: 1024`, can be obtained with:
 ```
->>> rc.predict_for_userId(42)
->>> array([-3.0786333, -2.8600938, -5.5952744, ..., -5.9792733, -7.8316765, -6.2370725], dtype=float32)
-```
-the scores of top `5` recommended items for the `userId: 1024` are given by:
-```
->>> rc.predict_top_n_unseen(1024, 5)
->>> {1296: 1.7469575, 916: 1.773555, 915: 1.63063, 2067: 1.3016684, 28: 1.2860104}
+>>> rc.predict_top_n_item(1024, 100)
 ```
+
+|    |   itemId |   scores | title                                   | genres                                          |   release_date |
+|---:|------:|---------:|:-----------|:-----------|---------------:|
+|  0 |      916 | 1.77356  | Roman Holiday                           | [\'Comedy\', \'Romance\']                           |           1953 |
+|  1 |     1296 | 1.74696  | Room with a View                        | [\'Drama\', \'Romance\']                            |           1986 |
+|  ... |     ... | ...  | ...       | ...                |       ... |
+|  98 |     3079 | 0.371897  | Mansfield Park                        | [\'Drama\']                            |           1999 |
+|  99 |     2570 | 0.369199  | Walk on the Moon                      | [\'Drama\', \'Romance\']                            |           1999 |
+
 ### Improve the diversity
 TODO.
 

diff --git a/setup.py b/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name="rsdiv",
-    version="0.1.4",
+    version="0.1.5",
     author="Yin Cheng",
     author_email="[email protected]",
     long_description=LONG_DESCRIPTION,

diff --git a/src/rsdiv/dataset/movielens_100k.py b/src/rsdiv/dataset/movielens_100k.py
@@ -47,10 +47,11 @@ def read_items(self) -> pd.DataFrame:
             header=None,
             encoding="latin-1",
             engine="python",
-            names=["movieId", "title", "release_date", "video_release_date", "URL"]
+            names=["itemId", "title", "release_date", "video_release_date", "URL"]
             + genres,
         )
         df_items["title"] = df_items["title"].str[:-7]
+        df_items["title"] = df_items["title"].apply(lambda x: x.split(",")[0])
         df_items["release_date"] = pd.to_datetime(df_items.release_date)
         df_items["genres"] = df_items[genres].dot(df_items[genres].columns + "|")
         df_items["genres"] = df_items["genres"].apply(lambda x: x[:-1].split("|"))

diff --git a/src/rsdiv/dataset/movielens_1m.py b/src/rsdiv/dataset/movielens_1m.py
@@ -39,10 +39,11 @@ def read_items(self) -> pd.DataFrame:
             header=None,
             encoding="latin-1",
             engine="python",
-            names=["movieId", "title", "genres"],
+            names=["itemId", "title", "genres"],
         )
         df_items["release_date"] = df_items["title"].str[-5:-1].astype("int")
         df_items["title"] = df_items["title"].str[:-7]
+        df_items["title"] = df_items["title"].apply(lambda x: x.split(",")[0])
         df_items["genres"] = df_items["genres"].apply(lambda x: x.split("|"))
 
         return df_items
diff --git a/src/rsdiv/recommenders/base.py b/src/rsdiv/recommenders/base.py
@@ -11,19 +11,22 @@
 
 class BaseRecommender(metaclass=ABCMeta):
     df_interaction: pd.DataFrame
+    items: pd.DataFrame
     user_features: Optional[pd.DataFrame]
     item_features: Optional[pd.DataFrame]
     test_size: Optional[float]
 
     def __init__(
         self,
         df_interaction: pd.DataFrame,
+        items: pd.DataFrame,
         user_features: Optional[pd.DataFrame] = None,
         item_features: Optional[pd.DataFrame] = None,
         test_size: Optional[float] = None,
     ) -> None:
         self.n_users, self.n_items = df_interaction.max()[:2]
         self.df_interaction = self.get_interaction(df_interaction)
+        self.items = items
         self.user_features = user_features
         self.item_features = item_features
         self.test_size = test_size
@@ -90,3 +93,12 @@ def predict_top_n_unseen(self, user_id: int, top_n: int) -> Dict[int, float]:
         argpartition = np.argpartition(-prediction, top_n)
         result_args = argpartition[:top_n]
         return {key + 1: prediction[key] for key in result_args}
+
+    def predict_top_n_item(self, user_id: int, top_n: int) -> pd.DataFrame:
+        prediction = self.predict_top_n_unseen(user_id, top_n)
+        candidates: pd.DataFrame = pd.DataFrame.from_dict(prediction.items())
+        candidates.columns = ["itemId", "scores"]
+        candidates = candidates.sort_values(
+            by="scores", ascending=False, ignore_index=True
+        )
+        return candidates.merge(self.items, how="left", on="itemId")
diff --git a/src/rsdiv/recommenders/fm.py b/src/rsdiv/recommenders/fm.py
@@ -11,9 +11,12 @@
 
 class FMRecommender(BaseRecommender):
     def __init__(
-        self, interaction: pd.DataFrame, test_size: Optional[float] = None
+        self,
+        interaction: pd.DataFrame,
+        items: pd.DataFrame,
+        test_size: Optional[float] = None,
     ) -> None:
-        super().__init__(interaction, test_size)
+        super().__init__(interaction, items, test_size)
         self.fm = LightFM(
             no_components=10,
             loss="bpr",