From fd9d421c849e203271dbf784e8b97418d4949c17 Mon Sep 17 00:00:00 2001 From: ximenasandoval Date: Sat, 23 Oct 2021 21:55:55 -0700 Subject: [PATCH 1/3] :sparkles: Adding recommendator --- pom.xml | 51 ++++-- .../recommendation/MovieRecommender.java | 169 ++++++++++++++++++ 2 files changed, 208 insertions(+), 12 deletions(-) create mode 100644 src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java diff --git a/pom.xml b/pom.xml index 8169ff7..ebb2a48 100644 --- a/pom.xml +++ b/pom.xml @@ -12,19 +12,46 @@ UTF-8 + 11 + 11 + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.1 + + 11 + 11 + UTF-8 + + + + + - - org.apache.mahout - mahout-core - 0.9 - - - junit - junit - 4.7 - test - - + + org.apache.mahout + mahout-core + 0.9 + + + junit + junit + 4.7 + test + + + org.slf4j + slf4j-api + 2.0.0-alpha5 + + + org.slf4j + slf4j-simple + 2.0.0-alpha5 + + diff --git a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java new file mode 100644 index 0000000..e3d2d1d --- /dev/null +++ b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java @@ -0,0 +1,169 @@ +package nearsoft.academy.bigdata.recommendation; + +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender; +import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity; +import org.apache.mahout.cf.taste.impl.neighborhood.ThresholdUserNeighborhood; +import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood; +import org.apache.mahout.cf.taste.recommender.RecommendedItem; +import org.apache.mahout.cf.taste.recommender.UserBasedRecommender; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.model.file.FileDataModel; + +import java.io.*; +import java.util.Hashtable; +import java.util.zip.GZIPInputStream; +import java.util.List; +import java.util.ArrayList; + + + +public class MovieRecommender { + String dataPath; + int totalUsers; + int totalProducts; + int totalReviews; + + Hashtable users; + Hashtable Index2Products; + Hashtable products2Index; + + DataModel model; + UserSimilarity similarity; + UserNeighborhood neighborhood; + UserBasedRecommender recommender; + + + MovieRecommender(String dataPath) throws IOException { + this.dataPath = dataPath; + this.totalUsers = 0; + this.totalProducts = 0; + this.totalReviews = 0; + + this.users = new Hashtable(); + this.Index2Products = new Hashtable(); + this.products2Index = new Hashtable(); + + try { + dataPreprocess(); + loadData(); + } catch (IOException e) { + e.printStackTrace(); + } + + + } + + public int getTotalReviews() { + return this.totalReviews; + } + + public int getTotalUsers() { + return this.totalUsers; + } + + public int getTotalProducts() { + return this.totalProducts; + } + + public List getRecommendationsForUser(String user) throws TasteException { + this.similarity = new PearsonCorrelationSimilarity(this.model); + this.neighborhood = new ThresholdUserNeighborhood(0.1, this.similarity, this.model); + this.recommender = new GenericUserBasedRecommender(this.model, this.neighborhood, this.similarity); + List recommendations = new ArrayList(); + + for (RecommendedItem recommendation : recommender.recommend(users.get(user), 3)) { + //System.out.println(recommendation.getItemID()); + recommendations.add(this.Index2Products.get((int)(recommendation.getItemID()))); + //recommendations.add(this.Index2Products.get((int)(recommendation.getItemID()))); + + } + return recommendations; + } + + /** + * Load .csv file + */ + private void loadData() throws IOException { + this.model = new FileDataModel(new File("data/movies.csv")); + } + + /** + * Extract data from .gz, iterate through .txt and create a .csv + */ + private void dataPreprocess() throws IOException { + // Extract .gz and open .txt file + InputStream file = new FileInputStream(this.dataPath); + InputStream gzStream = new GZIPInputStream(file); + Reader read = new InputStreamReader(gzStream); + + // Read .txt file + BufferedReader txtFile = new BufferedReader(read); + + // Create .csv + BufferedWriter csvFile = new BufferedWriter(new FileWriter("data/movies.csv")); + + String line = txtFile.readLine(); + + String productId = ""; + String score = ""; + String userId = ""; + + while (line != null) { + //System.out.println(line); + + if (line.contains("product/productId")) { + productId = line.split(" ")[1]; + + if (this.products2Index.get(productId) == null) { + this.totalProducts++; + this.Index2Products.put(this.totalProducts, productId); + this.products2Index.put(productId, this.totalProducts); + //System.out.println("Product: " + productId); + } + } else if (line.contains("review/userId:")) { + userId = line.split(" ")[1]; + + if (this.users.get(userId) == null) { + this.totalUsers++; + this.users.put(userId, this.totalUsers); + //System.out.println("User: " + userId); + } + } else if (line.contains("review/score:")) { + score = line.split(" ")[1]; + this.totalReviews++; + //System.out.println("Review: " + score); + + } + + + + if ((userId != "") && (productId != "") && (score != "")) { + csvFile.write( + this.users.get(userId) + "," + + this.products2Index.get(productId) + "," + + score + "\n" + ); + // System.out.println( + // this.users.get(userId) + ", " + + // this.products2Index.get(productId) + ", " + + // productId + ", " + ", " + + // this.totalProducts + ", " + + // score + "\n" + // ); + + productId = ""; + score = ""; + userId = ""; + + } + + line = txtFile.readLine(); + } + txtFile.close(); + csvFile.close(); + System.out.println("Everything is ok!"); + + } +} \ No newline at end of file From 57b68bcb7274a7f2549d3ccd1105e05b34beb6c7 Mon Sep 17 00:00:00 2001 From: ximenasandoval Date: Mon, 25 Oct 2021 13:37:33 -0700 Subject: [PATCH 2/3] :sparkles: Cleaning things --- readme.md | 2 +- .../academy/bigdata/recommendation/MovieRecommenderTest.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/readme.md b/readme.md index ce4dc89..4397f27 100644 --- a/readme.md +++ b/readme.md @@ -9,7 +9,7 @@ This repo contains several common big data exercises. ## Setup -1. Install the JDK 7.0 +1. Install the JDK 11.0 2. [Download & Install Maven](http://maven.apache.org/download.cgi) diff --git a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java index 0d0b1fe..60c6e10 100644 --- a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java +++ b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java @@ -15,7 +15,7 @@ public class MovieRecommenderTest { public void testDataInfo() throws IOException, TasteException { //download movies.txt.gz from // http://snap.stanford.edu/data/web-Movies.html - MovieRecommender recommender = new MovieRecommender("/path/to/movies.txt.gz"); + MovieRecommender recommender = new MovieRecommender("data/movies.txt.gz"); assertEquals(7911684, recommender.getTotalReviews()); assertEquals(253059, recommender.getTotalProducts()); assertEquals(889176, recommender.getTotalUsers()); From 9e4e007b821ca199eaf0054f0fda0a5cc78c97fa Mon Sep 17 00:00:00 2001 From: ximenasandoval Date: Mon, 25 Oct 2021 17:13:02 -0700 Subject: [PATCH 3/3] :memo: Cleaning some more --- .../recommendation/MovieRecommender.java | 56 ++++++++++--------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java index e3d2d1d..8b51868 100644 --- a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java +++ b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java @@ -1,5 +1,11 @@ package nearsoft.academy.bigdata.recommendation; +import java.io.*; +import java.util.Hashtable; +import java.util.zip.GZIPInputStream; +import java.util.List; +import java.util.ArrayList; + import org.apache.mahout.cf.taste.model.DataModel; import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender; import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity; @@ -11,24 +17,19 @@ import org.apache.mahout.cf.taste.common.TasteException; import org.apache.mahout.cf.taste.impl.model.file.FileDataModel; -import java.io.*; -import java.util.Hashtable; -import java.util.zip.GZIPInputStream; -import java.util.List; -import java.util.ArrayList; - - - public class MovieRecommender { String dataPath; int totalUsers; int totalProducts; int totalReviews; + // Keep track of users and movies in the dataset Hashtable users; - Hashtable Index2Products; Hashtable products2Index; + // For querying movies + Hashtable index2Products; + DataModel model; UserSimilarity similarity; UserNeighborhood neighborhood; @@ -42,13 +43,13 @@ public class MovieRecommender { this.totalReviews = 0; this.users = new Hashtable(); - this.Index2Products = new Hashtable(); + this.index2Products = new Hashtable(); this.products2Index = new Hashtable(); try { dataPreprocess(); - loadData(); - } catch (IOException e) { + loadDataModel(); + } catch (Exception e) { e.printStackTrace(); } @@ -67,16 +68,18 @@ public int getTotalProducts() { return this.totalProducts; } - public List getRecommendationsForUser(String user) throws TasteException { - this.similarity = new PearsonCorrelationSimilarity(this.model); - this.neighborhood = new ThresholdUserNeighborhood(0.1, this.similarity, this.model); - this.recommender = new GenericUserBasedRecommender(this.model, this.neighborhood, this.similarity); + /** + * Get the list of the recommendations + */ + public List getRecommendationsForUser(String user) throws TasteException { List recommendations = new ArrayList(); - for (RecommendedItem recommendation : recommender.recommend(users.get(user), 3)) { + for (RecommendedItem recommendation : this.recommender.recommend(users.get(user), 3)) { //System.out.println(recommendation.getItemID()); - recommendations.add(this.Index2Products.get((int)(recommendation.getItemID()))); - //recommendations.add(this.Index2Products.get((int)(recommendation.getItemID()))); + long rec = recommendation.getItemID(); + int rec_index = (int) rec; + String rec_id = this.index2Products.get(rec_index); + recommendations.add(rec_id); } return recommendations; @@ -85,9 +88,13 @@ public List getRecommendationsForUser(String user) throws TasteException /** * Load .csv file */ - private void loadData() throws IOException { + private void loadDataModel() throws IOException, TasteException { this.model = new FileDataModel(new File("data/movies.csv")); + this.similarity = new PearsonCorrelationSimilarity(this.model); + this.neighborhood = new ThresholdUserNeighborhood(0.1, this.similarity, this.model); + this.recommender = new GenericUserBasedRecommender(this.model, this.neighborhood, this.similarity); } + /** * Extract data from .gz, iterate through .txt and create a .csv @@ -104,12 +111,12 @@ private void dataPreprocess() throws IOException { // Create .csv BufferedWriter csvFile = new BufferedWriter(new FileWriter("data/movies.csv")); - String line = txtFile.readLine(); - String productId = ""; String score = ""; String userId = ""; + String line = txtFile.readLine(); + while (line != null) { //System.out.println(line); @@ -118,7 +125,7 @@ private void dataPreprocess() throws IOException { if (this.products2Index.get(productId) == null) { this.totalProducts++; - this.Index2Products.put(this.totalProducts, productId); + this.index2Products.put(this.totalProducts, productId); this.products2Index.put(productId, this.totalProducts); //System.out.println("Product: " + productId); } @@ -138,7 +145,7 @@ private void dataPreprocess() throws IOException { } - + // If we have all the fields for one review if ((userId != "") && (productId != "") && (score != "")) { csvFile.write( this.users.get(userId) + "," + @@ -156,7 +163,6 @@ private void dataPreprocess() throws IOException { productId = ""; score = ""; userId = ""; - } line = txtFile.readLine();