Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Java Big Data exercise #40

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 39 additions & 12 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,46 @@

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.target>11</maven.compiler.target>
<maven.compiler.source>11</maven.compiler.source>
</properties>

<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>11</source>
<target>11</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
</plugins>
</build>

<dependencies>
<dependency>
<groupId>org.apache.mahout</groupId>
<artifactId>mahout-core</artifactId>
<version>0.9</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.7</version>
<scope>test</scope>
</dependency>
</dependencies>
<dependency>
<groupId>org.apache.mahout</groupId>
<artifactId>mahout-core</artifactId>
<version>0.9</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.7</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>2.0.0-alpha5</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-simple</artifactId>
<version>2.0.0-alpha5</version>
</dependency>
</dependencies>
</project>
2 changes: 1 addition & 1 deletion readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ This repo contains several common big data exercises.

## Setup

1. Install the JDK 7.0
1. Install the JDK 11.0
2. [Download & Install Maven](http://maven.apache.org/download.cgi)


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
package nearsoft.academy.bigdata.recommendation;

import java.io.*;
import java.util.Hashtable;
import java.util.zip.GZIPInputStream;
import java.util.List;
import java.util.ArrayList;

import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender;
import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
import org.apache.mahout.cf.taste.impl.neighborhood.ThresholdUserNeighborhood;
import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
import org.apache.mahout.cf.taste.recommender.RecommendedItem;
import org.apache.mahout.cf.taste.recommender.UserBasedRecommender;
import org.apache.mahout.cf.taste.similarity.UserSimilarity;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;

/**
 * Builds a user-based movie recommender from a gzipped review dump.
 *
 * <p>Construction runs in two steps: the gzipped text file is scanned line by
 * line and rewritten as a {@code userIndex,productIndex,score} CSV, then that
 * CSV is loaded into a Mahout {@link FileDataModel} backing a
 * {@link GenericUserBasedRecommender}. String user and product ids are mapped
 * to sequential integers (starting at 1) while scanning, because the CSV holds
 * the integer indexes rather than the original ids.
 */
public class MovieRecommender {
    /** Intermediate CSV written by the preprocessing step and read by the model. */
    private static final String CSV_PATH = "data/movies.csv";

    /** Maximum number of items requested from the recommender per user. */
    private static final int MAX_RECOMMENDATIONS = 3;

    /** Threshold handed to {@link ThresholdUserNeighborhood}. */
    private static final double NEIGHBORHOOD_THRESHOLD = 0.1;

    // Keys that introduce the three fields extracted from every review record.
    private static final String PRODUCT_KEY = "product/productId:";
    private static final String USER_KEY = "review/userId:";
    private static final String SCORE_KEY = "review/score:";

    String dataPath;
    int totalUsers;
    int totalProducts;
    int totalReviews;

    // Keep track of users and movies in the dataset
    Hashtable<String, Integer> users;
    Hashtable<String, Integer> products2Index;

    // For querying movies
    Hashtable<Integer, String> index2Products;

    DataModel model;
    UserSimilarity similarity;
    UserNeighborhood neighborhood;
    UserBasedRecommender recommender;

    /**
     * Parses the review dump and builds the recommender.
     *
     * @param dataPath path to the gzipped review file
     * @throws IOException if the dump cannot be read, the CSV cannot be written,
     *     or the recommender cannot be built from the CSV (the underlying
     *     {@link TasteException} is attached as the cause)
     */
    MovieRecommender(String dataPath) throws IOException {
        this.dataPath = dataPath;
        this.totalUsers = 0;
        this.totalProducts = 0;
        this.totalReviews = 0;

        this.users = new Hashtable<String, Integer>();
        this.index2Products = new Hashtable<Integer, String>();
        this.products2Index = new Hashtable<String, Integer>();

        // Failures are propagated instead of printed: a half-built instance
        // would report zero counts and fail later with a NullPointerException.
        dataPreprocess();
        try {
            loadDataModel();
        } catch (TasteException e) {
            throw new IOException("Could not build the recommender from " + CSV_PATH, e);
        }
    }

    /** @return number of {@code review/score:} lines seen in the dump */
    public int getTotalReviews() {
        return this.totalReviews;
    }

    /** @return number of distinct user ids seen in the dump */
    public int getTotalUsers() {
        return this.totalUsers;
    }

    /** @return number of distinct product ids seen in the dump */
    public int getTotalProducts() {
        return this.totalProducts;
    }

    /**
     * Returns up to {@value #MAX_RECOMMENDATIONS} recommended product ids for a user.
     *
     * @param user the user id as it appears in the review dump
     * @return recommended product ids; empty when the user is {@code null} or
     *     was not present in the dump
     * @throws TasteException if the underlying recommender fails
     */
    public List<String> getRecommendationsForUser(String user) throws TasteException {
        List<String> recommendations = new ArrayList<String>();

        // Hashtable rejects null keys and returns null for unknown ones; guard
        // both so the lookup result is never unboxed from null.
        Integer userIndex = (user == null) ? null : this.users.get(user);
        if (userIndex == null) {
            return recommendations;
        }

        for (RecommendedItem item : this.recommender.recommend(userIndex, MAX_RECOMMENDATIONS)) {
            // Item ids in the model are the int indexes assigned while preprocessing.
            int productIndex = (int) item.getItemID();
            recommendations.add(this.index2Products.get(productIndex));
        }
        return recommendations;
    }

    /**
     * Loads the preprocessed CSV and wires up similarity, neighborhood and recommender.
     *
     * @throws IOException if the CSV cannot be read
     * @throws TasteException if Mahout cannot build the model components
     */
    private void loadDataModel() throws IOException, TasteException {
        this.model = new FileDataModel(new File(CSV_PATH));
        this.similarity = new PearsonCorrelationSimilarity(this.model);
        this.neighborhood =
                new ThresholdUserNeighborhood(NEIGHBORHOOD_THRESHOLD, this.similarity, this.model);
        this.recommender =
                new GenericUserBasedRecommender(this.model, this.neighborhood, this.similarity);
    }

    /**
     * Streams the gzipped dump, assigns integer indexes to users and products,
     * counts reviews, and writes one CSV row per completed review record.
     *
     * <p>A record is considered complete once a product id, a user id and a
     * score have all been seen; the three fields are then reset for the next
     * record.
     *
     * @throws IOException if the dump cannot be read or the CSV cannot be written
     */
    private void dataPreprocess() throws IOException {
        // try-with-resources closes every stream even when parsing fails midway.
        // The raw stream is declared separately so it is closed even if the
        // GZIP header check in the GZIPInputStream constructor throws.
        try (InputStream raw = new FileInputStream(this.dataPath);
             BufferedReader txtFile = new BufferedReader(new InputStreamReader(
                     new GZIPInputStream(raw),
                     // Fixed single-byte charset so decoding does not depend on the
                     // platform default; the extracted tokens are presumably ASCII.
                     java.nio.charset.StandardCharsets.ISO_8859_1));
             BufferedWriter csvFile = new BufferedWriter(new FileWriter(CSV_PATH))) {

            String productId = "";
            String score = "";
            String userId = "";

            String line;
            while ((line = txtFile.readLine()) != null) {
                if (line.startsWith(PRODUCT_KEY)) {
                    productId = secondToken(line);
                    if (!productId.isEmpty() && !this.products2Index.containsKey(productId)) {
                        this.totalProducts++;
                        this.index2Products.put(this.totalProducts, productId);
                        this.products2Index.put(productId, this.totalProducts);
                    }
                } else if (line.startsWith(USER_KEY)) {
                    userId = secondToken(line);
                    if (!userId.isEmpty() && !this.users.containsKey(userId)) {
                        this.totalUsers++;
                        this.users.put(userId, this.totalUsers);
                    }
                } else if (line.startsWith(SCORE_KEY)) {
                    score = secondToken(line);
                    this.totalReviews++;
                }

                // If we have all the fields for one review, emit it and start over.
                // isEmpty() compares content; the previous != "" compared references.
                if (!userId.isEmpty() && !productId.isEmpty() && !score.isEmpty()) {
                    csvFile.write(
                            this.users.get(userId) + ","
                                    + this.products2Index.get(productId) + ","
                                    + score + "\n");
                    productId = "";
                    score = "";
                    userId = "";
                }
            }
        }
        System.out.println("Everything is ok!");
    }

    /**
     * Returns the second space-separated token of a {@code key: value} line,
     * or the empty string when the line carries no value.
     */
    private static String secondToken(String line) {
        String[] parts = line.split(" ");
        return parts.length > 1 ? parts[1] : "";
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ public class MovieRecommenderTest {
public void testDataInfo() throws IOException, TasteException {
//download movies.txt.gz from
// http://snap.stanford.edu/data/web-Movies.html
MovieRecommender recommender = new MovieRecommender("/path/to/movies.txt.gz");
MovieRecommender recommender = new MovieRecommender("data/movies.txt.gz");
assertEquals(7911684, recommender.getTotalReviews());
assertEquals(253059, recommender.getTotalProducts());
assertEquals(889176, recommender.getTotalUsers());
Expand Down