# Find the top 3 movies based on average rating

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

# Initialize the Spark session (needed before createDataFrame is called below)
spark = SparkSession.builder.appName("TopMovies").getOrCreate()

# Sample DataFrames
data_movies = [(1, "Movie A"), (2, "Movie B"), (3, "Movie C"), (4, "Movie D"), (5, "Movie E")]

data_ratings = [(1, 101, 4.5), (1, 102, 4.0), (2, 103, 5.0),
                (2, 104, 3.5), (3, 105, 4.0), (3, 106, 4.0),
                (4, 107, 3.0), (5, 108, 2.5), (5, 109, 3.0)]

columns_movies = ["MovieID", "MovieName"]
columns_ratings = ["MovieID", "UserID", "Rating"]

# Creating DataFrames
df_movies = spark.createDataFrame(data_movies, columns_movies)
df_ratings = spark.createDataFrame(data_ratings, columns_ratings)

# Calculating the average rating per movie
avg_ratings = df_ratings.groupBy('MovieID').agg(avg('Rating').alias('AvgRating'))

# Joining with df_movies to get movie names, then keeping the top 3 by average rating
top_movies = avg_ratings.join(df_movies, 'MovieID').orderBy('AvgRating', ascending=False).limit(3)
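# Note: with this sample data, Movie A and Movie B tie at an average of 4.25, and
# Spark does not guarantee a stable order between tied rows. A minimal sketch of a
# deterministic variant, assuming MovieName is an acceptable tie-breaker:
#
# top_movies = (avg_ratings.join(df_movies, 'MovieID')
#               .orderBy(['AvgRating', 'MovieName'], ascending=[False, True])
#               .limit(3))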

# Showing the top 3 movies
top_movies.show()
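# With the sample data above, the expected top 3 are Movie A and Movie B
# (both averaging 4.25) followed by Movie C (4.0); the relative order of the
# tied pair may vary between runs.

# Stop the session when done (optional in notebooks, good practice in standalone scripts)
spark.stop()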