From ad86b4ad1f33d0a5466d56c2a7cc3b5239d98cc6 Mon Sep 17 00:00:00 2001
From: Christoph Berg
Date: Wed, 27 Sep 2023 13:08:09 +0200
Subject: [PATCH] Load data from .csv files

---
Reviewer notes (placed after the three-dash line, so they are not part of
the commit message):

* .gitignore: only the compressed downloads www.omdb.org/data/*.bz2 are
  ignored now, instead of the whole www.omdb.org/ tree.
* 20-import.sql: the temporary staging tables gain a primary key on their
  id / movie_id column, and every \copy reads the plain .csv file instead
  of piping the .csv.bz2 through bzcat.
* NOTE(review): people_links used to be loaded through
  "tail -n +2 | sort -u" with HEADER FALSE because the file contained
  duplicates; it is now loaded unfiltered with HEADER TRUE. Confirm the
  .csv no longer contains duplicate rows.
* Makefile: PG_DUMP is the first path listed by "ls -v" over the two
  install prefixes, PGVERSION is cut out of that path, omdb.dump now
  depends on all_movies.csv, and "clean" removes only omdb.dump and the
  .bz2 files.
* NOTE(review): no rule in this Makefile builds all_movies.csv;
  presumably the .csv files are already present in the checkout - confirm.
* download: after mirroring, bunzip2 --keep --force unpacks every .bz2 in
  www.omdb.org/data/ while keeping the archives.
* NOTE(review): this copy was re-flowed from a whitespace-collapsed
  rendering. Record boundaries and blank context lines were restored so
  that every hunk satisfies its @@ line counts and the diffstat; the
  placement of blank context lines and the leading whitespace of context
  and recipe lines are a best-effort reconstruction - TODO verify against
  the tree before applying. The From: header carries no e-mail address.

 .gitignore    |  2 +-
 20-import.sql | 73 +++++++++++++++++++++++++--------------------------
 Makefile      | 15 +++++++----
 download      |  2 ++
 4 files changed, 49 insertions(+), 43 deletions(-)

diff --git a/.gitignore b/.gitignore
index 9800b04..0e4095e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,2 @@
 omdb.dump
-www.omdb.org/
+www.omdb.org/data/*.bz2
diff --git a/20-import.sql b/20-import.sql
index 4ed8b79..de33c81 100644
--- a/20-import.sql
+++ b/20-import.sql
@@ -1,20 +1,20 @@
 BEGIN;
 
-CREATE TEMP TABLE IF NOT EXISTS all_movies (id bigint, name text, parent_id bigint, date date);
-CREATE TEMP TABLE IF NOT EXISTS all_series (id bigint, name text, parent_id bigint, date date);
-CREATE TEMP TABLE IF NOT EXISTS all_seasons (id bigint, name text, parent_id bigint, date date);
-CREATE TEMP TABLE IF NOT EXISTS all_episodes (id bigint, name text, parent_id bigint, date date, series_id bigint);
-CREATE TEMP TABLE IF NOT EXISTS all_movieseries (id bigint, name text, parent_id bigint, date date);
-CREATE TEMP TABLE IF NOT EXISTS movie_details (movie_id bigint, runtime int, budget numeric, revenue numeric, homepage text);
-CREATE TEMP TABLE IF NOT EXISTS votes (movie_id bigint, vote_average numeric, votes_count bigint);
+CREATE TEMP TABLE IF NOT EXISTS all_movies (id bigint primary key, name text, parent_id bigint, date date);
+CREATE TEMP TABLE IF NOT EXISTS all_series (id bigint primary key, name text, parent_id bigint, date date);
+CREATE TEMP TABLE IF NOT EXISTS all_seasons (id bigint primary key, name text, parent_id bigint, date date);
+CREATE TEMP TABLE IF NOT EXISTS all_episodes (id bigint primary key, name text, parent_id bigint, date date, series_id bigint);
+CREATE TEMP TABLE IF NOT EXISTS all_movieseries (id bigint primary key, name text, parent_id bigint, date date);
+CREATE TEMP TABLE IF NOT EXISTS movie_details (movie_id bigint primary key, runtime int, budget numeric, revenue numeric, homepage text);
+CREATE TEMP TABLE IF NOT EXISTS votes (movie_id bigint primary key, vote_average numeric, votes_count bigint);
 
-\copy all_movies FROM PROGRAM 'bzcat www.omdb.org/data/all_movies.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy all_series FROM PROGRAM 'bzcat www.omdb.org/data/all_series.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy all_seasons FROM PROGRAM 'bzcat www.omdb.org/data/all_seasons.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy all_episodes FROM PROGRAM 'bzcat www.omdb.org/data/all_episodes.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy all_movieseries FROM PROGRAM 'bzcat www.omdb.org/data/all_movieseries.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy movie_details FROM PROGRAM 'bzcat www.omdb.org/data/movie_details.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy votes FROM PROGRAM 'bzcat www.omdb.org/data/all_votes.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy all_movies FROM 'www.omdb.org/data/all_movies.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy all_series FROM 'www.omdb.org/data/all_series.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy all_seasons FROM 'www.omdb.org/data/all_seasons.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy all_episodes FROM 'www.omdb.org/data/all_episodes.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy all_movieseries FROM 'www.omdb.org/data/all_movieseries.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy movie_details FROM 'www.omdb.org/data/movie_details.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy votes FROM 'www.omdb.org/data/all_votes.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
 
 WITH import_movies AS (
 	SELECT id, name, parent_id, date, NULL::bigint AS series_id, 'movie'::kind AS kind FROM all_movies
@@ -35,29 +35,28 @@ FROM import_movies m
 LEFT JOIN movie_details d ON m.id = d.movie_id
 LEFT JOIN votes v ON m.id = v.movie_id);
 
-\copy people FROM PROGRAM 'bzcat www.omdb.org/data/all_people.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy people_aliases FROM PROGRAM 'bzcat www.omdb.org/data/all_people_aliases.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
--- people_links contains duplicates (2016-11-07)
-\copy people_links FROM PROGRAM 'bzcat www.omdb.org/data/people_links.csv.bz2 | tail -n +2 | sort -u' WITH (FORMAT CSV, HEADER FALSE, NULL '\N', ESCAPE '\')
-\copy casts FROM PROGRAM 'bzcat www.omdb.org/data/all_casts.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy job_names FROM PROGRAM 'bzcat www.omdb.org/data/job_names.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy people FROM 'www.omdb.org/data/all_people.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy people_aliases FROM 'www.omdb.org/data/all_people_aliases.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy people_links FROM 'www.omdb.org/data/people_links.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy casts FROM 'www.omdb.org/data/all_casts.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy job_names FROM 'www.omdb.org/data/job_names.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
 INSERT INTO jobs SELECT job_id, name FROM job_names WHERE language = 'en';
---\copy characters FROM PROGRAM 'bzcat www.omdb.org/data/all_characters.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy movie_categories FROM PROGRAM 'bzcat www.omdb.org/data/movie_categories.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy movie_keywords FROM PROGRAM 'bzcat www.omdb.org/data/movie_keywords.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy category_names FROM PROGRAM 'bzcat www.omdb.org/data/category_names.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy categories (id, parent_id, root_id) FROM PROGRAM 'bzcat www.omdb.org/data/all_categories.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy trailers FROM PROGRAM 'bzcat www.omdb.org/data/trailers.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy movie_links FROM PROGRAM 'bzcat www.omdb.org/data/movie_links.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy image_ids FROM PROGRAM 'bzcat www.omdb.org/data/image_ids.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy image_licenses FROM PROGRAM 'bzcat www.omdb.org/data/image_licenses.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy movie_aliases_iso FROM PROGRAM 'bzcat www.omdb.org/data/all_movie_aliases_iso.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy movie_languages FROM PROGRAM 'bzcat www.omdb.org/data/movie_languages.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy movie_countries FROM PROGRAM 'bzcat www.omdb.org/data/movie_countries.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy movie_references FROM PROGRAM 'bzcat www.omdb.org/data/movie_references.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy movie_abstracts_de FROM PROGRAM 'bzcat www.omdb.org/data/movie_abstracts_de.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy movie_abstracts_en FROM PROGRAM 'bzcat www.omdb.org/data/movie_abstracts_en.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy movie_abstracts_fr FROM PROGRAM 'bzcat www.omdb.org/data/movie_abstracts_fr.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-\copy movie_abstracts_es FROM PROGRAM 'bzcat www.omdb.org/data/movie_abstracts_es.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+--\copy characters FROM 'www.omdb.org/data/all_characters.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy movie_categories FROM 'www.omdb.org/data/movie_categories.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy movie_keywords FROM 'www.omdb.org/data/movie_keywords.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy category_names FROM 'www.omdb.org/data/category_names.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy categories (id, parent_id, root_id) FROM 'www.omdb.org/data/all_categories.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy trailers FROM 'www.omdb.org/data/trailers.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy movie_links FROM 'www.omdb.org/data/movie_links.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy image_ids FROM 'www.omdb.org/data/image_ids.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy image_licenses FROM 'www.omdb.org/data/image_licenses.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy movie_aliases_iso FROM 'www.omdb.org/data/all_movie_aliases_iso.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy movie_languages FROM 'www.omdb.org/data/movie_languages.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy movie_countries FROM 'www.omdb.org/data/movie_countries.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy movie_references FROM 'www.omdb.org/data/movie_references.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy movie_abstracts_de FROM 'www.omdb.org/data/movie_abstracts_de.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy movie_abstracts_en FROM 'www.omdb.org/data/movie_abstracts_en.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy movie_abstracts_fr FROM 'www.omdb.org/data/movie_abstracts_fr.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
+\copy movie_abstracts_es FROM 'www.omdb.org/data/movie_abstracts_es.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
 
 COMMIT;
diff --git a/Makefile b/Makefile
index b0d5347..5f40b7b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,14 +1,19 @@
-# using PG version with archive format compatible with older releases
-PGVERSION = 11
+# use oldest pg_dump available (>= 10) so the dump format is most compatible
+PG_DUMP = $(firstword $(shell ls -v /usr/lib/postgresql/[123]*/bin/pg_dump /usr/pgsql-[123]*/bin/pg_dump 2> /dev/null))
+PGVERSION0 = $(patsubst /usr/lib/postgresql/%/bin/pg_dump,%,$(PG_DUMP))
+PGVERSION = $(patsubst /usr/pgsql-%/bin/pg_dump,%,$(PGVERSION0))
 PGUSER = postgres
 
 dump: omdb.dump
 
-omdb.dump: www.omdb.org/data/all_movies.csv.bz2
-	pg_virtualenv -i '--auth=trust --username=$(PGUSER)' -v $(PGVERSION) sh -c "export PGUSER=$(PGUSER) && ./import && PATH=/usr/lib/postgresql/$(PGVERSION)/bin:/usr/pgsql-$(PGVERSION)/bin:$(PATH) pg_dump -Fc -f $@ omdb"
+omdb.dump: www.omdb.org/data/all_movies.csv
+	pg_virtualenv -i '--auth=trust --username=$(PGUSER)' -v $(PGVERSION) \
+		sh -c "export PGUSER=$(PGUSER) && \
+			./import && \
+			$(PG_DUMP) -Fc -f $@ omdb"
 
 www.omdb.org/data/all_movies.csv.bz2:
 	./download
 
 clean:
-	rm -rf omdb.dump www.omdb.org
+	rm -f omdb.dump www.omdb.org/data/*.bz2
diff --git a/download b/download
index 20342f8..c0a6820 100755
--- a/download
+++ b/download
@@ -53,3 +53,5 @@ movie_content_updates
 for f in $FILES; do
 	wget --no-verbose --mirror "$BASE$f.csv.bz2"
 done
+
+bunzip2 --keep --force www.omdb.org/data/*.bz2