Skip to content

Commit

Permalink
Load data from .csv files
Browse files Browse the repository at this point in the history
  • Loading branch information
df7cb committed Sep 27, 2023
1 parent 60e417e commit ad86b4a
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 43 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
omdb.dump
www.omdb.org/
www.omdb.org/data/*.bz2
73 changes: 36 additions & 37 deletions 20-import.sql
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
-- Everything from here to the matching COMMIT runs as a single transaction.
BEGIN;

-- Session-local staging tables that receive the raw movie/series CSV exports.
-- NOTE(review): every staging table is declared twice in this listing, first
-- without and then with a primary key. Because of IF NOT EXISTS, the second
-- declaration of an already-created table is skipped, so if both sets execute
-- the primary keys are never applied. This looks like the before/after sides
-- of a diff shown together -- confirm which set is meant to remain.
CREATE TEMP TABLE IF NOT EXISTS all_movies (id bigint, name text, parent_id bigint, date date);
CREATE TEMP TABLE IF NOT EXISTS all_series (id bigint, name text, parent_id bigint, date date);
CREATE TEMP TABLE IF NOT EXISTS all_seasons (id bigint, name text, parent_id bigint, date date);
CREATE TEMP TABLE IF NOT EXISTS all_episodes (id bigint, name text, parent_id bigint, date date, series_id bigint);
CREATE TEMP TABLE IF NOT EXISTS all_movieseries (id bigint, name text, parent_id bigint, date date);
CREATE TEMP TABLE IF NOT EXISTS movie_details (movie_id bigint, runtime int, budget numeric, revenue numeric, homepage text);
CREATE TEMP TABLE IF NOT EXISTS votes (movie_id bigint, vote_average numeric, votes_count bigint);
-- Same staging tables, with the id / movie_id column declared as primary key.
CREATE TEMP TABLE IF NOT EXISTS all_movies (id bigint primary key, name text, parent_id bigint, date date);
CREATE TEMP TABLE IF NOT EXISTS all_series (id bigint primary key, name text, parent_id bigint, date date);
CREATE TEMP TABLE IF NOT EXISTS all_seasons (id bigint primary key, name text, parent_id bigint, date date);
CREATE TEMP TABLE IF NOT EXISTS all_episodes (id bigint primary key, name text, parent_id bigint, date date, series_id bigint);
CREATE TEMP TABLE IF NOT EXISTS all_movieseries (id bigint primary key, name text, parent_id bigint, date date);
CREATE TEMP TABLE IF NOT EXISTS movie_details (movie_id bigint primary key, runtime int, budget numeric, revenue numeric, homepage text);
CREATE TEMP TABLE IF NOT EXISTS votes (movie_id bigint primary key, vote_average numeric, votes_count bigint);

-- Fill the staging tables client-side with psql's \copy. Options used on every
-- load: CSV format, first line treated as a header, the literal \N read as
-- NULL, and backslash as the CSV escape character.
-- Variant that streams the compressed exports through `bzcat`:
\copy all_movies FROM PROGRAM 'bzcat www.omdb.org/data/all_movies.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy all_series FROM PROGRAM 'bzcat www.omdb.org/data/all_series.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy all_seasons FROM PROGRAM 'bzcat www.omdb.org/data/all_seasons.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy all_episodes FROM PROGRAM 'bzcat www.omdb.org/data/all_episodes.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy all_movieseries FROM PROGRAM 'bzcat www.omdb.org/data/all_movieseries.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_details FROM PROGRAM 'bzcat www.omdb.org/data/movie_details.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy votes FROM PROGRAM 'bzcat www.omdb.org/data/all_votes.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-- Variant that reads the already-decompressed .csv files directly.
-- NOTE(review): executing both variants would attempt to load every file
-- twice -- confirm that only one of the two is kept.
\copy all_movies FROM 'www.omdb.org/data/all_movies.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy all_series FROM 'www.omdb.org/data/all_series.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy all_seasons FROM 'www.omdb.org/data/all_seasons.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy all_episodes FROM 'www.omdb.org/data/all_episodes.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy all_movieseries FROM 'www.omdb.org/data/all_movieseries.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_details FROM 'www.omdb.org/data/movie_details.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy votes FROM 'www.omdb.org/data/all_votes.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
WITH import_movies AS (
SELECT id, name, parent_id, date, NULL::bigint AS series_id, 'movie'::kind AS kind FROM all_movies
Expand All @@ -35,29 +35,28 @@ FROM import_movies m
LEFT JOIN movie_details d ON m.id = d.movie_id
LEFT JOIN votes v ON m.id = v.movie_id);
-- Bulk-load the remaining tables with psql's \copy, using the same CSV options
-- throughout (header line, \N as NULL, backslash escape).
-- NOTE(review): each table appears twice below, once fed by `bzcat` from the
-- .csv.bz2 export and once read from the plain .csv file. This looks like the
-- before/after sides of a diff shown together -- confirm which variant is
-- current, since running both would attempt every load twice.
\copy people FROM PROGRAM 'bzcat www.omdb.org/data/all_people.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy people_aliases FROM PROGRAM 'bzcat www.omdb.org/data/all_people_aliases.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-- people_links contains duplicates (2016-11-07)
-- `tail -n +2` drops the header line (hence HEADER FALSE) and `sort -u`
-- removes the duplicate rows before they reach the table.
\copy people_links FROM PROGRAM 'bzcat www.omdb.org/data/people_links.csv.bz2 | tail -n +2 | sort -u' WITH (FORMAT CSV, HEADER FALSE, NULL '\N', ESCAPE '\')
\copy casts FROM PROGRAM 'bzcat www.omdb.org/data/all_casts.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy job_names FROM PROGRAM 'bzcat www.omdb.org/data/job_names.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-- Plain-.csv variant of the five loads above.
-- NOTE(review): the people_links load here has no de-duplication step, unlike
-- the bzcat variant -- verify the .csv no longer contains duplicate rows.
\copy people FROM 'www.omdb.org/data/all_people.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy people_aliases FROM 'www.omdb.org/data/all_people_aliases.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy people_links FROM 'www.omdb.org/data/people_links.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy casts FROM 'www.omdb.org/data/all_casts.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy job_names FROM 'www.omdb.org/data/job_names.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-- Populate jobs from the English rows of job_names only.
INSERT INTO jobs SELECT job_id, name FROM job_names WHERE language = 'en';
--\copy characters FROM PROGRAM 'bzcat www.omdb.org/data/all_characters.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_categories FROM PROGRAM 'bzcat www.omdb.org/data/movie_categories.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_keywords FROM PROGRAM 'bzcat www.omdb.org/data/movie_keywords.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy category_names FROM PROGRAM 'bzcat www.omdb.org/data/category_names.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-- Only id, parent_id and root_id are supplied for categories; any other
-- column of that table is left to its default.
\copy categories (id, parent_id, root_id) FROM PROGRAM 'bzcat www.omdb.org/data/all_categories.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy trailers FROM PROGRAM 'bzcat www.omdb.org/data/trailers.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_links FROM PROGRAM 'bzcat www.omdb.org/data/movie_links.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy image_ids FROM PROGRAM 'bzcat www.omdb.org/data/image_ids.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy image_licenses FROM PROGRAM 'bzcat www.omdb.org/data/image_licenses.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_aliases_iso FROM PROGRAM 'bzcat www.omdb.org/data/all_movie_aliases_iso.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_languages FROM PROGRAM 'bzcat www.omdb.org/data/movie_languages.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_countries FROM PROGRAM 'bzcat www.omdb.org/data/movie_countries.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_references FROM PROGRAM 'bzcat www.omdb.org/data/movie_references.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_abstracts_de FROM PROGRAM 'bzcat www.omdb.org/data/movie_abstracts_de.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_abstracts_en FROM PROGRAM 'bzcat www.omdb.org/data/movie_abstracts_en.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_abstracts_fr FROM PROGRAM 'bzcat www.omdb.org/data/movie_abstracts_fr.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_abstracts_es FROM PROGRAM 'bzcat www.omdb.org/data/movie_abstracts_es.csv.bz2' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
--\copy characters FROM 'www.omdb.org/data/all_characters.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-- Plain-.csv variant of the category, keyword, link, image and abstract loads.
\copy movie_categories FROM 'www.omdb.org/data/movie_categories.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_keywords FROM 'www.omdb.org/data/movie_keywords.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy category_names FROM 'www.omdb.org/data/category_names.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy categories (id, parent_id, root_id) FROM 'www.omdb.org/data/all_categories.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy trailers FROM 'www.omdb.org/data/trailers.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_links FROM 'www.omdb.org/data/movie_links.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy image_ids FROM 'www.omdb.org/data/image_ids.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy image_licenses FROM 'www.omdb.org/data/image_licenses.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_aliases_iso FROM 'www.omdb.org/data/all_movie_aliases_iso.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_languages FROM 'www.omdb.org/data/movie_languages.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_countries FROM 'www.omdb.org/data/movie_countries.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_references FROM 'www.omdb.org/data/movie_references.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_abstracts_de FROM 'www.omdb.org/data/movie_abstracts_de.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_abstracts_en FROM 'www.omdb.org/data/movie_abstracts_en.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_abstracts_fr FROM 'www.omdb.org/data/movie_abstracts_fr.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
\copy movie_abstracts_es FROM 'www.omdb.org/data/movie_abstracts_es.csv' WITH (FORMAT CSV, HEADER TRUE, NULL '\N', ESCAPE '\')
-- Close the transaction opened by BEGIN at the top of the script.
COMMIT;
15 changes: 10 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
# Builds omdb.dump: import the OMDB data into a throw-away PostgreSQL instance
# and write a custom-format (-Fc) pg_dump archive of the "omdb" database.
#
# NOTE(review): this listing shows two PGVERSION assignments, two omdb.dump
# rules and two clean recipe lines. They look like the before/after sides of a
# diff shown together -- confirm which ones belong to the current Makefile.

# using PG version with archive format compatible with older releases
PGVERSION = 11
# use oldest pg_dump available (>= 10) so the dump format is most compatible
# `ls -v` lists the candidate binaries in version order and $(firstword ...)
# keeps the first one; errors for non-matching globs are discarded.
# NOTE(review): with the recursive `=` the $(shell ...) call is re-run on every
# expansion of PG_DUMP; `:=` would evaluate it once.
PG_DUMP = $(firstword $(shell ls -v /usr/lib/postgresql/[123]*/bin/pg_dump /usr/pgsql-[123]*/bin/pg_dump 2> /dev/null))
# Strip the Debian-style path (/usr/lib/postgresql/<ver>/bin/pg_dump) down to <ver>.
PGVERSION0 = $(patsubst /usr/lib/postgresql/%/bin/pg_dump,%,$(PG_DUMP))
# Do the same for the /usr/pgsql-<ver>/bin/pg_dump layout; whichever pattern
# matched leaves just the version. This assignment replaces PGVERSION = 11 above.
PGVERSION = $(patsubst /usr/pgsql-%/bin/pg_dump,%,$(PGVERSION0))
# Database role used for the temporary cluster and exported to the import step.
PGUSER = postgres

# Convenience alias for building the archive.
# NOTE(review): neither dump nor clean is declared .PHONY in the visible lines.
dump: omdb.dump

# Variant that depends on the downloaded .csv.bz2 export and selects pg_dump by
# prepending the version-specific bin directories to PATH.
omdb.dump: www.omdb.org/data/all_movies.csv.bz2
pg_virtualenv -i '--auth=trust --username=$(PGUSER)' -v $(PGVERSION) sh -c "export PGUSER=$(PGUSER) && ./import && PATH=/usr/lib/postgresql/$(PGVERSION)/bin:/usr/pgsql-$(PGVERSION)/bin:$(PATH) pg_dump -Fc -f $@ omdb"
# Variant that depends on the decompressed .csv and calls the detected
# $(PG_DUMP) binary directly.
# NOTE(review): no visible rule produces www.omdb.org/data/all_movies.csv, so
# this prerequisite must already exist on disk -- verify.
omdb.dump: www.omdb.org/data/all_movies.csv
pg_virtualenv -i '--auth=trust --username=$(PGUSER)' -v $(PGVERSION) \
sh -c "export PGUSER=$(PGUSER) && \
./import && \
$(PG_DUMP) -Fc -f $@ omdb"

# Fetch the exports; only all_movies.csv.bz2 is tracked as the rule's target.
www.omdb.org/data/all_movies.csv.bz2:
./download

# Remove build products. The first recipe line deletes the whole mirror
# directory; the second deletes only the dump and the compressed downloads.
clean:
rm -rf omdb.dump www.omdb.org
rm -f omdb.dump www.omdb.org/data/*.bz2
2 changes: 2 additions & 0 deletions download
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,5 @@ movie_content_updates
# Fetch "$BASE<name>.csv.bz2" for every name in $FILES (both are set earlier in
# this script, outside the visible lines). --mirror enables wget's mirroring
# options, --no-verbose trims the progress output.
for f in $FILES; do
wget --no-verbose --mirror "$BASE$f.csv.bz2"
done

# Decompress every downloaded archive next to its .bz2 file: --keep leaves the
# compressed originals in place, --force overwrites existing output files.
bunzip2 --keep --force www.omdb.org/data/*.bz2

0 comments on commit ad86b4a

Please sign in to comment.