From 1ff3adaf55a5f9763f596e7f844a5646efd9547a Mon Sep 17 00:00:00 2001 From: Zachary Deziel Date: Wed, 4 Sep 2024 08:42:06 -0700 Subject: [PATCH] Updated based on review Add remote database url. Modify attribute naming convention. Remove chunking scripts. --- docs/acceptance/db.md | 4 +-- postgres/chunk_parquet.py | 18 ----------- postgres/load_parquet_chunks.sh | 55 --------------------------------- postgres/load_to_prod.sh | 2 +- 4 files changed, 3 insertions(+), 76 deletions(-) delete mode 100755 postgres/load_parquet_chunks.sh diff --git a/docs/acceptance/db.md b/docs/acceptance/db.md index a61c208..a4ef460 100644 --- a/docs/acceptance/db.md +++ b/docs/acceptance/db.md @@ -10,7 +10,7 @@ The acceptance test below provides steps to verify that the deliverable meets ou The input data is stored in Parquet format on AWS S3 (object storage), specifically in the file `space2stats_updated.parquet`. Any additional fields must be appended to this file. The Parquet file is tabular with the following columns: - `hex_id` -- `{aggregation_method[sum, mean, etc.]}_{variable_name}_{year}` +- `{variable_name}_{aggregation_method[sum, mean, etc.]}_{year}` ### Database Setup @@ -20,7 +20,7 @@ You can use a local database for this acceptance test by running the following c docker-compose up ``` -Alternatively, you can connect to a remote database, such as the Tembo database used for production. +Alternatively, you can connect to a remote database, such as the Tembo database (`reluctantly-simple-spoonbill.data-1.use1.tembo.io`) used for production.
### Data Ingestion diff --git a/postgres/chunk_parquet.py b/postgres/chunk_parquet.py index 30db620..e69de29 100644 --- a/postgres/chunk_parquet.py +++ b/postgres/chunk_parquet.py @@ -1,18 +0,0 @@ -import os - -import pandas as pd - -chunk_dir = "parquet_chunks" -df = pd.read_parquet("space2stats_updated.parquet") -chunk_size = 100000 # Number of rows per chunk - -if not os.path.exists(chunk_dir): - os.mkdir(chunk_dir) - -for i in range(0, len(df), chunk_size): - chunk = df.iloc[i : i + chunk_size] - chunk.to_parquet( - os.path.join(chunk_dir, f"space2stats_part_{i // chunk_size}.parquet") - ) - -print("Parquet file split into smaller chunks.") diff --git a/postgres/load_parquet_chunks.sh b/postgres/load_parquet_chunks.sh deleted file mode 100755 index 8f79d8b..0000000 --- a/postgres/load_parquet_chunks.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - - -# Load environment variables from db.env file -if [ -f db.env ]; then - export $(cat db.env | grep -v '#' | awk '/=/ {print $1}') -fi - -# Check if required environment variables are set -if [ -z "$DB_HOST" ] || [ -z "$DB_PORT" ] || [ -z "$DB_NAME" ] || [ -z "$DB_USER" ] || [ -z "$DB_PASSWORD" ]; then - echo "One or more required environment variables are missing." - exit 1 -fi - -# Directory containing the Parquet chunks -CHUNKS_DIR="parquet_chunks" - -# Name of the target table -TABLE_NAME="space2stats" - -# Flag to check if the table exists -TABLE_EXISTS=$(psql -h $DB_HOST -p $DB_PORT -d $DB_NAME -U $DB_USER -tAc "SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_schema='public' AND table_name='$TABLE_NAME');") - -# Loop through each Parquet file in the chunks directory -for PARQUET_FILE in "$CHUNKS_DIR"/*.parquet; -do - echo "Importing $PARQUET_FILE..." 
- - if [ "$TABLE_EXISTS" = "t" ]; then - # Table exists, append data - ogr2ogr -f "PostgreSQL" \ - PG:"host=$DB_HOST port=$DB_PORT dbname=$DB_NAME user=$DB_USER password=$DB_PASSWORD" \ - "$PARQUET_FILE" \ - -nln $TABLE_NAME \ - -append - else - # Table does not exist, create table and import data - ogr2ogr -f "PostgreSQL" \ - PG:"host=$DB_HOST port=$DB_PORT dbname=$DB_NAME user=$DB_USER password=$DB_PASSWORD" \ - "$PARQUET_FILE" \ - -nln $TABLE_NAME \ - -lco SPATIAL_INDEX=NONE - - TABLE_EXISTS="t" - fi - - if [ $? -ne 0 ]; then - echo "Failed to import $PARQUET_FILE" - exit 1 - fi - - echo "Successfully imported $PARQUET_FILE" -done - -echo "All Parquet chunks have been imported." \ No newline at end of file diff --git a/postgres/load_to_prod.sh b/postgres/load_to_prod.sh index 3149e7b..9dedc00 100755 --- a/postgres/load_to_prod.sh +++ b/postgres/load_to_prod.sh @@ -17,7 +17,7 @@ CHUNKS_DIR="parquet_chunks" # Name of the target table TABLE_NAME="space2stats" -PARQUET_FILE=space2stats_updated.parquet +PARQUET_FILE=space2stats.parquet echo "Starting"