diff --git a/.github/workflows/bi-transfer_pola_backend_to_bq.yml b/.github/workflows/bi-transfer_pola_backend_to_bq.yml
index b8d0746164b..2f1237e464c 100644
--- a/.github/workflows/bi-transfer_pola_backend_to_bq.yml
+++ b/.github/workflows/bi-transfer_pola_backend_to_bq.yml
@@ -29,18 +29,17 @@ env:
   GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
   GCP_PROJECT_ID: pola-bi-looker
-  GCP_PROJECT_NUMBER: 354540873199
+  GCP_PROJECT_NUMBER: "354540873199"
   GCP_REGION: europe-west3
   GCP_BUCKET_NAME: pola-app_pola-backend_postgres_csv-files
   GCP_IDENTITY_POOL: github
   GCP_IDENTITY_PROVIDER: pola-backend-repo
+  HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
 
 jobs:
   deploy-bi:
     name: "Transfer PostgresSQL to BQ"
     runs-on: ubuntu-latest
-    env:
-      HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
     steps:
       - name: Set dynamic job variable
         shell: python
@@ -88,12 +87,21 @@ jobs:
           export_environment_variables: true
           create_credentials_file: true
       - name: 'Set up Cloud SDK'
-        uses: 'google-github-actions/setup-gcloud@v2'
+        uses: 'google-github-actions/setup-gcloud@v2'
+      - name: 'Use gcloud CLI'
+        run: 'gcloud info'
+      - name: 'Use gcloud CLI'
+        run: 'gcloud auth list --filter=status:ACTIVE --format="value(account)"'
+      - name: 'Use gcloud CLI'
+        run: 'gcloud auth list'
+
+      # - name: Debug OIDC Claims
+      #   uses: 'github/actions-oidc-debugger@main'
+      #   with:
+      #     audience: 'https://github.com/github'
       - name: Install Heroku CLI
         run: |
           curl https://cli-assets.heroku.com/install.sh | sh
-        env:
-          HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
       - name: Retrieve PostgreSQL credentials
         run: |
           DATABASE_URL=$(heroku config:get DATABASE_URL --app "${HEROKU_APP}")
@@ -103,37 +111,67 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version: '3.9'
+      - name: Copy file to bucket
+        run: |
+          set -x;
+          date | gcloud storage cp - "gs://${GCP_BUCKET_NAME}/date.txt"
+          gcloud storage cat "gs://${GCP_BUCKET_NAME}/date.txt"
+          gcloud storage ls "gs://${GCP_BUCKET_NAME}/"
       - name: "Install dependencies"
         run: pip install google-cloud-bigquery google-cloud-storage psycopg2-binary
       - name: Run transfer script
         env:
           TABLE_NAMES: |
-            ai_pics_aiattachment,
-            ai_pics_aipics,
-            bi_companies_by_query_group,
-            bi_companies_with_count_group,
-            bi_new_product_by_hour,
-            bi_popular_not_verified_products,
-            bi_product_by_time,
-            bi_queries_by_time,
-            bi_queries_stats_intervals,
-            bi_stats_queries_uq_users_by_week,
-            company_brand,
-            company_company,
-            gpc_brick,
-            gpc_class,
-            gpc_family,
-            gpc_segment,
-            pola_query,
-            pola_searchquery,
-            pola_stats,
-            product_product,
-            report_attachment,
-            report_report,
-            users_user
+            ai_pics_aiattachment
         run: |
           python ./pola-bi/postgres_to_bigquery.py --verbose all \
             --database-url "${DATABASE_URL}" \
             --table-names "${TABLE_NAMES}" \
             --staging-url "gs://${GCP_BUCKET_NAME}/" \
-            --dataset-id "${GCP_BIGQUERY_DATASET}" \
+            --dataset-id "${GCP_BIGQUERY_DATASET}"
+      - name: Set up GoLang
+        uses: actions/setup-go@v3
+        with:
+          go-version: "1.21"
+          cache: false
+
+      - name: "Checkout"
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 2
+          repository: slingdata-io/sling-cli
+          path: sling
+          ref: bigquery-openid
+
+      - run: |
+          cd sling
+          go mod edit -dropreplace='github.com/flarco/g' go.mod
+          go mod edit -dropreplace='github.com/slingdata-io/sling' go.mod
+          go mod edit -droprequire='github.com/slingdata-io/sling' go.mod
+          go mod tidy
+          go get -u golang.org/x/oauth2
+          go get -u cloud.google.com/go
+          go get -u cloud.google.com/go/bigquery
+          go get -u cloud.google.com/go/bigtable
+          go get -u cloud.google.com/go/storage
+          go build -o /usr/local/bin/sling cmd/sling/*.go
+          cd ..
+
+      - name: Run transfer script
+        run: |
+          set -x;
+          sling conns set MY_BIGQUERY \
+            type=bigquery \
+            project="${GCP_PROJECT_ID}" \
+            dataset="${GCP_BIGQUERY_DATASET}" \
+            gc_bucket="${GCP_BUCKET_NAME}" \
+            location="${GCP_REGION}"
+          sling conns test MY_BIGQUERY
+          export "MY_POSTGRES=${DATABASE_URL}"
+          sling conns test MY_POSTGRES
+
+# TODO: Run replication
+#          export "GOOGLE_APPLICATION_CREDENTIALS=$(./scripts/gcloud_generate_temp_creds.sh)"
+#          gcloud storage ls "gs://${GCP_BUCKET_NAME}/"
+#          cat "$(./scripts/gcloud_generate_temp_creds.sh)"
+#          export "GOOGLE_APPLICATION_CREDENTIALS=$(./scripts/gcloud_generate_temp_creds.sh)"
diff --git a/pola-bi/sling-data/replication.yaml b/pola-bi/sling-data/replication.yaml
new file mode 100644
index 00000000000..4979a36bd65
--- /dev/null
+++ b/pola-bi/sling-data/replication.yaml
@@ -0,0 +1,33 @@
+source: MY_POSTGRES
+target: MY_BIGQUERY
+
+defaults:
+  mode: full-refresh
+  object: '{target_schema}.raw__{stream_table}'
+  primary_key: [id]
+
+streams:
+  public.ai_pics_aiattachment:
+  public.ai_pics_aipics:
+  public.bi_*:
+    primary_key: []
+  public.company_brand:
+  public.company_company:
+  public.gpc_*:
+  public.pola_query:
+    mode: incremental
+    update_key: id
+  public.pola_searchquery:
+  public.pola_stats:
+  public.product_product:
+  public.report_attachment:
+  public.report_report:
+  public.users_user:
+    select: ["-password"]
+
+env:
+  # Adds the _sling_loaded_at timestamp column
+  SLING_LOADED_AT_COLUMN: true
+
+  # Allows creating empty tables
+  SLING_ALLOW_EMPTY: TRUE
diff --git a/pola-bi/sling-data/run_local_sling.sh b/pola-bi/sling-data/run_local_sling.sh
new file mode 100755
index 00000000000..92eb393d988
--- /dev/null
+++ b/pola-bi/sling-data/run_local_sling.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+export MY_POSTGRES='postgresql://pola_app:pola_app@localhost:5432/pola_app?sslmode=disable'
+
+sling conns set MY_BIGQUERY \
+  type=bigquery \
+  project=pola-bi-looker \
+  dataset=pola_backend__local \
+  gc_bucket=pola-app_pola-backend_postgres_csv-files \
+  location=europe-west3
+
+sling "${@}"
diff --git a/scripts/gcloud_generate_temp_creds.sh b/scripts/gcloud_generate_temp_creds.sh
new file mode 100755
index 00000000000..a130fc4cba9
--- /dev/null
+++ b/scripts/gcloud_generate_temp_creds.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+
+GCP_CREDENTIAL_FILE="$(mktemp -t gcp-credentials-XXXXXX).json"
+# Generate a credentials file for the current gcloud session in a temporary location.
+# Token refreshing SHOULD work, but it is not tested.
+
+jq -n \
+  --arg access_token "$(gcloud auth print-access-token)" \
+  --arg refresh_token "$(gcloud auth print-refresh-token || true)" \
+  --arg gcp_client_id "$(gcloud config get auth/client_id)" \
+  --arg gcp_client_secret "$(gcloud config get auth/client_secret)" \
+  '{
+    type: "authorized_user",
+    token: $access_token,
+    refresh_token: $refresh_token,
+    client_id: $gcp_client_id,
+    client_secret: $gcp_client_secret
+}' > "${GCP_CREDENTIAL_FILE}"
+
+echo "${GCP_CREDENTIAL_FILE}"
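
Taken together, the new files above are meant to let the replication be exercised locally as well as in CI. A minimal local sketch (not part of this diff) could look like the lines below; it assumes sling is already on the PATH, the local Postgres from run_local_sling.sh is reachable, and that sling's "run -r" invocation is used to execute a replication file, as in sling's documented replication mode:

  # Hypothetical local invocation; paths and flags are illustrative, not taken from the diff.
  # 1. Produce a temporary credentials file from the active gcloud session.
  export GOOGLE_APPLICATION_CREDENTIALS="$(./scripts/gcloud_generate_temp_creds.sh)"
  # 2. Set up the MY_POSTGRES/MY_BIGQUERY connections and run the replication;
  #    run_local_sling.sh forwards its arguments to sling via "${@}".
  ./pola-bi/sling-data/run_local_sling.sh run -r ./pola-bi/sling-data/replication.yaml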