Skip to content

Merge pull request #413 from AtomikLabs/412-bug-author-names-are-dupl… #1032

Merge pull request #413 from AtomikLabs/412-bug-author-names-are-dupl…

Merge pull request #413 from AtomikLabs/412-bug-author-names-are-dupl… #1032

Workflow file for this run

# Workflow "Build".
# Triggers: pushes to main/stage/dev and pull requests targeting dev/test,
# in both cases only when one of the listed paths changed; the workflow can
# also be started manually through workflow_dispatch.
name: Build
on:
  push:
    branches: [main, stage, dev]
    paths:
      - 'infra/core/**'
      - '.github/workflows/infra.yaml'
      - 'orchestration/**'
  pull_request:
    branches: [dev, test]
    paths:
      - 'infra/core/**'
      - '.github/workflows/infra.yaml'
      - 'orchestration/**'
  workflow_dispatch:
jobs:
# Job create_infra: makes sure the Terraform backend resources (S3 bucket and
# DynamoDB lock table) exist, runs Terraform init/validate/plan/apply for
# infra/core, then publishes the Terraform outputs to S3 and as the
# "terraform_outputs" workflow artifact consumed by the later jobs.
create_infra:
  runs-on: ubuntu-latest
  strategy:
    matrix:
      python-version: ["3.10"]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        fetch-depth: 0
    - name: Install jq
      # -y keeps apt-get from stopping at a confirmation prompt on the runner.
      run: sudo apt-get install -y jq
    - name: Install node and npm
      uses: actions/setup-node@v4
      with:
        node-version: "20"
    - name: Install NPM Dependencies
      run: |
        npm install
    # Local composite action (defined outside this file). Later steps read
    # env.ENV_NAME and env.ENV_FILE — presumably exported here; verify against
    # the action's definition.
    - name: Load Environment Variables
      uses: ./.github/actions/load-env-variables
    # Pulls individual settings out of infra/core/<ENV_NAME>.tfvars and exposes
    # them as step outputs (steps.vars.outputs.*).
    - name: Extract variables
      id: vars
      run: |
        echo "app_name=$(grep '^app_name' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "aws_region=$(grep '^aws_region' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "backend_dynamodb_table=$(grep '^backend_dynamodb_table' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "infra_config_bucket=$(grep '^infra_config_bucket =' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "terraform_outputs_prefix=$(grep '^terraform_outputs_prefix' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
    - name: Set Region
      run: |
        echo "AWS_REGION=${{steps.vars.outputs.aws_region}}" >> $GITHUB_ENV
    - name: Set AWS Credentials
      uses: ./.github/actions/set-aws-credentials
      with:
        ENVIRONMENT_NAME: ${{ env.ENV_NAME }}
        PROD_AWS_ACCESS_KEY_ID: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }}
        PROD_AWS_SECRET_ACCESS_KEY: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }}
        STAGE_AWS_ACCESS_KEY_ID: ${{ secrets.STAGE_AWS_ACCESS_KEY_ID }}
        STAGE_AWS_SECRET_ACCESS_KEY: ${{ secrets.STAGE_AWS_SECRET_ACCESS_KEY }}
        DEV_AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
        DEV_AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
    # Selects the Secrets Manager reference for the current environment and
    # exposes it as steps.neo4j-creds.outputs.NEO4J_CREDENTIALS_ARN. The value
    # stays empty when ENV_NAME is none of dev/prod/stage/test.
    - name: Set NEO4J Credentials
      id: neo4j-creds
      run: |
        echo "${{steps.vars.outputs.aws_region}}"
        credentials_arn=""
        if [ "${{ env.ENV_NAME }}" == "dev" ]; then
          credentials_arn="${{ secrets.AWS_DEV_NEO4J_CREDENTIALS }}"
        elif [ "${{ env.ENV_NAME }}" == "prod" ]; then
          credentials_arn="${{ secrets.AWS_PROD_NEO4J_CREDENTIALS }}"
        elif [ "${{ env.ENV_NAME }}" == "stage" ]; then
          credentials_arn="${{ secrets.AWS_STAGE_NEO4J_CREDENTIALS }}"
        elif [ "${{ env.ENV_NAME }}" == "test" ]; then
          credentials_arn="${{ secrets.AWS_TEST_NEO4J_CREDENTIALS }}"
        fi
        echo "NEO4J_CREDENTIALS_ARN=$credentials_arn" >> $GITHUB_OUTPUT
    # The Terraform steps below read NEO4J_CREDS_NEO4J_USERNAME and
    # NEO4J_CREDS_NEO4J_PASSWORD from the environment; they are expected to
    # come from this step (alias NEO4J_CREDS, JSON parsing enabled).
    - name: Get neo4j secrets
      uses: aws-actions/aws-secretsmanager-get-secrets@v1
      with:
        secret-ids: |
          NEO4J_CREDS, ${{ steps.neo4j-creds.outputs.NEO4J_CREDENTIALS_ARN }}
        parse-json-secrets: true
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
    - name: Ensure S3 bucket exists
      run: |
        # The bucket is treated as present unless the listing reports NoSuchBucket.
        if ! aws s3 ls "s3://${{ steps.vars.outputs.infra_config_bucket }}" 2>&1 | grep -q 'NoSuchBucket'; then
          echo "Bucket exists."
        else
          echo "Bucket does not exist. Creating bucket..."
          # S3 URIs need the full "s3://" scheme; "s3:/" is rejected by the CLI.
          aws s3 mb "s3://${{ steps.vars.outputs.infra_config_bucket }}"
          aws s3api put-bucket-versioning --bucket ${{ steps.vars.outputs.infra_config_bucket }} --versioning-configuration Status=Enabled
        fi
    - name: Ensure DynamoDB table exists
      run: |
        TABLE_NAME="${{ steps.vars.outputs.backend_dynamodb_table }}"
        REGION="${{ env.AWS_REGION }}"
        # Create the table (string hash key LockID, on-demand billing) only
        # when describe-table reports that it is missing.
        if aws dynamodb describe-table --table-name "$TABLE_NAME" 2>&1 | grep -q 'ResourceNotFoundException'; then
          echo "DynamoDB table does not exist. Creating table..."
          aws dynamodb create-table \
            --table-name "$TABLE_NAME" \
            --attribute-definitions AttributeName=LockID,AttributeType=S \
            --key-schema AttributeName=LockID,KeyType=HASH \
            --billing-mode PAY_PER_REQUEST \
            --region "$REGION"
          echo "DynamoDB table created."
        else
          echo "DynamoDB table exists."
        fi
    - name: Set up Terraform
      uses: hashicorp/setup-terraform@v3
      with:
        terraform_version: "1.7.0"
    # Flattens the JSON object in ENV_FILE into key=value lines and appends
    # each of them to GITHUB_ENV so later steps see them as environment
    # variables.
    - name: Set Terraform Variables from Environment File
      run: |
        jq -r 'to_entries|map("\(.key)=\(.value|tostring)")|.[]' ${{ env.ENV_FILE }} > env_vars
        while IFS= read -r line; do
          echo "$line" >> $GITHUB_ENV
        done < env_vars
    - name: Initialize Terraform
      run: terraform init -upgrade
      working-directory: ./infra/core
    - name: Validate Terraform
      run: terraform validate
      working-directory: ./infra/core
    - name: Plan Terraform
      id: plan
      run: terraform plan -var-file="${{ env.ENV_NAME }}.tfvars"
      working-directory: ./infra/core
      env:
        AWS_DEFAULT_REGION: ${{ env.AWS_REGION }}
        TF_VAR_alert_email: ${{ secrets.ALERT_EMAIL }}
        TF_VAR_home_ip: ${{ secrets.HOME_IP }}
        TF_VAR_neo4j_password: ${{ env.NEO4J_CREDS_NEO4J_PASSWORD }}
        TF_VAR_neo4j_username: ${{ env.NEO4J_CREDS_NEO4J_USERNAME }}
    # Applies with the same var-file and TF_VAR_* inputs as the plan step; the
    # plan above is not saved, so apply computes its own plan.
    - name: Apply Terraform
      run: |
        terraform apply -var-file="${{ env.ENV_NAME }}.tfvars" -auto-approve
      working-directory: ./infra/core
      env:
        AWS_DEFAULT_REGION: ${{ env.AWS_REGION }}
        TF_VAR_alert_email: ${{ secrets.ALERT_EMAIL }}
        TF_VAR_home_ip: ${{ secrets.HOME_IP }}
        TF_VAR_neo4j_password: ${{ env.NEO4J_CREDS_NEO4J_PASSWORD }}
        TF_VAR_neo4j_username: ${{ env.NEO4J_CREDS_NEO4J_USERNAME }}
    - name: Save Terraform Outputs
      run: |
        terraform output -json > terraform_outputs.json
        aws s3 cp terraform_outputs.json s3://${{ steps.vars.outputs.infra_config_bucket }}/terraform/${{ env.ENV_NAME }}-${{ steps.vars.outputs.terraform_outputs_prefix }}.json
      working-directory: ./infra/core
    - name: Upload Terraform Outputs
      uses: actions/upload-artifact@v4
      with:
        name: terraform_outputs
        path: ./infra/core/terraform_outputs.json
# Job upload_infra_files: runs after create_infra and copies the Airflow and
# Kafka orchestration files to the infra config bucket. The Kafka
# docker-compose file gets the orchestration host's private IP (read from the
# Terraform outputs artifact) substituted in before upload.
upload_infra_files:
  runs-on: ubuntu-latest
  needs: create_infra
  strategy:
    matrix:
      python-version: ["3.10"]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    - name: Install jq
      # -y keeps apt-get from stopping at a confirmation prompt on the runner.
      run: sudo apt-get install -y jq
    - name: Install node and npm
      uses: actions/setup-node@v4
      with:
        node-version: "20"
    - name: Install NPM Dependencies
      run: |
        npm install
    # Local composite action (defined outside this file). Later steps read
    # env.ENV_NAME — presumably exported here; verify against the action.
    - name: Load Environment Variables
      uses: ./.github/actions/load-env-variables
    # Pulls individual settings out of infra/core/<ENV_NAME>.tfvars and exposes
    # them as step outputs (steps.vars.outputs.*).
    - name: Extract variables
      id: vars
      run: |
        echo "airflow_dags_env_path=$(grep '^airflow_dags_env_path' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "app_name=$(grep '^app_name' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "arxiv_api_max_retries=$(grep '^arxiv_api_max_retries' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "arxiv_base_url=$(grep '^arxiv_base_url' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'=' '{print $2}' | xargs)" >> $GITHUB_OUTPUT
        echo "arxiv_ingestion_day_span=$(grep '^arxiv_ingestion_day_span' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "arxiv_sets=$(awk -F'=' '/^arxiv_sets/ {gsub(/ /, "", $2); print $2}' infra/core/${{ env.ENV_NAME }}.tfvars)" >> $GITHUB_OUTPUT
        echo "aws_region=$(grep '^aws_region' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "backend_dynamodb_table=$(grep '^backend_dynamodb_table' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "data_ingestion_key_prefix=$(grep '^data_ingestion_key_prefix' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "fetch_from_arxiv_task_version=$(grep '^fetch_from_arxiv_task_version' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "infra_config_bucket=$(grep '^infra_config_bucket =' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "most_recent_research_records_version=$(grep '^most_recent_research_records_version' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "neo4j_connection_retries=$(grep '^neo4j_connection_retries' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "terraform_outputs_prefix=$(grep '^terraform_outputs_prefix' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
    - name: Set Region
      run: |
        echo "AWS_REGION=${{steps.vars.outputs.aws_region}}" >> $GITHUB_ENV
    - name: Set AWS Credentials
      uses: ./.github/actions/set-aws-credentials
      with:
        ENVIRONMENT_NAME: ${{ env.ENV_NAME }}
        PROD_AWS_ACCESS_KEY_ID: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }}
        PROD_AWS_SECRET_ACCESS_KEY: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }}
        STAGE_AWS_ACCESS_KEY_ID: ${{ secrets.STAGE_AWS_ACCESS_KEY_ID }}
        STAGE_AWS_SECRET_ACCESS_KEY: ${{ secrets.STAGE_AWS_SECRET_ACCESS_KEY }}
        DEV_AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
        DEV_AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
    # Artifact produced by the create_infra job; lands in ./outputs.
    - name: Download Terraform Outputs Artifact
      uses: actions/download-artifact@v4
      with:
        name: terraform_outputs
        path: outputs
    - name: Extract Variables from Terraform Outputs
      id: terraform_vars
      run: |
        echo "Extracting instance IP..."
        echo "orchestration_host_private_ip=$(jq -r '.orchestration_host_private_ip.value' outputs/terraform_outputs.json)" >> $GITHUB_OUTPUT
    # --exclude patterns are matched against paths relative to the source
    # directory (orchestration/airflow), so the test folder is addressed as
    # "dags/tests/*" and cache directories with a wildcard on both sides.
    - name: Copy Airflow files to s3
      run: |
        aws s3 cp orchestration/airflow s3://${{ steps.vars.outputs.infra_config_bucket }}/orchestration/${{ env.ENV_NAME }}/airflow --recursive --exclude "*.pyc" --exclude "*__pycache__/*" --exclude "dags/tests/*"
    # Rewrites the checked-out copy in place; the placeholder REPLACE_THIS_IP
    # is replaced everywhere it occurs in the compose file.
    - name: Update Kafka docker-compose.yaml with instance ip
      id: update_kafka_docker_compose
      run: |
        echo "Updating Kafka docker-compose with instance ip..."
        sed -i "s/REPLACE_THIS_IP/${{ steps.terraform_vars.outputs.orchestration_host_private_ip }}/g" orchestration/kafka/host_config/docker-compose.yaml
    - name: Copy Kafka files to s3
      run: |
        aws s3 cp orchestration/kafka s3://${{ steps.vars.outputs.infra_config_bucket }}/orchestration/${{ env.ENV_NAME }}/kafka --recursive --exclude "*.pyc" --exclude "*__pycache__/*"
# Job deploy: runs after upload_infra_files. It writes the DAG and Kafka .env
# files to S3, works out which parts of the orchestration stack changed in
# this run, and sends the matching create / rebuild / sync scripts to the
# orchestration instance through SSM Run Command.
deploy:
  needs: upload_infra_files
  runs-on: ubuntu-latest
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    - name: Install jq
      # -y keeps apt-get from stopping at a confirmation prompt on the runner.
      run: sudo apt-get install -y jq
    - name: Install node and npm
      uses: actions/setup-node@v4
      with:
        node-version: "20"
    - name: Install NPM Dependencies
      run: |
        npm install
    # Local composite action (defined outside this file). Later steps read
    # env.ENV_NAME — presumably exported here; verify against the action.
    - name: Load Environment Variables
      uses: ./.github/actions/load-env-variables
    # Pulls individual settings out of infra/core/<ENV_NAME>.tfvars and exposes
    # them as step outputs (steps.vars.outputs.*). Each output is written once.
    - name: Extract variables
      id: vars
      run: |
        echo "airflow_dags_env_path=$(grep '^airflow_dags_env_path' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "app_name=$(grep '^app_name' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "arxiv_api_max_retries=$(grep '^arxiv_api_max_retries' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "arxiv_base_url=$(grep '^arxiv_base_url' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'=' '{print $2}' | xargs)" >> $GITHUB_OUTPUT
        echo "arxiv_ingestion_day_span=$(grep '^arxiv_ingestion_day_span' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "arxiv_sets=$(awk -F'=' '/^arxiv_sets/ {gsub(/ /, "", $2); print $2}' infra/core/${{ env.ENV_NAME }}.tfvars)" >> $GITHUB_OUTPUT
        echo "aws_region=$(grep '^aws_region' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "backend_dynamodb_table=$(grep '^backend_dynamodb_table' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "create_pod_task_version=$(grep '^create_pod_task_version' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "data_ingestion_key_prefix=$(grep '^data_ingestion_key_prefix' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "etl_key_prefix=$(grep '^etl_key_prefix' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "fetch_from_arxiv_task_version=$(grep '^fetch_from_arxiv_task_version' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "infra_config_bucket=$(grep '^infra_config_bucket =' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "most_recent_research_records_version=$(grep '^most_recent_research_records_version' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "neo4j_connection_retries=$(grep '^neo4j_connection_retries' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "parse_summaries_task_version=$(grep '^parse_summaries_task_version' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "persist_summaries_task_version=$(grep '^persist_summaries_task_version' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "pods_prefix=$(grep '^pods_prefix' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "records_prefix=$(grep '^records_prefix' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "save_summaries_to_datalake_task_version=$(grep '^save_summaries_to_datalake_task_version' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
        echo "terraform_outputs_prefix=$(grep '^terraform_outputs_prefix' infra/core/${{ env.ENV_NAME }}.tfvars | awk -F'[= "]+' '{print $2}')" >> $GITHUB_OUTPUT
    - name: Set Region
      run: |
        echo "Setting AWS region for cli commands..."
        echo "AWS_REGION=${{steps.vars.outputs.aws_region}}" >> $GITHUB_ENV
    - name: Set AWS Credentials
      uses: ./.github/actions/set-aws-credentials
      with:
        ENVIRONMENT_NAME: ${{ env.ENV_NAME }}
        PROD_AWS_ACCESS_KEY_ID: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }}
        PROD_AWS_SECRET_ACCESS_KEY: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }}
        STAGE_AWS_ACCESS_KEY_ID: ${{ secrets.STAGE_AWS_ACCESS_KEY_ID }}
        STAGE_AWS_SECRET_ACCESS_KEY: ${{ secrets.STAGE_AWS_SECRET_ACCESS_KEY }}
        DEV_AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
        DEV_AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
    # Artifact produced by the create_infra job; lands in ./outputs.
    - name: Download Terraform Outputs Artifact
      uses: actions/download-artifact@v4
      with:
        name: terraform_outputs
        path: outputs
    - name: Extract Variables from Terraform Outputs
      id: terraform_vars
      run: |
        echo "Extracting instance ID..."
        echo "arxiv_research_ingestion_event_schema=$(jq -r '.arxiv_research_ingestion_event_schema.value' outputs/terraform_outputs.json)" >> $GITHUB_OUTPUT
        echo "aws_glue_registry_name=$(jq -r '.aws_glue_registry_name.value' outputs/terraform_outputs.json)" >> $GITHUB_OUTPUT
        echo "data_arxiv_summaries_ingestion_complete=$(jq -r '.data_arxiv_summaries_ingestion_complete.value' outputs/terraform_outputs.json)" >> $GITHUB_OUTPUT
        echo "data_bucket=$(jq -r '.data_bucket.value' outputs/terraform_outputs.json)" >> $GITHUB_OUTPUT
        echo "instance_id=$(jq -r '.orchestration_instance_id.value' outputs/terraform_outputs.json)" >> $GITHUB_OUTPUT
        echo "neo4j_instance_private_ip=$(jq -r '.neo4j_instance_private_ip.value' outputs/terraform_outputs.json)" >> $GITHUB_OUTPUT
        echo "orchestration_host_private_ip=$(jq -r '.orchestration_host_private_ip.value' outputs/terraform_outputs.json)" >> $GITHUB_OUTPUT
    # Builds the .env consumed by the DAGs and uploads it next to them.
    # ARXIV_SETS is written with single quotes to prevent shell expansion of
    # the list value; every other entry uses double quotes.
    - name: Create and Upload DAGS Environment File
      id: create_airflow_env_file
      run: |
        echo "Creating DAGS environment file..."
        echo "AIRFLOW_DAGS_ENV_PATH=${{ steps.vars.outputs.airflow_dags_env_path }}" >> .env
        echo "ARXIV_API_MAX_RETRIES=${{ steps.vars.outputs.arxiv_api_max_retries }}" >> .env
        echo "ARXIV_BASE_URL=${{ steps.vars.outputs.arxiv_base_url }}" >> .env
        echo "ARXIV_INGESTION_DAY_SPAN=${{ steps.vars.outputs.arxiv_ingestion_day_span }}" >> .env
        echo "ARXIV_RESEARCH_INGESTION_EVENT_SCHEMA=${{ steps.terraform_vars.outputs.arxiv_research_ingestion_event_schema }}" >> .env
        echo 'ARXIV_SETS=${{ steps.vars.outputs.arxiv_sets }}' >> .env
        echo "AWS_GLUE_REGISTRY_NAME=${{ steps.terraform_vars.outputs.aws_glue_registry_name }}" >> .env
        echo "AWS_REGION=${{ steps.vars.outputs.aws_region }}" >> .env
        echo "CREATE_POD_TASK_VERSION=${{ steps.vars.outputs.create_pod_task_version }}" >> .env
        echo "DATA_BUCKET=${{ steps.terraform_vars.outputs.data_bucket }}" >> .env
        echo "DATA_INGESTION_KEY_PREFIX=${{ steps.vars.outputs.data_ingestion_key_prefix }}" >> .env
        echo "ETL_KEY_PREFIX=${{ steps.vars.outputs.etl_key_prefix }}" >> .env
        echo "ENVIRONMENT_NAME=${{ env.ENV_NAME }}" >> .env
        echo "FETCH_FROM_ARXIV_TASK_VERSION=${{ steps.vars.outputs.fetch_from_arxiv_task_version }}" >> .env
        echo "MOST_RECENT_RESEARCH_RECORDS_VERSION=${{ steps.vars.outputs.most_recent_research_records_version }}" >> .env
        echo "NEO4J_CONNECTION_RETRIES=${{ steps.vars.outputs.neo4j_connection_retries }}" >> .env
        echo "NEO4J_URI=neo4j://${{ steps.terraform_vars.outputs.neo4j_instance_private_ip }}:7687" >> .env
        echo "ORCHESTRATION_HOST_PRIVATE_IP=${{ steps.terraform_vars.outputs.orchestration_host_private_ip }}" >> .env
        echo "PARSE_SUMMARIES_TASK_VERSION=${{ steps.vars.outputs.parse_summaries_task_version }}" >> .env
        echo "PERSIST_SUMMARIES_TASK_VERSION=${{ steps.vars.outputs.persist_summaries_task_version }}" >> .env
        echo "PODS_PREFIX=${{ steps.vars.outputs.pods_prefix }}" >> .env
        echo "RECORDS_PREFIX=${{ steps.vars.outputs.records_prefix }}" >> .env
        echo "SAVE_SUMMARIES_TO_DATALAKE_TASK_VERSION=${{ steps.vars.outputs.save_summaries_to_datalake_task_version }}" >> .env
        cat .env
        echo "Copying file to s3..."
        aws s3 cp .env s3://${{ steps.vars.outputs.infra_config_bucket }}/orchestration/${{ env.ENV_NAME }}/airflow/dags/.env
    # Makes the PR's base and head commits available locally for the diff in
    # the next step.
    - name: Fetch specific commits for PR
      if: github.event_name == 'pull_request'
      run: |
        git fetch origin +${{ github.event.pull_request.base.sha }}
        git fetch origin +${{ github.event.pull_request.head.sha }}
    # Sets three outputs from the list of changed files:
    #   orchestration_host_rebuild - something under infra/core/orchestration/ changed
    #   airflow_rebuild            - Airflow image inputs changed (requirements,
    #                                Dockerfile, compose file, host_config/, plugins/)
    #   kafka_rebuild              - something under orchestration/kafka/host_config/ changed
    - name: Check for Changes
      id: check_changes
      env:
        # Passed through the environment so the script compares real run
        # values instead of literals.
        EVENT_NAME: ${{ github.event_name }}
        PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
        PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
      run: |
        echo "Determining context and checking for changes..."
        if ! git rev-parse --git-dir > /dev/null 2>&1; then
          echo "Error: Current directory is not a valid Git repository."
          exit 1
        fi
        if [[ $(git rev-list --count HEAD) -eq 0 ]]; then
          echo "Error: No commits found in the repository."
          exit 1
        fi
        if [[ "$EVENT_NAME" == "pull_request" ]]; then
          # Diff the two commits fetched by the previous step.
          CHANGED_FILES=$(git diff --name-only "$PR_BASE_SHA..$PR_HEAD_SHA")
        else
          git fetch --depth=2 # Ensure at least two commits are fetched
          if git rev-parse --verify --quiet "HEAD^" > /dev/null; then
            # Diff against the first parent: "git show --name-only" lists no
            # files for a cleanly merged merge commit, which would hide every
            # change brought in by a merged pull request.
            CHANGED_FILES=$(git diff --name-only "HEAD^" HEAD)
          else
            # No parent available (root commit): list the files of HEAD itself.
            CHANGED_FILES=$(git show --name-only HEAD --pretty="")
          fi
        fi
        if echo "$CHANGED_FILES" | grep -qi 'infra/core/orchestration/'; then
          echo "orchestration_host_rebuild=true" >> $GITHUB_OUTPUT
        else
          echo "orchestration_host_rebuild=false" >> $GITHUB_OUTPUT
        fi
        AIRFLOW_REBUILD=false
        for file in $CHANGED_FILES; do
          if [[ $file =~ ^orchestration/airflow/(requirements.txt|Dockerfile|docker-compose.yaml|host_config/|plugins/) ]]; then
            AIRFLOW_REBUILD=true
            break
          fi
        done
        if [[ $AIRFLOW_REBUILD == true ]]; then
          echo "airflow_rebuild=true" >> $GITHUB_OUTPUT
        else
          echo "airflow_rebuild=false" >> $GITHUB_OUTPUT
        fi
        if echo "$CHANGED_FILES" | grep -qi 'orchestration/kafka/host_config/'; then
          echo "kafka_rebuild=true" >> $GITHUB_OUTPUT
        else
          echo "kafka_rebuild=false" >> $GITHUB_OUTPUT
        fi
    # Airflow: exactly one of the next three steps runs, chosen by the
    # check_changes outputs (host rebuild > image rebuild > plain sync).
    # NOTE(review): each step only submits the SSM command; nothing here waits
    # for or inspects the command result — confirm that is intended.
    - name: Setup Airflow
      id: setup_airflow
      if: steps.check_changes.outputs.orchestration_host_rebuild == 'true'
      run: |
        echo "Setting up Airflow..."
        aws ssm send-command \
          --document-name "AWS-RunShellScript" \
          --instance-ids ${{ steps.terraform_vars.outputs.instance_id }} \
          --parameters "{\"commands\":[\"sudo chmod +x /data/airflow/host_config/*.sh\",\"sudo /data/airflow/host_config/sync_s3.sh\",\"sudo /data/airflow/host_config/create.sh\"]}" \
          --region ${{ steps.vars.outputs.aws_region }} \
          --output text
    - name: Rebuild Airflow Docker if Required
      id: rebuild_airflow
      if: steps.check_changes.outputs.airflow_rebuild == 'true' && steps.check_changes.outputs.orchestration_host_rebuild == 'false'
      run: |
        echo "Rebuilding Airflow Docker image..."
        aws ssm send-command \
          --document-name "AWS-RunShellScript" \
          --instance-ids ${{ steps.terraform_vars.outputs.instance_id }} \
          --parameters "{\"commands\":[\"sudo chmod +x /data/airflow/host_config/*.sh\",\"sudo /data/airflow/host_config/sync_s3.sh\",\"sudo /data/airflow/host_config/rebuild.sh\"]}" \
          --region ${{ steps.vars.outputs.aws_region }} \
          --output text
    - name: Sync Airflow DAGs if Rebuild not Required
      id: sync_airflow_dags
      if: steps.check_changes.outputs.airflow_rebuild == 'false' && steps.check_changes.outputs.orchestration_host_rebuild == 'false'
      run: |
        echo "Syncing Airflow DAGs..."
        aws ssm send-command \
          --document-name "AWS-RunShellScript" \
          --instance-ids ${{ steps.terraform_vars.outputs.instance_id }} \
          --parameters "{\"commands\":[\"sudo chmod +x /data/airflow/host_config/*.sh\",\"sudo /data/airflow/host_config/sync_s3.sh\"]}" \
          --region ${{ steps.vars.outputs.aws_region }} \
          --output text
    - name: Create and Upload Kafka Scripts Environment File
      id: create_kafka_env_file
      run: |
        echo "Creating Kafka scripts environment file..."
        echo "AWS_REGION=${{ steps.vars.outputs.aws_region }}" >> kafka.env
        echo "ENVIRONMENT_NAME=${{ env.ENV_NAME }}" >> kafka.env
        echo "ORCHESTRATION_HOST_PRIVATE_IP=${{ steps.terraform_vars.outputs.orchestration_host_private_ip }}" >> kafka.env
        echo "Copying file to s3..."
        aws s3 cp kafka.env s3://${{ steps.vars.outputs.infra_config_bucket }}/orchestration/${{ env.ENV_NAME }}/kafka/topics/.env
    # Kafka: same three-way split as Airflow above. The sync step carries the
    # host-rebuild guard as well, so it no longer runs alongside "Setup Kafka"
    # (both already sync from S3 and update topics).
    - name: Setup Kafka
      if: steps.check_changes.outputs.orchestration_host_rebuild == 'true'
      run: |
        echo "Setting up Kafka..."
        aws ssm send-command \
          --document-name "AWS-RunShellScript" \
          --instance-ids ${{ steps.terraform_vars.outputs.instance_id }} \
          --parameters "{\"commands\":[\"sudo chmod +x /data/kafka/host_config/*.sh\",\"sudo /data/kafka/host_config/sync_s3.sh\",\"sudo /data/kafka/host_config/create.sh\",\"sudo /data/kafka/host_config/update_topics.sh\"]}" \
          --region ${{ steps.vars.outputs.aws_region }} \
          --output text
    - name: Rebuild Kafka Docker if Required
      if: steps.check_changes.outputs.kafka_rebuild == 'true' && steps.check_changes.outputs.orchestration_host_rebuild == 'false'
      run: |
        echo "Rebuilding Kafka Docker image..."
        aws ssm send-command \
          --document-name "AWS-RunShellScript" \
          --instance-ids ${{ steps.terraform_vars.outputs.instance_id }} \
          --parameters "{\"commands\":[\"sudo chmod +x /data/kafka/host_config/*.sh\",\"sudo /data/kafka/host_config/sync_s3.sh\",\"sudo /data/kafka/host_config/rebuild.sh\",\"sudo /data/kafka/host_config/update_topics.sh\"]}" \
          --region ${{ steps.vars.outputs.aws_region }} \
          --output text
    - name: Sync Kafka Topics if Rebuild not Required
      if: steps.check_changes.outputs.kafka_rebuild == 'false' && steps.check_changes.outputs.orchestration_host_rebuild == 'false'
      run: |
        echo "Syncing Kafka topics..."
        aws ssm send-command \
          --document-name "AWS-RunShellScript" \
          --instance-ids ${{ steps.terraform_vars.outputs.instance_id }} \
          --parameters "{\"commands\":[\"sudo chmod +x /data/kafka/host_config/*.sh\",\"sudo /data/kafka/host_config/sync_s3.sh\",\"sudo /data/kafka/host_config/update_topics.sh\"]}" \
          --region ${{ steps.vars.outputs.aws_region }} \
          --output text