Add in dbt project
ilias1111 committed Nov 11, 2024
1 parent 8e4fc8a commit d496e5b
Showing 2 changed files with 37 additions and 56 deletions.
72 changes: 22 additions & 50 deletions .github/workflows/spark_deployment/spark-defaults.conf
@@ -4,67 +4,39 @@ spark.sql.catalog.glue org.apache.iceberg.spark.SparkCatalog
spark.sql.catalog.glue.catalog-impl org.apache.iceberg.aws.glue.GlueCatalog
spark.sql.catalog.glue.warehouse s3a://dbt-spark-iceberg/github-integration-testing
spark.sql.catalog.glue.io-impl org.apache.iceberg.aws.s3.S3FileIO
spark.sql.catalog.glue.lock-impl org.apache.iceberg.aws.glue.DynamoLockManager
spark.sql.catalog.glue.lock.table myGlueLockTable

# Default Schema Configuration
spark.sql.catalog.glue.default-namespace default_snowplow_manifest

# Session Extensions
# Critical Iceberg Settings
spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
spark.sql.iceberg.handle-timestamp-without-timezone true
spark.wds.iceberg.format-version 2

# General Spark Configuration
spark.master local[3]
spark.driver.memory 10g
spark.executor.memory 3g
spark.memory.fraction 0.85
spark.sql.adaptive.enabled true
# Enhanced Iceberg Write Settings
spark.sql.iceberg.write.distribution-mode range
spark.sql.iceberg.write.accept-any-schema true
spark.sql.iceberg.write.merge.mode copy-on-write
spark.sql.iceberg.write.format.default parquet
spark.sql.iceberg.write-partitioned-fanout.enabled true

# Iceberg Specific Configuration
spark.wds.iceberg.format-version 2
spark.sql.iceberg.handle-timestamp-without-timezone true
spark.sql.catalog.spark_catalog.type hive
spark.sql.catalog.spark_catalog org.apache.iceberg.spark.SparkSessionCatalog
# Performance Settings
spark.sql.adaptive.enabled true
spark.sql.adaptive.coalescePartitions.enabled true
spark.sql.shuffle.partitions 10
spark.sql.parquet.compression.codec zstd

# AWS Configuration
# AWS S3 Settings
spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
spark.hadoop.fs.s3a.aws.credentials.provider org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider
spark.hadoop.fs.s3a.path.style.access true
spark.hadoop.fs.s3a.connection.ssl.enabled true
spark.hadoop.fs.s3a.committer.name directory
spark.hadoop.fs.s3a.committer.staging.tmp.path /tmp/spark_staging
spark.hadoop.fs.s3a.buffer.dir /tmp/spark_local_buf
spark.hadoop.fs.s3a.bucket.all.committer.magic.enabled true

# Write and Format Configuration
spark.sql.parquet.compression.codec zstd
spark.sql.parquet.filterPushdown true
spark.sql.hive.metastorePartitionPruning true
spark.sql.streaming.schemaInference true

# Operation Settings
spark.sql.sources.partitionOverwriteMode dynamic
spark.sql.shuffle.partitions 6
spark.sql.broadcastTimeout 300
spark.network.timeout 300s

# Transaction and Consistency
spark.sql.sources.default iceberg
spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
spark.sql.hive.thriftServer.async true
spark.sql.hive.thriftServer.maxWorkerThreads 6
spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing
spark.sql.catalog.glue org.apache.iceberg.spark.SparkCatalog
spark.sql.catalog.glue.catalog-impl org.apache.iceberg.aws.glue.GlueCatalog
spark.sql.catalog.glue.warehouse s3a://dbt-spark-iceberg/github-integration-testing
spark.sql.catalog.glue.io-impl org.apache.iceberg.aws.s3.S3FileIO
spark.sql.defaultCatalog glue
spark.sql.catalog.glue.database dbt-spark-iceberg
spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
spark.sql.iceberg.handle-timestamp-without-timezone true
# Memory Settings
spark.driver.memory 10g
spark.executor.memory 3g
spark.memory.fraction 0.85

spark.sql.iceberg.write.distribution-mode hash
spark.sql.iceberg.write.accept-any-schema true
spark.sql.iceberg.write.merge-schema true
spark.sql.iceberg.write.upsert.enabled true
spark.sql.iceberg.write.append.enabled true
# Default Source Settings
spark.sql.sources.default iceberg
spark.sql.sources.partitionOverwriteMode dynamic
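
As an aside, the Glue catalog wiring above can be smoke-tested outside dbt with a short PySpark session. A minimal sketch, assuming pyspark plus the iceberg-spark-runtime and iceberg-aws-bundle jars are on the classpath and temporary AWS credentials are exported in the environment; the catalog name, warehouse path, and class names are copied from the config above, everything else is illustrative:

    from pyspark.sql import SparkSession

    # Mirror the key catalog lines from spark-defaults.conf; when that file
    # sits in $SPARK_CONF_DIR these builder calls are redundant.
    spark = (
        SparkSession.builder
        .appName("iceberg-glue-smoke-test")
        .config("spark.sql.extensions",
                "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
        .config("spark.sql.catalog.glue", "org.apache.iceberg.spark.SparkCatalog")
        .config("spark.sql.catalog.glue.catalog-impl",
                "org.apache.iceberg.aws.glue.GlueCatalog")
        .config("spark.sql.catalog.glue.warehouse",
                "s3a://dbt-spark-iceberg/github-integration-testing")
        .config("spark.sql.catalog.glue.io-impl",
                "org.apache.iceberg.aws.s3.S3FileIO")
        .config("spark.sql.defaultCatalog", "glue")
        .getOrCreate()
    )

    # If the wiring is correct this lists Glue databases rather than failing
    # with a ClassNotFoundException or a credentials error.
    spark.sql("SHOW NAMESPACES IN glue").show()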
21 changes: 15 additions & 6 deletions integration_tests/dbt_project.yml
@@ -27,6 +27,15 @@ quoting:
models:
  snowplow_unified_integration_tests:
    +materialized: table
    +tblproperties:
      write.format.default: parquet
      write.metadata.delete-after-commit.enabled: true
      write.distribution-mode: hash
      write.merge.mode: copy-on-write
      format-version: '2'
      write.target-file-size-bytes: '536870912'
      write.metadata.previous-versions-max: '100'
      write.update.mode: merge-on-read
    bind: false
    +schema: "snplw_unified_int_tests"
    source:
@@ -43,14 +52,14 @@ models:
  snowplow_unified:
    +file_format: iceberg
    +tblproperties:
      write.format.default: parquet
      write.metadata.delete-after-commit.enabled: true
      write.distribution-mode: hash
      write.merge.mode: copy-on-write
      format-version: '2'
      write.target-file-size-bytes: '536870912'
      write.metadata.previous-versions-max: '100'
      write.update.mode: merge-on-read
      write.delete.mode: merge-on-read
      write.format.default: parquet
      engine.hive.enabled: true
      write.distribution-mode: none
      write.wap.enabled: true
      write.object-storage.enabled: true

vars:
  snowplow_unified:
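
For context on what the +tblproperties block does: dbt's Spark adapter forwards these properties into the DDL it generates, so a model built with the settings above ends up as an Iceberg format-version 2 table with copy-on-write merges and merge-on-read updates. A rough sketch of the equivalent hand-written DDL, reusing the spark session from the earlier sketch; the table and column names are hypothetical and the actual target schema depends on the dbt profile:

    # Illustration only: approximately the CREATE TABLE that results from
    # file_format: iceberg plus the tblproperties above. Table and columns
    # are made up; the property values mirror dbt_project.yml.
    spark.sql("""
        CREATE TABLE glue.snplw_unified_int_tests.example_model (
            event_id STRING,
            collector_tstamp TIMESTAMP
        )
        USING iceberg
        TBLPROPERTIES (
            'write.format.default' = 'parquet',
            'write.metadata.delete-after-commit.enabled' = 'true',
            'write.distribution-mode' = 'hash',
            'write.merge.mode' = 'copy-on-write',
            'format-version' = '2',
            'write.target-file-size-bytes' = '536870912',
            'write.metadata.previous-versions-max' = '100',
            'write.update.mode' = 'merge-on-read'
        )
    """)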
