diff --git a/databricks-s3-volume/Makefile b/databricks-s3-volume/Makefile
new file mode 100644
index 00000000..bf1f5140
--- /dev/null
+++ b/databricks-s3-volume/Makefile
@@ -0,0 +1,12 @@
+# Auto-generated by fogg. Do not edit
+# Make improvements in fogg, so that everyone can benefit.
+
+export TERRAFORM_VERSION := 1.3.0
+export TF_PLUGIN_CACHE_DIR := ../../..//.terraform.d/plugin-cache
+
+include ../../..//scripts/module.mk
+
+
+help: ## display help for this makefile
+	@fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
+.PHONY: help
diff --git a/databricks-s3-volume/README.md b/databricks-s3-volume/README.md
new file mode 100644
index 00000000..bf331384
--- /dev/null
+++ b/databricks-s3-volume/README.md
@@ -0,0 +1,73 @@
+
+## Requirements
+
+| Name | Version |
+|------|---------|
+| [terraform](#requirement\_terraform) | >= 1.3.0 |
+
+## Providers
+
+| Name | Version |
+|------|---------|
+| [aws](#provider\_aws) | n/a |
+| [databricks](#provider\_databricks) | n/a |
+
+## Modules
+
+| Name | Source | Version |
+|------|--------|---------|
+| [databricks\_bucket](#module\_databricks\_bucket) | github.com/chanzuckerberg/cztack//aws-s3-private-bucket | v0.71.0 |
+
+## Resources
+
+| Name | Type |
+|------|------|
+| [aws_iam_policy.dbx_unity_access_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource |
+| [aws_iam_role.dbx_unity_aws_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource |
+| [aws_iam_role_policy_attachment.dbx_unity_aws_access](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource |
+| [databricks_catalog.volume](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/catalog) | resource |
+| [databricks_external_location.volume](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/external_location) | resource |
+| [databricks_grant.catalog_r](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/grant) | resource |
+| [databricks_grant.catalog_rw](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/grant) | resource |
+| [databricks_grant.schema_r](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/grant) | resource |
+| [databricks_grant.schema_rw](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/grant) | resource |
+| [databricks_grant.volume_r](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/grant) | resource |
+| [databricks_grant.volume_rw](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/grant) | resource |
+| [databricks_schema.volume](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/schema) | resource |
+| [databricks_storage_credential.volume](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/storage_credential) | resource |
+| [databricks_volume.volume](https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/volume) | resource |
+| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |
+| [aws_iam_policy_document.databricks-s3](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
+| [aws_iam_policy_document.dbx_unity_aws_role_assume_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
+| [aws_iam_policy_document.volume_bucket_dbx_unity_access](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
+
+## Inputs
+
+| Name | Description | Type | Default | Required |
+|------|-------------|------|---------|:--------:|
+| [additional\_rw\_bucket\_grant\_arns](#input\_additional\_rw\_bucket\_grant\_arns) | (Optional) Additional AWS ARNs to grant read/write permissions to on the bucket (may be necessary for service principals, instance profiles, or users) | `list(string)` | `[]` | no |
+| [bucket\_object\_ownership](#input\_bucket\_object\_ownership) | Set default owner of all objects within bucket (e.g., bucket vs. object owner) | `string` | `null` | no |
+| [catalog\_name](#input\_catalog\_name) | Name of the Databricks catalog to add the volume to | `string` | n/a | yes |
+| [catalog\_owner](#input\_catalog\_owner) | User or group name of the catalog owner | `string` | n/a | yes |
+| [catalog\_r\_grant\_principals](#input\_catalog\_r\_grant\_principals) | (Optional) Databricks groups to grant read-only permissions to on the catalog | `list(string)` | `[]` | no |
+| [catalog\_rw\_grant\_principals](#input\_catalog\_rw\_grant\_principals) | (Optional) Databricks groups to grant read/write permissions to on the catalog | `list(string)` | `[]` | no |
+| [metastore\_id](#input\_metastore\_id) | ID of metastore to create catalog in | `string` | n/a | yes |
+| [schema\_r\_grant\_principals](#input\_schema\_r\_grant\_principals) | (Optional) Databricks groups to grant read-only permissions to on the schema | `list(string)` | `[]` | no |
+| [schema\_rw\_grant\_principals](#input\_schema\_rw\_grant\_principals) | (Optional) Databricks groups to grant read/write permissions to on the schema | `list(string)` | `[]` | no |
+| [tags](#input\_tags) | REQUIRED: Tags to include for this environment. | <pre>object({<br>    project : string<br>    env : string<br>    service : string<br>    owner : string<br>    managedBy : string<br>  })</pre> | n/a | yes |
+| [volume\_bucket](#input\_volume\_bucket) | (Optional) Name of an existing S3 bucket to use for the Databricks volume. NOTE: if provided, you will need to update the bucket policy wherever it is defined to allow Databricks access | `string` | `null` | no |
+| [volume\_comment](#input\_volume\_comment) | (Optional) Comment to add to the Databricks volume | `string` | `"Managed by Terraform - this is a default volume for the Databricks workspace"` | no |
+| [volume\_name](#input\_volume\_name) | Name of the Databricks volume to create | `string` | n/a | yes |
+| [volume\_r\_grant\_principals](#input\_volume\_r\_grant\_principals) | (Optional) Databricks groups to grant read-only permissions to on the volume | `list(string)` | `[]` | no |
+| [volume\_rw\_grant\_principals](#input\_volume\_rw\_grant\_principals) | (Optional) Databricks groups to grant read/write permissions to on the volume | `list(string)` | `[]` | no |
+| [volume\_schema\_properties](#input\_volume\_schema\_properties) | Properties of the Databricks schema to add the volume to | `map(string)` | `{}` | no |
+| [workspace\_name](#input\_workspace\_name) | Name of the Databricks workspace to add the volume to | `string` | n/a | yes |
+
+## Outputs
+
+| Name | Description |
+|------|-------------|
+| [dbx\_unity\_aws\_role\_arn](#output\_dbx\_unity\_aws\_role\_arn) | n/a |
+| [volume\_path](#output\_volume\_path) | n/a |
+| [volume\_specific\_bucket\_name](#output\_volume\_specific\_bucket\_name) | n/a |
+ 
\ No newline at end of file
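A hypothetical instantiation, for illustration only (the module name, source ref, and all values below are placeholders, not part of this change):

```hcl
module "databricks_default_volume" {
  source = "github.com/chanzuckerberg/cztack//databricks-s3-volume?ref=vX.Y.Z" # pin to a real release

  catalog_name   = "my-team" # normalized to catalog "my_team" and bucket "my-team"
  catalog_owner  = "data-platform-admins"
  metastore_id   = "11111111-2222-3333-4444-555555555555"
  volume_name    = "scratch"
  workspace_name = "my-workspace"

  # optional grants
  schema_rw_grant_principals = ["data-engineers"]
  volume_r_grant_principals  = ["analysts"]

  tags = {
    project   = "my-project"
    env       = "prod"
    service   = "databricks"
    owner     = "data-platform"
    managedBy = "terraform"
  }
}
```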
diff --git a/databricks-s3-volume/bucket.tf b/databricks-s3-volume/bucket.tf
new file mode 100644
index 00000000..547dcd3f
--- /dev/null
+++ b/databricks-s3-volume/bucket.tf
@@ -0,0 +1,113 @@
+locals {
+  standard_grant_principals = concat(["arn:aws:iam::${local.databricks_aws_account}:root"], var.additional_rw_bucket_grant_arns)
+}
+
+data "aws_iam_policy_document" "databricks-s3" {
+  count = var.volume_bucket != null ? 0 : 1
+
+  # standard UC access
+  statement {
+    sid    = "dbxBucketAccess"
+    effect = "Allow"
+    principals {
+      type        = "AWS"
+      identifiers = local.standard_grant_principals
+    }
+    actions = [
+      "s3:ListBucket",
+      "s3:GetBucketLocation",
+      "s3:GetLifecycleConfiguration",
+      "s3:PutLifecycleConfiguration",
+    ]
+    resources = [
+      "arn:aws:s3:::${local.bucket_name}",
+    ]
+  }
+  statement {
+    sid    = "dbxObjAccess"
+    effect = "Allow"
+    principals {
+      type        = "AWS"
+      identifiers = local.standard_grant_principals
+    }
+    actions = [
+      "s3:GetObject",
+      "s3:GetObjectVersion",
+      "s3:PutObject",
+      "s3:DeleteObject",
+    ]
+    resources = [
+      "arn:aws:s3:::${local.bucket_name}/*" # all objects in the bucket
+    ]
+  }
+  # storage credential access - builds the role ARN as a string to avoid a race condition between role and bucket creation
+  statement {
+    sid    = "dbxSCBucketAccess"
+    effect = "Allow"
+    principals {
+      type = "AWS"
+      identifiers = [
+        "arn:aws:iam::${data.aws_caller_identity.current.account_id}:root",
+      ]
+    }
+    condition {
+      test     = "ArnEquals"
+      variable = "aws:PrincipalArn"
+      values = [
+        "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role${local.path}${local.unity_aws_role_name}",
+      ]
+    }
+    actions = [
+      "s3:ListBucket",
+      "s3:GetBucketLocation",
+      "s3:GetLifecycleConfiguration",
+      "s3:PutLifecycleConfiguration",
+    ]
+    resources = [
+      "arn:aws:s3:::${local.bucket_name}",
+    ]
+  }
+  statement {
+    sid    = "dbxSCObjAccess"
+    effect = "Allow"
+    principals {
+      type = "AWS"
+      identifiers = [
+        "arn:aws:iam::${data.aws_caller_identity.current.account_id}:root",
+      ]
+    }
+    condition {
+      test     = "ArnEquals"
+      variable = "aws:PrincipalArn"
+      values = [
+        "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role${local.path}${local.unity_aws_role_name}",
+      ]
+    }
+    actions = [
+      "s3:GetObject",
+      "s3:GetObjectVersion",
+      "s3:PutObject",
+      "s3:DeleteObject",
+    ]
+    resources = [
+      "arn:aws:s3:::${local.bucket_name}/*"
+    ]
+  }
+
+}
+
+module "databricks_bucket" {
+  count = var.volume_bucket != null ? 0 : 1
+  depends_on = [
+    aws_iam_role.dbx_unity_aws_role
+  ]
+
+  source           = "github.com/chanzuckerberg/cztack//aws-s3-private-bucket?ref=v0.71.0"
+  bucket_name      = local.bucket_name
+  bucket_policy    = data.aws_iam_policy_document.databricks-s3[0].json
+  project          = var.tags["project"]
+  env              = var.tags["env"]
+  service          = var.tags["service"]
+  owner            = var.tags["owner"]
+  object_ownership = var.bucket_object_ownership
+}
\ No newline at end of file
diff --git a/databricks-s3-volume/fogg.tf b/databricks-s3-volume/fogg.tf
new file mode 100644
index 00000000..8c7ad42a
--- /dev/null
+++ b/databricks-s3-volume/fogg.tf
@@ -0,0 +1,2 @@
+# Auto-generated by fogg. Do not edit
+# Make improvements in fogg, so that everyone can benefit.
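A note on the `dbxSC*` statements in bucket.tf above: they pin access with an `ArnEquals` condition on a role ARN assembled from strings (account ID plus the known path and name) rather than referencing the role resource directly. A minimal sketch of the trade-off, using the module's own locals:

```hcl
# Referencing the resource attribute would force the bucket policy - and thus
# the bucket - to wait on role creation, ordering the two against each other:
#   values = [aws_iam_role.dbx_unity_aws_role.arn]

# Building the identical ARN by convention renders immediately, decoupling
# bucket creation from role creation:
locals {
  unity_role_arn_by_convention = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role${local.path}${local.unity_aws_role_name}"
}
```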
diff --git a/databricks-s3-volume/grants.tf b/databricks-s3-volume/grants.tf
new file mode 100644
index 00000000..4cc520db
--- /dev/null
+++ b/databricks-s3-volume/grants.tf
@@ -0,0 +1,70 @@
+# catalog
+resource "databricks_grant" "catalog_r" {
+  for_each   = toset(var.catalog_r_grant_principals)
+  catalog    = databricks_catalog.volume.name
+  principal  = each.value
+  privileges = ["USE_CATALOG", "USE_SCHEMA", "SELECT"]
+}
+
+resource "databricks_grant" "catalog_rw" {
+  for_each  = toset(var.catalog_rw_grant_principals)
+  catalog   = databricks_catalog.volume.name
+  principal = each.value
+  privileges = [
+    "APPLY_TAG",
+    "CREATE_CONNECTION",
+    "CREATE_SCHEMA",
+    "USE_CATALOG",
+    "CREATE_FUNCTION",
+    "CREATE_TABLE",
+    "EXECUTE",
+    "MODIFY",
+    "REFRESH",
+    "SELECT",
+    "READ_VOLUME",
+    "WRITE_VOLUME",
+    "USE_SCHEMA",
+  ]
+}
+
+# schema
+resource "databricks_grant" "schema_r" {
+  for_each   = toset(var.schema_r_grant_principals)
+  schema     = databricks_schema.volume.id
+  principal  = each.value
+  privileges = ["USE_SCHEMA", "SELECT", "READ_VOLUME"]
+}
+
+resource "databricks_grant" "schema_rw" {
+  for_each  = toset(var.schema_rw_grant_principals)
+  schema    = databricks_schema.volume.id
+  principal = each.value
+  privileges = [
+    "APPLY_TAG",
+    "CREATE_FUNCTION",
+    "CREATE_TABLE",
+    "CREATE_VOLUME",
+    "USE_SCHEMA",
+    "EXECUTE",
+    "MODIFY",
+    "REFRESH",
+    "SELECT",
+    "READ_VOLUME",
+    "WRITE_VOLUME"
+  ]
+}
+
+# volume
+resource "databricks_grant" "volume_r" {
+  for_each   = toset(var.volume_r_grant_principals)
+  volume     = databricks_volume.volume.id
+  principal  = each.value
+  privileges = ["READ_VOLUME"]
+}
+
+resource "databricks_grant" "volume_rw" {
+  for_each   = toset(var.volume_rw_grant_principals)
+  volume     = databricks_volume.volume.id
+  principal  = each.value
+  privileges = ["READ_VOLUME", "WRITE_VOLUME"]
+}
\ No newline at end of file
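Each grant resource above iterates with `for_each = toset(...)`, so every principal in a list receives its own `databricks_grant` instance at that securable level. Illustrative variable values (group names are hypothetical):

```hcl
# terraform.tfvars
catalog_r_grant_principals  = ["analysts"]                # USE_CATALOG, USE_SCHEMA, SELECT
catalog_rw_grant_principals = ["data-engineers"]          # full catalog read/write privilege set
schema_rw_grant_principals  = ["data-engineers"]
volume_r_grant_principals   = ["analysts", "ml-readers"]  # READ_VOLUME only
```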
"dbxSCObjAccess" + effect = "Allow" + actions = [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + ] + resources = [ + "arn:aws:s3:::${local.bucket_name}/*", + ] + } + statement { + sid = "databricksAssumeRole" + effect = "Allow" + actions = [ + "sts:AssumeRole" + ] + resources = [ + "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role${local.path}${local.unity_aws_role_name}" + ] + } +} + +resource "aws_iam_policy" "dbx_unity_access_policy" { + policy = data.aws_iam_policy_document.volume_bucket_dbx_unity_access.json +} + +resource "aws_iam_role_policy_attachment" "dbx_unity_aws_access" { + policy_arn = aws_iam_policy.dbx_unity_access_policy.arn + role = aws_iam_role.dbx_unity_aws_role.name +} \ No newline at end of file diff --git a/databricks-s3-volume/main.tf b/databricks-s3-volume/main.tf new file mode 100644 index 00000000..58116c24 --- /dev/null +++ b/databricks-s3-volume/main.tf @@ -0,0 +1,71 @@ +# Volume bucket (UC supported) + +// https://docs.databricks.com/administration-guide/multiworkspace/iam-role.html#language-Your%C2%A0VPC,%C2%A0custom +locals { + unity_aws_role_name = "${var.catalog_name}-${var.volume_name}-unity" + catalog_name = replace(var.catalog_name, "-", "_") # SQL don't work with hyphens + schema_name = replace(var.volume_name, "-", "_") # SQL don't work with hyphens + + path = "/databricks/" + databricks_aws_account = "414351767826" # Databricks' own AWS account, not CZI's. See https://docs.databricks.com/en/administration-guide/account-settings-e2/credentials.html#step-1-create-a-cross-account-iam-role + bucket_name = var.volume_bucket != null ? var.volume_bucket : replace(var.catalog_name, "_", "-") # buckets don't work with underscores +} + +### Databricks storage credential - allows workspace to access an external location. 
diff --git a/databricks-s3-volume/main.tf b/databricks-s3-volume/main.tf
new file mode 100644
index 00000000..58116c24
--- /dev/null
+++ b/databricks-s3-volume/main.tf
@@ -0,0 +1,71 @@
+# Volume bucket (UC supported)
+
+// https://docs.databricks.com/administration-guide/multiworkspace/iam-role.html#language-Your%C2%A0VPC,%C2%A0custom
+locals {
+  unity_aws_role_name = "${var.catalog_name}-${var.volume_name}-unity"
+  catalog_name        = replace(var.catalog_name, "-", "_") # SQL identifiers don't allow hyphens
+  schema_name         = replace(var.volume_name, "-", "_")  # SQL identifiers don't allow hyphens
+
+  path                   = "/databricks/"
+  databricks_aws_account = "414351767826" # Databricks' own AWS account, not CZI's. See https://docs.databricks.com/en/administration-guide/account-settings-e2/credentials.html#step-1-create-a-cross-account-iam-role
+  bucket_name            = var.volume_bucket != null ? var.volume_bucket : replace(var.catalog_name, "_", "-") # S3 bucket names don't allow underscores
+}
+
+### Databricks storage credential - allows workspace to access an external location.
+### NOTE: names need to be unique across an account, not just a workspace
+
+resource "databricks_storage_credential" "volume" {
+  depends_on = [
+    resource.aws_iam_role.dbx_unity_aws_role,
+    resource.aws_iam_role_policy_attachment.dbx_unity_aws_access,
+    module.databricks_bucket
+  ]
+
+  name = local.catalog_name
+  aws_iam_role {
+    role_arn = aws_iam_role.dbx_unity_aws_role.arn
+  }
+  comment = "Managed by Terraform - access for ${var.volume_name}"
+}
+
+resource "databricks_external_location" "volume" {
+  name            = local.catalog_name
+  url             = "s3://${local.bucket_name}"
+  credential_name = databricks_storage_credential.volume.name
+  comment         = "Managed by Terraform - access for ${var.volume_name}"
+}
+
+# New catalog, schema, and volume
+
+resource "databricks_catalog" "volume" {
+  depends_on   = [databricks_external_location.volume]
+  name         = local.catalog_name
+  metastore_id = var.metastore_id
+  owner        = var.catalog_owner
+  storage_root = "s3://${local.bucket_name}"
+  comment      = "this catalog is managed by terraform - default volume catalog for Databricks workspace ${var.workspace_name}"
+  properties = {
+    purpose = "this catalog is managed by terraform - default volume catalog for Databricks workspace ${var.workspace_name}"
+  }
+  isolation_mode = "ISOLATED"
+}
+
+resource "databricks_schema" "volume" {
+  catalog_name = databricks_catalog.volume.name
+  name         = local.schema_name
+  comment      = "This schema is managed by Terraform - ${var.volume_comment}"
+  owner        = var.catalog_owner
+  properties   = var.volume_schema_properties
+}
+
+resource "databricks_volume" "volume" {
+  depends_on       = [databricks_external_location.volume]
+  name             = "${local.catalog_name}_${local.schema_name}"
+  catalog_name     = local.catalog_name
+  schema_name      = databricks_schema.volume.name
+  volume_type     = "EXTERNAL"
+  storage_location = "s3://${local.bucket_name}/${local.schema_name}"
+  owner            = var.catalog_owner
+  comment          = "This volume is managed by Terraform - ${var.volume_comment}"
+}
\ No newline at end of file
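The hyphen/underscore normalization in main.tf's `locals` is easiest to see with a concrete (hypothetical) input:

```hcl
# Given catalog_name = "my-team" and volume_name = "raw-data":
locals {
  example_catalog = replace("my-team", "-", "_")  # => "my_team"  (valid SQL identifier)
  example_schema  = replace("raw-data", "-", "_") # => "raw_data" (valid SQL identifier)
  example_bucket  = replace("my-team", "_", "-")  # => "my-team"  (valid S3 bucket name)
}
```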
diff --git a/databricks-s3-volume/outputs.tf b/databricks-s3-volume/outputs.tf
new file mode 100644
index 00000000..01e2e90f
--- /dev/null
+++ b/databricks-s3-volume/outputs.tf
@@ -0,0 +1,11 @@
+output "dbx_unity_aws_role_arn" {
+  value = aws_iam_role.dbx_unity_aws_role.arn
+}
+
+output "volume_specific_bucket_name" {
+  value = length(module.databricks_bucket) > 0 ? module.databricks_bucket[0].name : null
+}
+
+output "volume_path" {
+  value = "${databricks_catalog.volume.name}.${databricks_schema.volume.name}.${databricks_volume.volume.name}"
+}
\ No newline at end of file
diff --git a/databricks-s3-volume/variables.tf b/databricks-s3-volume/variables.tf
new file mode 100644
index 00000000..e442bc03
--- /dev/null
+++ b/databricks-s3-volume/variables.tf
@@ -0,0 +1,109 @@
+
+variable "workspace_name" {
+  description = "Name of the Databricks workspace to add the volume to"
+  type        = string
+}
+
+variable "catalog_name" {
+  description = "Name of the Databricks catalog to add the volume to"
+  type        = string
+}
+
+variable "catalog_owner" {
+  description = "User or group name of the catalog owner"
+  type        = string
+}
+
+variable "metastore_id" {
+  description = "ID of metastore to create catalog in"
+  type        = string
+}
+
+variable "volume_name" {
+  description = "Name of the Databricks volume to create"
+  type        = string
+}
+
+variable "volume_bucket" {
+  description = "(Optional) Name of an existing S3 bucket to use for the Databricks volume. NOTE: if provided, you will need to update the bucket policy wherever it is defined to allow Databricks access"
+  type        = string
+  default     = null
+}
+
+variable "volume_comment" {
+  description = "(Optional) Comment to add to the Databricks volume"
+  type        = string
+  default     = "Managed by Terraform - this is a default volume for the Databricks workspace"
+}
+
+variable "volume_schema_properties" {
+  description = "Properties of the Databricks schema to add the volume to"
+  type        = map(string)
+  default     = {}
+}
+
+# check if argument is null or is in list (2nd parameter of contains() cannot be null)
+variable "bucket_object_ownership" {
+  type        = string
+  default     = null
+  description = "Set default owner of all objects within bucket (e.g., bucket vs. object owner)"
+
+  validation {
+    condition     = var.bucket_object_ownership == null ? true : contains(["BucketOwnerEnforced", "BucketOwnerPreferred", "ObjectWriter"], var.bucket_object_ownership)
+    error_message = "Valid values for var.bucket_object_ownership are ('BucketOwnerEnforced', 'BucketOwnerPreferred', 'ObjectWriter')."
+  }
+}
+
+variable "catalog_r_grant_principals" {
+  description = "(Optional) Databricks groups to grant read-only permissions to on the catalog"
+  type        = list(string)
+  default     = []
+}
+
+variable "catalog_rw_grant_principals" {
+  description = "(Optional) Databricks groups to grant read/write permissions to on the catalog"
+  type        = list(string)
+  default     = []
+}
+
+variable "schema_r_grant_principals" {
+  description = "(Optional) Databricks groups to grant read-only permissions to on the schema"
+  type        = list(string)
+  default     = []
+}
+
+variable "schema_rw_grant_principals" {
+  description = "(Optional) Databricks groups to grant read/write permissions to on the schema"
+  type        = list(string)
+  default     = []
+}
+
+variable "volume_r_grant_principals" {
+  description = "(Optional) Databricks groups to grant read-only permissions to on the volume"
+  type        = list(string)
+  default     = []
+}
+
+variable "volume_rw_grant_principals" {
+  description = "(Optional) Databricks groups to grant read/write permissions to on the volume"
+  type        = list(string)
+  default     = []
+}
+
+variable "additional_rw_bucket_grant_arns" {
+  description = "(Optional) Additional AWS ARNs to grant read/write permissions to on the bucket (may be necessary for service principals, instance profiles, or users)"
+  type        = list(string)
+  default     = []
+}
+
+variable "tags" {
+  description = "REQUIRED: Tags to include for this environment."
+  type = object({
+    project : string
+    env : string
+    service : string
+    owner : string
+    managedBy : string
+  })
+}
\ No newline at end of file
diff --git a/databricks-s3-volume/versions.tf b/databricks-s3-volume/versions.tf
new file mode 100644
index 00000000..159e8002
--- /dev/null
+++ b/databricks-s3-volume/versions.tf
@@ -0,0 +1,11 @@
+terraform {
+  required_providers {
+    aws = {
+      source = "hashicorp/aws"
+    }
+    databricks = {
+      source = "databricks/databricks"
+    }
+  }
+  required_version = ">= 1.3.0"
+}
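Downstream, the module's outputs can be wired up like so (the module name matches the hypothetical instantiation shown earlier):

```hcl
# Fully qualified volume name, "<catalog>.<schema>.<volume>"
output "default_volume_path" {
  value = module.databricks_default_volume.volume_path
}

# Role ARN, e.g. for attaching additional IAM policies outside this module
output "unity_role_arn" {
  value = module.databricks_default_volume.dbx_unity_aws_role_arn
}
```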