This module manages a Google Cloud Dataproc cluster resource, including IAM.

TODO:

- Add support for Cloud Dataproc autoscaling policies.
module "dataproc-cluster" {
source = "./fabric/modules/dataproc"
project_id = var.project_id
name = "my-cluster"
region = var.region
}
# tftest modules=1 resources=1
To set the cluster configuration, use the `dataproc_config.cluster_config` variable. If you don't want to use a dedicated service account, remember to grant `roles/dataproc.worker` to the Compute Engine default service account.
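If you do keep the Compute Engine default service account, a minimal sketch of that grant using plain `google` provider resources rather than this repository's modules (the resource names here are illustrative):

```hcl
# Look up the project to derive the default service account email.
data "google_project" "project" {
  project_id = var.project_id
}

# Grant the Dataproc worker role to the Compute Engine default service account.
resource "google_project_iam_member" "dataproc_worker" {
  project = var.project_id
  role    = "roles/dataproc.worker"
  member  = "serviceAccount:${data.google_project.project.number}-compute@developer.gserviceaccount.com"
}
```

The example below creates and uses a dedicated service account instead: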
module "dataproc-service-account" {
source = "./fabric/modules/iam-service-account"
project_id = var.project_id
name = "dataproc-worker"
iam_project_roles = {
(var.project_id) = ["roles/dataproc.worker"]
}
}
module "firewall" {
source = "./fabric/modules/net-vpc-firewall"
project_id = var.project_id
network = var.vpc.name
ingress_rules = {
allow-ingress-dataproc = {
description = "Allow all traffic between Dataproc nodes."
targets = ["dataproc"]
sources = ["dataproc"]
}
}
}
module "processing-dp-cluster" {
source = "./fabric/modules/dataproc"
project_id = var.project_id
name = "my-cluster"
region = var.region
dataproc_config = {
cluster_config = {
gce_cluster_config = {
internal_ip_only = true
service_account = module.dataproc-service-account.email
service_account_scopes = ["cloud-platform"]
subnetwork = var.subnet.self_link
tags = ["dataproc"]
zone = "${var.region}-b"
}
}
}
depends_on = [
module.dataproc-service-account, # ensure all grants are done before creating the cluster
]
}
# tftest modules=3 resources=7 e2e
To use a Customer Managed Encryption Key (CMEK), set the `dataproc_config.encryption_config` variable. The Compute Engine service agent and the Cloud Storage service agent need the CryptoKey Encrypter/Decrypter role on the configured KMS key (see the Dataproc CMEK documentation):
module "project" {
source = "./fabric/modules/project"
name = "dataproc"
billing_account = var.billing_account_id
prefix = var.prefix
parent = var.folder_id
services = [
"cloudkms.googleapis.com",
"dataproc.googleapis.com",
"servicenetworking.googleapis.com",
]
}
module "kms" {
source = "./fabric/modules/kms"
project_id = module.project.project_id
keyring = {
location = var.region
name = "keyring"
}
keys = {
"key-regional" = {
}
}
iam = {
"roles/cloudkms.cryptoKeyEncrypterDecrypter" = [
module.project.service_agents.dataproc.iam_email
]
}
}
module "vpc" {
source = "./fabric/modules/net-vpc"
project_id = module.project.project_id
name = "my-network"
subnets = [
{
ip_cidr_range = "10.0.0.0/24"
name = "production"
region = var.region
},
]
psa_configs = [{
ranges = { myrange = "10.0.1.0/24" }
}]
}
module "dataproc-service-account" {
source = "./fabric/modules/iam-service-account"
project_id = module.project.project_id
name = "dataproc-worker"
iam_project_roles = {
(module.project.project_id) = ["roles/dataproc.worker", "roles/cloudkms.cryptoKeyEncrypterDecrypter"]
}
}
module "firewall" {
source = "./fabric/modules/net-vpc-firewall"
project_id = module.project.project_id
network = module.vpc.name
ingress_rules = {
allow-ingress-dataproc = {
description = "Allow all traffic between Dataproc nodes."
targets = ["dataproc"]
sources = ["dataproc"]
}
}
}
module "processing-dp-cluster" {
source = "./fabric/modules/dataproc"
project_id = module.project.project_id
name = "my-cluster"
region = var.region
dataproc_config = {
cluster_config = {
gce_cluster_config = {
internal_ip_only = true
service_account = module.dataproc-service-account.email
service_account_scopes = ["cloud-platform"]
subnetwork = module.vpc.subnet_self_links["${var.region}/production"]
tags = ["dataproc"]
zone = "${var.region}-b"
}
}
encryption_config = {
kms_key_name = module.kms.keys.key-regional.id
}
}
}
# tftest modules=6 resources=28 e2e
To configure a Dataproc cluster on GKE, use the `dataproc_config.virtual_cluster_config` variable. This example shows the usage of a dedicated service account:
```hcl
locals {
  dataproc_namespace = "foobar"
}

module "dataproc-service-account" {
  source     = "./fabric/modules/iam-service-account"
  project_id = var.project_id
  name       = "dataproc-worker"
  iam = {
    "roles/iam.workloadIdentityUser" = [
      "serviceAccount:${var.project_id}.svc.id.goog[${local.dataproc_namespace}/agent]",
      "serviceAccount:${var.project_id}.svc.id.goog[${local.dataproc_namespace}/spark-driver]",
      "serviceAccount:${var.project_id}.svc.id.goog[${local.dataproc_namespace}/spark-executor]"
    ]
  }
  iam_project_roles = {
    (var.project_id) = ["roles/dataproc.worker"]
  }
  depends_on = [
    module.gke-cluster-standard, # granting workloadIdentityUser requires cluster/pool to be created first
  ]
}

module "processing-dp-cluster" {
  source     = "./fabric/modules/dataproc"
  project_id = var.project_id
  name       = "my-dataproc-cluster"
  region     = var.region
  dataproc_config = {
    virtual_cluster_config = {
      kubernetes_cluster_config = {
        kubernetes_namespace = local.dataproc_namespace
        kubernetes_software_config = {
          component_version = {
            "SPARK" : "3.1-dataproc-14"
          }
          properties = {
            "dataproc:dataproc.gke.agent.google-service-account"          = module.dataproc-service-account.email
            "dataproc:dataproc.gke.spark.driver.google-service-account"   = module.dataproc-service-account.email
            "dataproc:dataproc.gke.spark.executor.google-service-account" = module.dataproc-service-account.email
          }
        }
        gke_cluster_config = {
          gke_cluster_target = module.gke-cluster-standard.id
          node_pool_target = {
            node_pool = "node-pool-name"
            roles     = ["DEFAULT"]
          }
        }
      }
    }
  }
}
# tftest modules=4 resources=6 fixtures=fixtures/gke-cluster-standard.tf e2e
```
IAM is managed via several variables that implement different features and levels of control:

- `iam` and `iam_by_principals` configure authoritative bindings that manage individual roles exclusively, and are internally merged
- `iam_bindings` configures authoritative bindings with optional support for conditions, and is not internally merged with the previous two variables (a conditional binding is sketched after the examples below)
- `iam_bindings_additive` configures additive bindings via individual role/member pairs, with optional support for conditions

The authoritative and additive approaches can be used together, provided different roles are managed by each. Some care must also be taken with the `iam_by_principals` variable to ensure that variable keys are static values, so that Terraform is able to compute the dependency graph.

Refer to the project module for examples of the IAM interface.
module "processing-dp-cluster" {
source = "./fabric/modules/dataproc"
project_id = var.project_id
name = "my-cluster"
region = var.region
iam_by_principals = {
"group:[email protected]" = [
"roles/dataproc.viewer"
]
}
iam = {
"roles/dataproc.viewer" = [
"serviceAccount:service-account@PROJECT_ID.iam.gserviceaccount.com"
]
}
}
# tftest modules=1 resources=2
module "processing-dp-cluster" {
source = "./fabric/modules/dataproc"
project_id = var.project_id
name = "my-cluster"
region = var.region
iam_bindings_additive = {
am1-viewer = {
member = "user:[email protected]"
role = "roles/dataproc.viewer"
}
}
}
# tftest modules=1 resources=2
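`iam_bindings` additionally supports attaching a condition to an authoritative binding. A minimal sketch, assuming the condition object takes `title` and `expression` fields as in other modules of this repository (the role, members, and condition values are illustrative):

```hcl
module "processing-dp-cluster" {
  source     = "./fabric/modules/dataproc"
  project_id = var.project_id
  name       = "my-cluster"
  region     = var.region
  iam_bindings = {
    viewer-until-2026 = {
      role    = "roles/dataproc.viewer"
      members = ["group:[email protected]"]
      # Hypothetical condition: expire the binding at the start of 2026.
      condition = {
        title      = "expires-2026"
        expression = "request.time < timestamp(\"2026-01-01T00:00:00Z\")"
      }
    }
  }
}
```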
## Variables

| name | description | type | required | default |
|---|---|:---:|:---:|:---:|
| name | Cluster name. | `string` | ✓ | |
| project_id | Project ID. | `string` | ✓ | |
| region | Dataproc region. | `string` | ✓ | |
| dataproc_config | Dataproc cluster config. | `object({…})` | | `{}` |
| iam | IAM bindings in {ROLE => [MEMBERS]} format. | `map(list(string))` | | `{}` |
| iam_bindings | Authoritative IAM bindings in {KEY => {role = ROLE, members = [], condition = {}}}. Keys are arbitrary. | `map(object({…}))` | | `{}` |
| iam_bindings_additive | Individual additive IAM bindings. Keys are arbitrary. | `map(object({…}))` | | `{}` |
| iam_by_principals | Authoritative IAM binding in {PRINCIPAL => [ROLES]} format. Principals need to be statically defined to avoid cycle errors. Merged internally with the `iam` variable. | `map(list(string))` | | `{}` |
| labels | Resource labels to annotate related underlying resources, such as Compute Engine VMs. | `map(string)` | | `{}` |
## Outputs

| name | description | sensitive |
|---|---|:---:|
| id | Fully qualified cluster id. | |
| name | The name of the cluster. | |
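The outputs can be used to wire the cluster into other resources. As a sketch, submitting a PySpark job with the plain `google` provider resource `google_dataproc_job`, referencing the cluster created in the examples above (the bucket and job file paths are illustrative):

```hcl
# Submit a PySpark job to the cluster created by the module.
resource "google_dataproc_job" "pyspark" {
  project = var.project_id
  region  = var.region
  placement {
    # Consume the module's `name` output.
    cluster_name = module.processing-dp-cluster.name
  }
  pyspark_config {
    # Illustrative path to the job file.
    main_python_file_uri = "gs://my-bucket/jobs/job.py"
  }
}
```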