diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d1949ee28..55b5021f5 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,7 +16,7 @@ repos:
'--docstring-convention=numpy',
# 'PEP8 Rules' to ignore in tests. Ignore documentation rules for all tests
# and ignore long lines / whitespaces for e2e-tests where we define jsons in-code.
- '--per-file-ignores=**/tests/**.py:D docs/**.py:D e2e-tests/**.py:D,E501,W291,W293 docs/samples/deployments/spark/presidio_anonymize_blobs.py:F821,D103',
+ '--per-file-ignores=**/tests/**.py:D docs/**.py:D e2e-tests/**.py:D,E501,W291,W293 docs/samples/deployments/spark/presidio_anonymize_blobs.py:E501,F821,D103',
'--extend-ignore=
E203,
D100,
diff --git a/docs/samples/deployments/data-factory/adf-app-service-screenshot.png b/docs/samples/deployments/data-factory/adf-app-service-screenshot.png
new file mode 100644
index 000000000..e14f85cb5
Binary files /dev/null and b/docs/samples/deployments/data-factory/adf-app-service-screenshot.png differ
diff --git a/docs/samples/deployments/data-factory/adf-databricks-screenshot.png b/docs/samples/deployments/data-factory/adf-databricks-screenshot.png
new file mode 100644
index 000000000..f8787868d
Binary files /dev/null and b/docs/samples/deployments/data-factory/adf-databricks-screenshot.png differ
diff --git a/docs/samples/deployments/data-factory/azure-deploy-adf-app-service.json b/docs/samples/deployments/data-factory/azure-deploy-adf-app-service.json
new file mode 100644
index 000000000..43b2362bd
--- /dev/null
+++ b/docs/samples/deployments/data-factory/azure-deploy-adf-app-service.json
@@ -0,0 +1,567 @@
+{
+ "$schema": "http://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
+ "contentVersion": "1.0.0.0",
+ "parameters": {
+ "dataFactoryName": {
+ "type": "string",
+ "metadata": {
+ "description": "Data Factory name"
+ },
+ "defaultValue": "[concat('presidioadf', uniqueString(resourceGroup().id))]"
+ },
+ "AzureBlobStorage_accountName": {
+ "type": "string",
+ "metadata": {
+ "description": "Azure Storage Account Name"
+ },
+ "defaultValue": "[concat('presidio', uniqueString(resourceGroup().id))]"
+ },
+ "AzureBlobStorage_cotainerName": {
+ "type": "string",
+ "metadata": {
+ "description": "Azure Storage Container Name"
+ },
+ "defaultValue": "presidio"
+ },
+ "AzureKeyVault_name": {
+ "type": "string",
+ "metadata": {
+ "description": "Azure Key Vault Name"
+ },
+ "defaultValue": "[concat('presidioakv', uniqueString(resourceGroup().id))]"
+ },
+ "AzureKeyVault_storageAccessKeySecretName": {
+ "type": "string",
+ "metadata": {
+ "description": "Name of storage access key secret in Key Vault"
+ },
+ "defaultValue": "access-token"
+ },
+ "AzureKeyVault_storageSASSecretName": {
+ "type": "string",
+ "metadata": {
+ "description": "Name of storage SAS token secret in Key Vault"
+ },
+ "defaultValue": "sas-token"
+ },
+ "Analyzer_appName": {
+ "type": "string",
+ "metadata": {
+ "description": "Analyzer App Service Name"
+ },
+ "defaultValue": "[concat('presidio-analyzer', uniqueString(resourceGroup().id))]"
+ },
+ "Anonymizer_appName": {
+ "type": "string",
+ "metadata": {
+ "description": "Anonymizer App Service Name"
+ },
+ "defaultValue": "[concat('presidio-anonymizer', uniqueString(resourceGroup().id))]"
+ },
+ "accountSasProperties": {
+ "type": "object",
+ "defaultValue": {
+ "signedServices": "b",
+ "signedPermission": "rw",
+ "signedExpiry": "[dateTimeAdd(utcNow('u'), 'P3D')]",
+ "signedResourceTypes": "o"
+ }
+ }
+ },
+ "variables": {
+ "AzureBlobStorage_connectionString": "[concat('DefaultEndpointsProtocol=https;AccountName=', parameters('AzureBlobStorage_accountName'), ';EndpointSuffix=core.windows.net')]",
+ "factoryId": "[concat('Microsoft.DataFactory/factories/', parameters('dataFactoryName'))]",
+ "Dataset_url": "https://raw.githubusercontent.com/microsoft/presidio-research/master/tests/data/generated_large.txt",
+ "Analyzer_url": "[concat('https://', parameters('Analyzer_appName'), '.azurewebsites.net/analyze')]",
+ "Anonymizer_url": "[concat('https://', parameters('Anonymizer_appName'), '.azurewebsites.net/anonymize')]",
+ "AzureKeyVault_baseUrl": "[concat('https://', parameters('AzureKeyVault_name'), '.vault.azure.net/')]"
+ },
+ "resources": [
+ {
+ "type": "Microsoft.Resources/deployments",
+ "apiVersion": "2019-10-01",
+ "name": "presidio-app-services",
+ "properties": {
+ "mode": "Incremental",
+ "templateLink": {
+ "uri": "https://raw.githubusercontent.com/microsoft/presidio/main/docs/samples/deployments/app-service/presidio-services.json",
+ "contentVersion": "1.0.0.0"
+ },
+ "parameters": {
+ "analyzerWebAppName": {
+ "value": "[parameters('Analyzer_appName')]"
+ },
+ "anonymizerWebAppName": {
+ "value": "[parameters('Anonymizer_appName')]"
+ },
+ "location": {
+ "value": "[resourceGroup().location]"
+ }
+ }
+ }
+ },
+ {
+ "type": "Microsoft.Storage/storageAccounts",
+ "apiVersion": "2018-07-01",
+ "name": "[parameters('AzureBlobStorage_accountName')]",
+ "location": "[resourceGroup().location]",
+ "tags": {
+ "displayName": "[parameters('AzureBlobStorage_accountName')]"
+ },
+ "sku": {
+ "name": "Standard_LRS"
+ },
+ "kind": "StorageV2",
+ "resources": [
+ {
+ "type": "blobServices/containers",
+ "apiVersion": "2018-03-01-preview",
+ "name": "[concat('default/', parameters('AzureBlobStorage_cotainerName'))]",
+ "dependsOn": [
+ "[parameters('AzureBlobStorage_accountName')]"
+ ]
+ }
+ ]
+ },
+ {
+ "type": "Microsoft.DataFactory/factories",
+ "apiVersion": "2018-06-01",
+ "name": "[parameters('dataFactoryName')]",
+ "location": "[resourceGroup().location]",
+ "properties": {},
+ "identity": {
+ "type": "SystemAssigned"
+ },
+ "resources": [
+ {
+ "name": "[concat(parameters('dataFactoryName'), '/AzureBlobStorage')]",
+ "type": "Microsoft.DataFactory/factories/linkedServices",
+ "apiVersion": "2018-06-01",
+ "properties": {
+ "annotations": [],
+ "type": "AzureBlobStorage",
+ "typeProperties": {
+ "connectionString": "[variables('AzureBlobStorage_connectionString')]",
+ "accountKey": {
+ "type": "AzureKeyVaultSecret",
+ "store": {
+ "referenceName": "StorageSecretsKeyVault",
+ "type": "LinkedServiceReference"
+ },
+ "secretName": "[parameters('AzureKeyVault_storageAccessKeySecretName')]"
+ }
+ }
+ },
+ "dependsOn": [
+ "[parameters('dataFactoryName')]",
+ "[concat(variables('factoryId'), '/linkedServices/StorageSecretsKeyVault')]"
+ ]
+ },
+ {
+ "name": "[concat(parameters('dataFactoryName'), '/datasetGithub')]",
+ "type": "Microsoft.DataFactory/factories/linkedServices",
+ "apiVersion": "2018-06-01",
+ "properties": {
+ "annotations": [],
+ "type": "RestService",
+ "typeProperties": {
+ "url": "[variables('Dataset_url')]",
+ "enableServerCertificateValidation": true,
+ "authenticationType": "Anonymous"
+ }
+ },
+ "dependsOn": [
+ "[parameters('dataFactoryName')]"
+ ]
+ },
+ {
+ "name": "[concat(parameters('dataFactoryName'), '/StorageSecretsKeyVault')]",
+ "type": "Microsoft.DataFactory/factories/linkedServices",
+ "apiVersion": "2018-06-01",
+ "properties": {
+ "annotations": [],
+ "type": "AzureKeyVault",
+ "typeProperties": {
+ "baseUrl": "[variables('AzureKeyVault_baseUrl')]"
+ }
+ },
+ "dependsOn": [
+ "[parameters('dataFactoryName')]"
+ ]
+ },
+ {
+ "name": "[concat(parameters('dataFactoryName'), '/GithubDataSet')]",
+ "type": "Microsoft.DataFactory/factories/datasets",
+ "apiVersion": "2018-06-01",
+ "properties": {
+ "description": "JSON Dataset on GitHub",
+ "linkedServiceName": {
+ "referenceName": "datasetGithub",
+ "type": "LinkedServiceReference"
+ },
+ "annotations": [],
+ "type": "RestResource",
+ "typeProperties": {},
+ "schema": []
+ },
+ "dependsOn": [
+ "[parameters('dataFactoryName')]",
+ "[concat(variables('factoryId'), '/linkedServices/datasetGithub')]"
+ ]
+ },
+ {
+ "name": "[concat(parameters('dataFactoryName'), '/BlobStorageDataset')]",
+ "type": "Microsoft.DataFactory/factories/datasets",
+ "apiVersion": "2018-06-01",
+ "properties": {
+ "description": "JSON Dataset on Azure Blob Storage",
+ "linkedServiceName": {
+ "referenceName": "AzureBlobStorage",
+ "type": "LinkedServiceReference"
+ },
+ "annotations": [],
+ "type": "Json",
+ "typeProperties": {
+ "location": {
+ "type": "AzureBlobStorageLocation",
+ "fileName": "sentances.json",
+ "folderPath": "dataset",
+ "container": "[parameters('AzureBlobStorage_cotainerName')]"
+ }
+ },
+ "schema": {}
+ },
+ "dependsOn": [
+ "[parameters('dataFactoryName')]",
+ "[concat(variables('factoryId'), '/linkedServices/AzureBlobStorage')]"
+ ]
+ },
+ {
+ "name": "[concat(parameters('dataFactoryName'), '/Anonymize')]",
+ "type": "Microsoft.DataFactory/factories/pipelines",
+ "apiVersion": "2018-06-01",
+ "properties": {
+ "activities": [
+ {
+ "name": "GetDataSet",
+ "description": "Get the data set from GitHub to Azure Blob Storage",
+ "type": "Copy",
+ "dependsOn": [],
+ "policy": {
+ "timeout": "7.00:00:00",
+ "retry": 0,
+ "retryIntervalInSeconds": 30,
+ "secureOutput": false,
+ "secureInput": false
+ },
+ "userProperties": [],
+ "typeProperties": {
+ "source": {
+ "type": "RestSource",
+ "httpRequestTimeout": "00:01:40",
+ "requestInterval": "00.00:00:00.010",
+ "requestMethod": "GET"
+ },
+ "sink": {
+ "type": "JsonSink",
+ "storeSettings": {
+ "type": "AzureBlobStorageWriteSettings"
+ },
+ "formatSettings": {
+ "type": "JsonWriteSettings"
+ }
+ },
+ "enableStaging": false,
+ "translator": {
+ "type": "TabularTranslator",
+ "mappings": [
+ {
+ "source": {
+ "path": "$['full_text']"
+ },
+ "sink": {
+ "path": "full_text"
+ }
+ }
+ ]
+ }
+ },
+ "inputs": [
+ {
+ "referenceName": "GithubDataSet",
+ "type": "DatasetReference",
+ "parameters": {}
+ }
+ ],
+ "outputs": [
+ {
+ "referenceName": "BlobStorageDataset",
+ "type": "DatasetReference",
+ "parameters": {}
+ }
+ ]
+ },
+ {
+ "name": "LoadSet",
+ "description": "Load the data set to array for processing",
+ "type": "Lookup",
+ "dependsOn": [
+ {
+ "activity": "GetDataSet",
+ "dependencyConditions": [
+ "Succeeded"
+ ]
+ }
+ ],
+ "policy": {
+ "timeout": "7.00:00:00",
+ "retry": 0,
+ "retryIntervalInSeconds": 30,
+ "secureOutput": false,
+ "secureInput": false
+ },
+ "userProperties": [],
+ "typeProperties": {
+ "source": {
+ "type": "JsonSource",
+ "storeSettings": {
+ "type": "AzureBlobStorageReadSettings",
+ "recursive": false,
+ "enablePartitionDiscovery": false
+ },
+ "formatSettings": {
+ "type": "JsonReadSettings"
+ }
+ },
+ "dataset": {
+ "referenceName": "BlobStorageDataset",
+ "type": "DatasetReference",
+ "parameters": {}
+ },
+ "firstRowOnly": false
+ }
+ },
+ {
+ "name": "SaveSet",
+ "description": "Save each json document as a separate blob",
+ "type": "ForEach",
+ "dependsOn": [
+ {
+ "activity": "LoadSet",
+ "dependencyConditions": [
+ "Succeeded"
+ ]
+ },
+ {
+ "activity": "GetSASToken",
+ "dependencyConditions": [
+ "Succeeded"
+ ]
+ }
+ ],
+ "userProperties": [],
+ "typeProperties": {
+ "items": {
+ "value": "@activity('LoadSet').output.value",
+ "type": "Expression"
+ },
+ "activities": [
+ {
+ "name": "PresidioAnalyze",
+ "description": "Analyze text with presidio",
+ "type": "WebActivity",
+ "dependsOn": [],
+ "policy": {
+ "timeout": "7.00:00:00",
+ "retry": 0,
+ "retryIntervalInSeconds": 30,
+ "secureOutput": false,
+ "secureInput": false
+ },
+ "userProperties": [],
+ "typeProperties": {
+ "url": {
+ "value": "[variables('Analyzer_url')]",
+ "type": "Expression"
+ },
+ "method": "POST",
+ "headers": {
+ "Content-Type": "application/json"
+ },
+ "body": {
+ "value": "@concat('{\"text\":\"',replace(item().full_text, '\"', '\\\"'),'\",\"language\":\"en\"}')",
+ "type": "Expression"
+ }
+ }
+ },
+ {
+ "name": "PresidioAnonymize",
+ "type": "WebActivity",
+ "description": "Anonymize text with presidio",
+ "dependsOn": [
+ {
+ "activity": "PresidioAnalyze",
+ "dependencyConditions": [
+ "Succeeded"
+ ]
+ }
+ ],
+ "policy": {
+ "timeout": "7.00:00:00",
+ "retry": 0,
+ "retryIntervalInSeconds": 30,
+ "secureOutput": false,
+ "secureInput": false
+ },
+ "userProperties": [],
+ "typeProperties": {
+ "url": {
+ "value": "[variables('Anonymizer_url')]",
+ "type": "Expression"
+ },
+ "method": "POST",
+ "headers": {
+ "Content-Type": "application/json"
+ },
+ "body": {
+ "value": "@concat('{\"text\":\"',replace(item().full_text, '\"', '\\\"'), '\",\"anonymizers\": {\"DEFAULT\": { \"type\": \"replace\", \"new_value\": \"ANONYMIZED\" }},\"analyzer_results\": ', activity('PresidioAnalyze').output. Response,'}')",
+ "type": "Expression"
+ }
+ }
+ },
+ {
+ "name": "UploadBlob",
+ "type": "WebActivity",
+ "dependsOn": [
+ {
+ "activity": "PresidioAnonymize",
+ "dependencyConditions": [
+ "Succeeded"
+ ]
+ }
+ ],
+ "policy": {
+ "timeout": "7.00:00:00",
+ "retry": 0,
+ "retryIntervalInSeconds": 30,
+ "secureOutput": false,
+ "secureInput": false
+ },
+ "userProperties": [],
+ "typeProperties": {
+ "url": {
+ "value": "[concat('@concat(''https://', parameters('AzureBlobStorage_accountName'), '.blob.core.windows.net/', parameters('AzureBlobStorage_cotainerName'), '/output/'', ''file'', string(rand(0,1000)), ''.txt?'', activity(''GetSASToken'').output.value)')]",
+ "type": "Expression"
+ },
+ "method": "PUT",
+ "headers": {
+ "x-ms-blob-type": "BlockBlob"
+ },
+ "body": {
+ "value": "@activity('PresidioAnonymize').output.result",
+ "type": "Expression"
+ }
+ }
+ }
+ ]
+ }
+ },
+ {
+ "name": "GetSASToken",
+ "description": "Get storage account SAS Token from Azure Key Vault",
+ "type": "WebActivity",
+ "dependsOn": [],
+ "policy": {
+ "timeout": "7.00:00:00",
+ "retry": 0,
+ "retryIntervalInSeconds": 30,
+ "secureOutput": false,
+ "secureInput": false
+ },
+ "userProperties": [],
+ "typeProperties": {
+ "url": "[concat(variables('AzureKeyVault_baseUrl'), 'secrets/', parameters('AzureKeyVault_storageSASSecretName'), '?api-version=7.0')]",
+ "method": "GET",
+ "headers": {},
+ "authentication": {
+ "type": "MSI",
+ "resource": "https://vault.azure.net"
+ }
+ }
+ }
+ ],
+ "annotations": [],
+ "lastPublishTime": "2021-03-15T17:44:33Z"
+ },
+ "dependsOn": [
+ "[parameters('dataFactoryName')]",
+ "[concat(variables('factoryId'), '/datasets/GithubDataSet')]",
+ "[concat(variables('factoryId'), '/datasets/BlobStorageDataset')]"
+ ]
+ }
+ ]
+ },
+ {
+ "type": "Microsoft.KeyVault/vaults",
+ "apiVersion": "2018-02-14",
+ "name": "[parameters('AzureKeyVault_name')]",
+ "location": "[resourceGroup().location]",
+ "tags": {
+ "displayName": "[parameters('AzureKeyVault_name')]"
+ },
+ "dependsOn": [
+ "[concat('Microsoft.DataFactory/factories/', parameters('dataFactoryName'))]"
+ ],
+ "properties": {
+ "enabledForDeployment": true,
+ "enabledForTemplateDeployment": true,
+ "enabledForDiskEncryption": true,
+ "tenantId": "[subscription().tenantId]",
+ "accessPolicies": [
+ {
+ "tenantId": "[subscription().tenantId]",
+ "objectId": "[reference(concat('Microsoft.DataFactory/factories/', parameters('dataFactoryName')), '2018-06-01', 'Full').identity.principalId]",
+ "permissions": {
+ "keys": [
+ "get"
+ ],
+ "secrets": [
+ "list",
+ "get",
+ "set"
+ ]
+ }
+ }
+ ],
+ "sku": {
+ "name": "standard",
+ "family": "A"
+ }
+ }
+ },
+ {
+ "apiVersion": "2018-02-14",
+ "type": "Microsoft.KeyVault/vaults/secrets",
+ "dependsOn": [
+ "[concat('Microsoft.KeyVault/vaults/', parameters('AzureKeyVault_name'))]",
+ "[concat('Microsoft.Storage/storageAccounts/', parameters('AzureBlobStorage_accountName'))]"
+ ],
+ "name": "[concat(parameters('AzureKeyVault_name'), '/', parameters('AzureKeyVault_storageSASSecretName'))]",
+ "properties": {
+ "value": "[listAccountSas(parameters('AzureBlobStorage_accountName'), '2018-07-01', parameters('accountSasProperties')).accountSasToken]"
+ }
+ },
+ {
+ "apiVersion": "2018-02-14",
+ "type": "Microsoft.KeyVault/vaults/secrets",
+ "dependsOn": [
+ "[concat('Microsoft.KeyVault/vaults/', parameters('AzureKeyVault_name'))]",
+ "[concat('Microsoft.Storage/storageAccounts/', parameters('AzureBlobStorage_accountName'))]"
+ ],
+ "name": "[concat(parameters('AzureKeyVault_name'), '/', parameters('AzureKeyVault_storageAccessKeySecretName'))]",
+ "properties": {
+ "value": "[listKeys(parameters('AzureBlobStorage_accountName'), '2018-07-01').keys[0].value]"
+ }
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/samples/deployments/data-factory/azure-deploy-adf-databricks.json b/docs/samples/deployments/data-factory/azure-deploy-adf-databricks.json
new file mode 100644
index 000000000..b11ee931b
--- /dev/null
+++ b/docs/samples/deployments/data-factory/azure-deploy-adf-databricks.json
@@ -0,0 +1,568 @@
+{
+ "$schema": "http://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
+ "contentVersion": "1.0.0.0",
+ "parameters": {
+ "dataFactoryName": {
+ "type": "string",
+ "metadata": {
+ "description": "Data Factory name"
+ },
+ "defaultValue": "[concat('presidioadf', uniqueString(resourceGroup().id))]"
+ },
+ "Databricks_accessToken": {
+ "type": "securestring",
+ "metadata": {
+ "description": "Secure string for 'accessToken' of 'PresidioDatabricks'"
+ }
+ },
+ "Databricks_workSpaceUrl": {
+ "type": "string",
+ "metadata": {
+ "description": "The databricks workspace URL"
+ }
+ },
+ "Databricks_clusterId": {
+ "type": "string",
+ "metadata": {
+ "description": "ID of a presidio-ready databricks cluster"
+ }
+ },
+ "Databricks_notebookLocation": {
+ "type": "string",
+ "metadata": {
+ "description": "Path to notebook on databricks"
+ }
+ },
+ "AzureBlobStorage_accountName": {
+ "type": "string",
+ "metadata": {
+ "description": "Azure Storage Account Name"
+ },
+ "defaultValue": "[concat('presidio', uniqueString(resourceGroup().id))]"
+ },
+ "AzureBlobStorage_cotainerName": {
+ "type": "string",
+ "metadata": {
+ "description": "Azure Storage Container Name"
+ },
+ "defaultValue": "presidio"
+ },
+ "AzureKeyVault_name": {
+ "type": "string",
+ "metadata": {
+ "description": "Azure Key Vault Name"
+ },
+ "defaultValue": "[concat('presidioakv', uniqueString(resourceGroup().id))]"
+ },
+ "AzureKeyVault_storageAccessKeySecretName": {
+ "type": "string",
+ "metadata": {
+ "description": "Name of storage access key secret in Key Vault"
+ },
+ "defaultValue": "access-token"
+ },
+ "AzureKeyVault_storageSASSecretName": {
+ "type": "string",
+ "metadata": {
+ "description": "Name of storage SAS token secret in Key Vault"
+ },
+ "defaultValue": "sas-token"
+ },
+ "accountSasProperties": {
+ "type": "object",
+ "defaultValue": {
+ "signedServices": "b",
+ "signedPermission": "rw",
+ "signedExpiry": "[dateTimeAdd(utcNow('u'), 'P3D')]",
+ "signedResourceTypes": "o"
+ }
+ }
+ },
+ "variables": {
+ "AzureBlobStorage_connectionString": "[concat('DefaultEndpointsProtocol=https;AccountName=', parameters('AzureBlobStorage_accountName'), ';EndpointSuffix=core.windows.net')]",
+ "factoryId": "[concat('Microsoft.DataFactory/factories/', parameters('dataFactoryName'))]",
+ "Dataset_url": "https://raw.githubusercontent.com/microsoft/presidio-research/master/tests/data/generated_large.txt",
+ "AzureKeyVault_baseUrl": "[concat('https://', parameters('AzureKeyVault_name'), '.vault.azure.net/')]"
+ },
+ "resources": [
+ {
+ "type": "Microsoft.Storage/storageAccounts",
+ "apiVersion": "2018-07-01",
+ "name": "[parameters('AzureBlobStorage_accountName')]",
+ "location": "[resourceGroup().location]",
+ "tags": {
+ "displayName": "[parameters('AzureBlobStorage_accountName')]"
+ },
+ "sku": {
+ "name": "Standard_LRS"
+ },
+ "kind": "StorageV2",
+ "resources": [
+ {
+ "type": "blobServices/containers",
+ "apiVersion": "2018-03-01-preview",
+ "name": "[concat('default/', parameters('AzureBlobStorage_cotainerName'))]",
+ "dependsOn": [
+ "[parameters('AzureBlobStorage_accountName')]"
+ ]
+ }
+ ]
+ },
+ {
+ "type": "Microsoft.DataFactory/factories",
+ "apiVersion": "2018-06-01",
+ "name": "[parameters('dataFactoryName')]",
+ "location": "[resourceGroup().location]",
+ "properties": {},
+ "identity": {
+ "type": "SystemAssigned"
+ },
+ "resources": [
+ {
+ "name": "[concat(parameters('dataFactoryName'), '/PresidioDatabricks')]",
+ "type": "Microsoft.DataFactory/factories/linkedServices",
+ "apiVersion": "2018-06-01",
+ "dependsOn": [
+ "[parameters('dataFactoryName')]"
+ ],
+ "properties": {
+ "annotations": [],
+ "type": "AzureDatabricks",
+ "typeProperties": {
+ "domain": "[concat('https://', parameters('Databricks_workSpaceUrl'))]",
+ "accessToken": {
+ "type": "SecureString",
+ "value": "[parameters('Databricks_accessToken')]"
+ },
+ "existingClusterId": "[parameters('Databricks_clusterId')]"
+ }
+ }
+ },
+ {
+ "name": "[concat(parameters('dataFactoryName'), '/AzureBlobStorage')]",
+ "type": "Microsoft.DataFactory/factories/linkedServices",
+ "apiVersion": "2018-06-01",
+ "properties": {
+ "annotations": [],
+ "type": "AzureBlobStorage",
+ "typeProperties": {
+ "connectionString": "[variables('AzureBlobStorage_connectionString')]",
+ "accountKey": {
+ "type": "AzureKeyVaultSecret",
+ "store": {
+ "referenceName": "StorageSecretsKeyVault",
+ "type": "LinkedServiceReference"
+ },
+ "secretName": "[parameters('AzureKeyVault_storageAccessKeySecretName')]"
+ }
+ }
+ },
+ "dependsOn": [
+ "[parameters('dataFactoryName')]",
+ "[concat(variables('factoryId'), '/linkedServices/StorageSecretsKeyVault')]"
+ ]
+ },
+ {
+ "name": "[concat(parameters('dataFactoryName'), '/datasetGithub')]",
+ "type": "Microsoft.DataFactory/factories/linkedServices",
+ "apiVersion": "2018-06-01",
+ "properties": {
+ "annotations": [],
+ "type": "RestService",
+ "typeProperties": {
+ "url": "[variables('Dataset_url')]",
+ "enableServerCertificateValidation": true,
+ "authenticationType": "Anonymous"
+ }
+ },
+ "dependsOn": [
+ "[parameters('dataFactoryName')]"
+ ]
+ },
+ {
+ "name": "[concat(parameters('dataFactoryName'), '/StorageSecretsKeyVault')]",
+ "type": "Microsoft.DataFactory/factories/linkedServices",
+ "apiVersion": "2018-06-01",
+ "properties": {
+ "annotations": [],
+ "type": "AzureKeyVault",
+ "typeProperties": {
+ "baseUrl": "[variables('AzureKeyVault_baseUrl')]"
+ }
+ },
+ "dependsOn": [
+ "[parameters('dataFactoryName')]"
+ ]
+ },
+ {
+ "name": "[concat(parameters('dataFactoryName'), '/GithubDataSet')]",
+ "type": "Microsoft.DataFactory/factories/datasets",
+ "apiVersion": "2018-06-01",
+ "properties": {
+ "description": "JSON Dataset on GitHub",
+ "linkedServiceName": {
+ "referenceName": "datasetGithub",
+ "type": "LinkedServiceReference"
+ },
+ "annotations": [],
+ "type": "RestResource",
+ "typeProperties": {},
+ "schema": []
+ },
+ "dependsOn": [
+ "[parameters('dataFactoryName')]",
+ "[concat(variables('factoryId'), '/linkedServices/datasetGithub')]"
+ ]
+ },
+ {
+ "name": "[concat(parameters('dataFactoryName'), '/BlobStorageDataset')]",
+ "type": "Microsoft.DataFactory/factories/datasets",
+ "apiVersion": "2018-06-01",
+ "properties": {
+ "description": "JSON Dataset on Azure Blob Storage",
+ "linkedServiceName": {
+ "referenceName": "AzureBlobStorage",
+ "type": "LinkedServiceReference"
+ },
+ "annotations": [],
+ "type": "Json",
+ "typeProperties": {
+ "location": {
+ "type": "AzureBlobStorageLocation",
+ "fileName": "sentances.json",
+ "folderPath": "dataset",
+ "container": "[parameters('AzureBlobStorage_cotainerName')]"
+ }
+ },
+ "schema": {}
+ },
+ "dependsOn": [
+ "[parameters('dataFactoryName')]",
+ "[concat(variables('factoryId'), '/linkedServices/AzureBlobStorage')]"
+ ]
+ },
+ {
+ "name": "[concat(parameters('dataFactoryName'), '/Anonymize')]",
+ "type": "Microsoft.DataFactory/factories/pipelines",
+ "apiVersion": "2018-06-01",
+ "properties": {
+ "activities": [
+ {
+ "name": "Presidio-Anonymize",
+ "description": "Anonymize files using Presidio",
+ "type": "DatabricksNotebook",
+ "dependsOn": [
+ {
+ "activity": "GetSecret",
+ "dependencyConditions": [
+ "Succeeded"
+ ]
+ },
+ {
+ "activity": "SaveBlobs",
+ "dependencyConditions": [
+ "Succeeded"
+ ]
+ }
+ ],
+ "policy": {
+ "timeout": "7.00:00:00",
+ "retry": 0,
+ "retryIntervalInSeconds": 30,
+ "secureOutput": false,
+ "secureInput": false
+ },
+ "userProperties": [],
+ "typeProperties": {
+ "notebookPath": "[parameters('Databricks_notebookLocation')]",
+ "baseParameters": {
+ "storage_account_name": "[parameters('AzureBlobStorage_accountName')]",
+ "storage_container_name": "[parameters('AzureBlobStorage_cotainerName')]",
+ "storage_account_access_key": {
+ "value": "@activity('GetSecret').output.value",
+ "type": "Expression"
+ }
+ }
+ },
+ "linkedServiceName": {
+ "referenceName": "PresidioDatabricks",
+ "type": "LinkedServiceReference"
+ }
+ },
+ {
+ "name": "GetSecret",
+ "description": "Get storage account key from Azure Key Vault",
+ "type": "WebActivity",
+ "dependsOn": [],
+ "policy": {
+ "timeout": "7.00:00:00",
+ "retry": 0,
+ "retryIntervalInSeconds": 30,
+ "secureOutput": false,
+ "secureInput": false
+ },
+ "userProperties": [],
+ "typeProperties": {
+ "url": "[concat(variables('AzureKeyVault_baseUrl'), 'secrets/', parameters('AzureKeyVault_storageAccessKeySecretName'), '?api-version=7.0')]",
+ "method": "GET",
+ "headers": {},
+ "authentication": {
+ "type": "MSI",
+ "resource": "https://vault.azure.net"
+ }
+ }
+ },
+ {
+ "name": "GetDataSet",
+ "description": "Get the data set from GitHub to Azure Blob Storage",
+ "type": "Copy",
+ "dependsOn": [],
+ "policy": {
+ "timeout": "7.00:00:00",
+ "retry": 0,
+ "retryIntervalInSeconds": 30,
+ "secureOutput": false,
+ "secureInput": false
+ },
+ "userProperties": [],
+ "typeProperties": {
+ "source": {
+ "type": "RestSource",
+ "httpRequestTimeout": "00:01:40",
+ "requestInterval": "00.00:00:00.010",
+ "requestMethod": "GET"
+ },
+ "sink": {
+ "type": "JsonSink",
+ "storeSettings": {
+ "type": "AzureBlobStorageWriteSettings"
+ },
+ "formatSettings": {
+ "type": "JsonWriteSettings"
+ }
+ },
+ "enableStaging": false,
+ "translator": {
+ "type": "TabularTranslator",
+ "mappings": [
+ {
+ "source": {
+ "path": "$['full_text']"
+ },
+ "sink": {
+ "path": "full_text"
+ }
+ }
+ ]
+ }
+ },
+ "inputs": [
+ {
+ "referenceName": "GithubDataSet",
+ "type": "DatasetReference",
+ "parameters": {}
+ }
+ ],
+ "outputs": [
+ {
+ "referenceName": "BlobStorageDataset",
+ "type": "DatasetReference",
+ "parameters": {}
+ }
+ ]
+ },
+ {
+ "name": "LoadSet",
+ "description": "Load the data set to array for processing",
+ "type": "Lookup",
+ "dependsOn": [
+ {
+ "activity": "GetDataSet",
+ "dependencyConditions": [
+ "Succeeded"
+ ]
+ }
+ ],
+ "policy": {
+ "timeout": "7.00:00:00",
+ "retry": 0,
+ "retryIntervalInSeconds": 30,
+ "secureOutput": false,
+ "secureInput": false
+ },
+ "userProperties": [],
+ "typeProperties": {
+ "source": {
+ "type": "JsonSource",
+ "storeSettings": {
+ "type": "AzureBlobStorageReadSettings",
+ "recursive": false,
+ "enablePartitionDiscovery": false
+ },
+ "formatSettings": {
+ "type": "JsonReadSettings"
+ }
+ },
+ "dataset": {
+ "referenceName": "BlobStorageDataset",
+ "type": "DatasetReference",
+ "parameters": {}
+ },
+ "firstRowOnly": false
+ }
+ },
+ {
+ "name": "SaveBlobs",
+ "description": "Save each json document as a separate blob",
+ "type": "ForEach",
+ "dependsOn": [
+ {
+ "activity": "LoadSet",
+ "dependencyConditions": [
+ "Succeeded"
+ ]
+ },
+ {
+ "activity": "GetSASToken",
+ "dependencyConditions": [
+ "Succeeded"
+ ]
+ }
+ ],
+ "userProperties": [],
+ "typeProperties": {
+ "items": {
+ "value": "@activity('LoadSet').output.value",
+ "type": "Expression"
+ },
+ "activities": [
+ {
+ "name": "UploadBlob",
+ "type": "WebActivity",
+ "dependsOn": [],
+ "policy": {
+ "timeout": "7.00:00:00",
+ "retry": 0,
+ "retryIntervalInSeconds": 30,
+ "secureOutput": false,
+ "secureInput": false
+ },
+ "userProperties": [],
+ "typeProperties": {
+ "url": {
+ "value": "[concat('@concat(''https://', parameters('AzureBlobStorage_accountName'), '.blob.core.windows.net/', parameters('AzureBlobStorage_cotainerName'), '/input/'', ''file'', string(rand(0,1000)), ''.txt?'', activity(''GetSASToken'').output.value)')]",
+ "type": "Expression"
+ },
+ "method": "PUT",
+ "headers": {
+ "x-ms-blob-type": "BlockBlob"
+ },
+ "body": {
+ "value": "@item().full_text",
+ "type": "Expression"
+ }
+ }
+ }
+ ]
+ }
+ },
+ {
+ "name": "GetSASToken",
+ "description": "Get storage account SAS Token from Azure Key Vault",
+ "type": "WebActivity",
+ "dependsOn": [],
+ "policy": {
+ "timeout": "7.00:00:00",
+ "retry": 0,
+ "retryIntervalInSeconds": 30,
+ "secureOutput": false,
+ "secureInput": false
+ },
+ "userProperties": [],
+ "typeProperties": {
+ "url": "[concat(variables('AzureKeyVault_baseUrl'), 'secrets/', parameters('AzureKeyVault_storageSASSecretName'), '?api-version=7.0')]",
+ "method": "GET",
+ "headers": {},
+ "authentication": {
+ "type": "MSI",
+ "resource": "https://vault.azure.net"
+ }
+ }
+ }
+ ],
+ "annotations": [],
+ "lastPublishTime": "2021-03-15T17:44:33Z"
+ },
+ "dependsOn": [
+ "[parameters('dataFactoryName')]",
+ "[concat(variables('factoryId'), '/linkedServices/PresidioDatabricks')]",
+ "[concat(variables('factoryId'), '/datasets/GithubDataSet')]",
+ "[concat(variables('factoryId'), '/datasets/BlobStorageDataset')]"
+ ]
+ }
+ ]
+ },
+ {
+ "type": "Microsoft.KeyVault/vaults",
+ "apiVersion": "2018-02-14",
+ "name": "[parameters('AzureKeyVault_name')]",
+ "location": "[resourceGroup().location]",
+ "tags": {
+ "displayName": "[parameters('AzureKeyVault_name')]"
+ },
+ "dependsOn": [
+ "[concat('Microsoft.DataFactory/factories/', parameters('dataFactoryName'))]"
+ ],
+ "properties": {
+ "enabledForDeployment": true,
+ "enabledForTemplateDeployment": true,
+ "enabledForDiskEncryption": true,
+ "tenantId": "[subscription().tenantId]",
+ "accessPolicies": [
+ {
+ "tenantId": "[subscription().tenantId]",
+ "objectId": "[reference(concat('Microsoft.DataFactory/factories/', parameters('dataFactoryName')), '2018-06-01', 'Full').identity.principalId]",
+ "permissions": {
+ "keys": [
+ "get"
+ ],
+ "secrets": [
+ "list",
+ "get",
+ "set"
+ ]
+ }
+ }
+ ],
+ "sku": {
+ "name": "standard",
+ "family": "A"
+ }
+ }
+ },
+ {
+ "apiVersion": "2018-02-14",
+ "type": "Microsoft.KeyVault/vaults/secrets",
+ "dependsOn": [
+ "[concat('Microsoft.KeyVault/vaults/', parameters('AzureKeyVault_name'))]",
+ "[concat('Microsoft.Storage/storageAccounts/', parameters('AzureBlobStorage_accountName'))]"
+ ],
+ "name": "[concat(parameters('AzureKeyVault_name'), '/', parameters('AzureKeyVault_storageSASSecretName'))]",
+ "properties": {
+ "value": "[listAccountSas(parameters('AzureBlobStorage_accountName'), '2018-07-01', parameters('accountSasProperties')).accountSasToken]"
+ }
+ },
+ {
+ "apiVersion": "2018-02-14",
+ "type": "Microsoft.KeyVault/vaults/secrets",
+ "dependsOn": [
+ "[concat('Microsoft.KeyVault/vaults/', parameters('AzureKeyVault_name'))]",
+ "[concat('Microsoft.Storage/storageAccounts/', parameters('AzureBlobStorage_accountName'))]"
+ ],
+ "name": "[concat(parameters('AzureKeyVault_name'), '/', parameters('AzureKeyVault_storageAccessKeySecretName'))]",
+ "properties": {
+ "value": "[listKeys(parameters('AzureBlobStorage_accountName'), '2018-07-01').keys[0].value]"
+ }
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/samples/deployments/data-factory/index.md b/docs/samples/deployments/data-factory/index.md
new file mode 100644
index 000000000..e5c8839f7
--- /dev/null
+++ b/docs/samples/deployments/data-factory/index.md
@@ -0,0 +1,72 @@
+# Anonymize PII entities in an Azure Data Factory ETL Pipeline
+
+You can build data anonymization ETL pipelines using Azure Data Factory (ADF) and Presidio.
+The following samples showcase two scenarios which use ADF to move a set of JSON objects from an online location to an Azure Storage while anonymizing their content.
+The first sample leverages the code for using [Presidio on Azure App Service](../app-service/index.md) to call Presidio as an HTTP REST endpoint in the ADF pipeline while parsing and storing each file in Azure Blob Storage.
+The second sample leverages the code for using [Presidio on Spark](../spark/index.md) to run over a set of files in Azure Blob Storage and anonymize their content, for cases where a large data set requires the scale of Databricks.
+
+The samples use the following Azure Services:
+
+* Azure Data Factory - Host and orchestrate the transformation pipeline.
+* Azure KeyVault - Holds the access keys for Azure Storage to avoid having keys and secrets in the code.
+* Azure Storage - Persistence layer of this sample.
+* Azure Databricks / Azure App Service - Host Presidio to anonymize the data.
+
+The input file used by the samples is hosted on the [presidio-research](https://github.com/microsoft/presidio-research/) repository. It is set up as a variable in the provided ARM template and used by Azure Data Factory as the input source.
+
+## Option 1: Presidio as an HTTP REST endpoint
+
+By using Presidio as an HTTP endpoint, the user can select which infrastructure best suits their requirements. In this sample, Presidio is deployed to an Azure App Service, but other deployment targets can be used, such as [Kubernetes](../k8s/index.md).
+
+
+
+
+### Deploy the ARM template
+
+Create the Azure App Service and the ADF pipeline by clicking the Deploy-to-Azure button, or by running the following script to provision the [provided ARM template](./azure-deploy-adf-app-service.json).
+
+[](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fmicrosoft%2Fpresidio%2Fmain%2Fdocs%2Fsamples%2Fdeployments%2Fdata-factory%2Fazure-deploy-adf-app-service.json)
+
+
+```bash
+RESOURCE_GROUP=[Name of resource group]
+LOCATION=[location of resources]
+
+az group create --name $RESOURCE_GROUP --location $LOCATION
+az deployment group create -g $RESOURCE_GROUP --template-file ./azure-deploy-adf-app-service.json
+```
+
+Note that:
+
+* A SAS token is created from Azure Storage and then imported to Azure Key Vault, using the ARM template built-in [function](https://docs.microsoft.com/en-us/azure/azure-resource-manager/templates/template-functions) [listAccountSas](https://docs.microsoft.com/en-us/rest/api/storagerp/storageaccounts/listaccountsas).
+* An access policy grants the Azure Data Factory managed identity access to the Azure Key Vault. It uses the ARM template [reference](https://docs.microsoft.com/en-us/azure/azure-resource-manager/templates/template-functions-resource?tabs=json#reference) function on the Data Factory resource to acquire its identity.principalId property. This is enabled by setting the data factory ARM resource's identity attribute to managed identity (SystemAssigned).
+
+## Option 2: Presidio on Azure Databricks
+
+By using Presidio as a Notebook step in ADF, we allow Databricks to scale Presidio according to the cluster capabilities and the input dataset. Using Presidio as a native Python package in PySpark can unlock more analysis and de-identification scenarios.
+
+
+
+### Pre-requisite - Deploy Azure Databricks
+
+Provision and set up the Databricks cluster by following the steps in the [presidio-spark sample](../spark/index.md#Azure-Databricks).
+**Note** that you should only create and configure the databricks cluster and not the storage account, which will be created in the next step.
+
+### Deploy the ARM template
+
+Create the rest of the services by running the following script which uses the [provided ARM template](./azure-deploy-adf-databricks.json).
+
+```bash
+RESOURCE_GROUP=[Name of resource group]
+LOCATION=[location of resources]
+DATABRICKS_ACCESS_TOKEN=[Access token to databricks created in the presidio-spark sample]
+DATABRICKS_WORKSPACE_URL=[Databricks workspace URL without the https:// prefix]
+DATABRICKS_CLUSTER_ID=[Databricks presidio-ready cluster ID]
+DATABRICKS_NOTEBOOK_LOCATION=[Location of presidio notebook from the presidio-spark sample]
+
+az group create --name $RESOURCE_GROUP --location $LOCATION
+az deployment group create -g $RESOURCE_GROUP --template-file ./azure-deploy-adf-databricks.json --parameters Databricks_accessToken=$DATABRICKS_ACCESS_TOKEN Databricks_clusterId=$DATABRICKS_CLUSTER_ID Databricks_notebookLocation=$DATABRICKS_NOTEBOOK_LOCATION Databricks_workSpaceUrl=$DATABRICKS_WORKSPACE_URL
+```
+
+Note that:
+Two secrets are read from Azure Storage and imported to Azure Key Vault: the storage account access key and a SAS token. They are obtained using the ARM template built-in [functions](https://docs.microsoft.com/en-us/azure/azure-resource-manager/templates/template-functions) [listAccountSas](https://docs.microsoft.com/en-us/rest/api/storagerp/storageaccounts/listaccountsas) and [listKeys](https://docs.microsoft.com/en-us/rest/api/storagerp/storageaccounts/listkeys).
diff --git a/docs/samples/deployments/index.md b/docs/samples/deployments/index.md
index 7b3f161f6..6224909c8 100644
--- a/docs/samples/deployments/index.md
+++ b/docs/samples/deployments/index.md
@@ -3,3 +3,4 @@
- [Azure App Service](app-service/index.md)
- [Kubernetes](k8s/index.md)
- [Spark/Azure Databricks](spark/index.md)
+- [Azure Data Factory](data-factory/index.md)
diff --git a/docs/samples/deployments/spark/index.md b/docs/samples/deployments/spark/index.md
index af99a381c..90e382ab5 100644
--- a/docs/samples/deployments/spark/index.md
+++ b/docs/samples/deployments/spark/index.md
@@ -21,8 +21,7 @@ LOCATION=[location]
# Create the storage account
az group create --name $RESOURCE_GROUP --location $LOCATION
-az storage account create --name $STORAGE_ACCOUNT_NAME --resource-group
-$RESOURCE_GROUP
+az storage account create --name $STORAGE_ACCOUNT_NAME --resource-group $RESOURCE_GROUP
# Get the storage account access key
STORAGE_ACCESS_KEY=$(az storage account keys list --account-name $STORAGE_ACCOUNT_NAME --resource-group $RESOURCE_GROUP --query '[0].value' -o tsv)
diff --git a/docs/samples/deployments/spark/presidio_anonymize_blobs.py b/docs/samples/deployments/spark/presidio_anonymize_blobs.py
index 34cf9beb4..b5d3c7478 100644
--- a/docs/samples/deployments/spark/presidio_anonymize_blobs.py
+++ b/docs/samples/deployments/spark/presidio_anonymize_blobs.py
@@ -6,7 +6,7 @@
# MAGIC
# MAGIC
The following code sample will:
# MAGIC