-
Notifications
You must be signed in to change notification settings - Fork 170
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from Azure-Samples/tokenizer-skill
Add a tokenizer skill
- Loading branch information
Showing
9 changed files
with
322 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
--- | ||
page_type: sample | ||
languages: | ||
- csharp | ||
products: | ||
- azure | ||
- azure-cognitive-services | ||
name: Tokenizer sample skill for cognitive search | ||
description: This custom skill extracts normalized non-stop words from a text using the ML.NET library. | ||
azureDeploy: https://raw.githubusercontent.com/Azure-Samples/azure-search-power-skills/master/Text/Tokenizer/azuredeploy.json | ||
--- | ||
|
||
# Tokenizer | ||
|
||
This custom skill extracts normalized non-stop words from a text using [the ML.NET library](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml?view=ml-dotnet). | ||
|
||
## Requirements | ||
|
||
This skill has no additional requirements beyond the ones described in [the root `README.md` file](../../README.md). | ||
|
||
## Deployment | ||
|
||
[![Deploy to Azure](https://azuredeploy.net/deploybutton.svg)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2FAzure-Samples%2Fazure-search-power-skills%2Fmaster%2FText%2FTokenizer%2Fazuredeploy.json) | ||
|
||
## Settings | ||
|
||
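The language used for stop word removal can be changed directly in the code, by editing the `language` argument passed to `RemoveDefaultStopWords` (it is set to English in this sample). | ||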
|
||
## tokenizer | ||
|
||
### Sample Input: | ||
|
||
```json | ||
{ | ||
"values": [ | ||
{ | ||
"recordId": "record1", | ||
"data": { | ||
"text": "ML.NET's RemoveDefaultStopWords API removes stop words from tHe text/string. It requires the text/string to be tokenized beforehand." | ||
} | ||
} | ||
] | ||
} | ||
``` | ||
|
||
### Sample Output: | ||
|
||
```json | ||
{ | ||
"values": [ | ||
{ | ||
"recordId": "record1", | ||
"data": { | ||
"words": [ | ||
"mlnets", | ||
"removedefaultstopwords", | ||
"api", | ||
"removes", | ||
"stop", | ||
"words", | ||
"textstring", | ||
"requires", | ||
"textstring", | ||
"tokenized" | ||
] | ||
}, | ||
"errors": [], | ||
"warnings": [] | ||
} | ||
] | ||
} | ||
``` | ||
|
||
## Sample Skillset Integration | ||
|
||
In order to use this skill in a cognitive search pipeline, you'll need to add a skill definition to your skillset. | ||
Here's a sample skill definition for this example (inputs and outputs should be updated to reflect your particular scenario and skillset environment): | ||
|
||
```json | ||
{ | ||
"@odata.type": "#Microsoft.Skills.Custom.WebApiSkill", | ||
"description": "Tokenizer", | ||
"uri": "[AzureFunctionEndpointUrl]/api/tokenizer?code=[AzureFunctionDefaultHostKey]", | ||
"batchSize": 1, | ||
"context": "/document/content", | ||
"inputs": [ | ||
{ | ||
"name": "text", | ||
"source": "/document/content" | ||
} | ||
], | ||
"outputs": [ | ||
{ | ||
"name": "words", | ||
"targetName": "words" | ||
} | ||
] | ||
} | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
using System.Threading.Tasks; | ||
using Microsoft.AspNetCore.Mvc; | ||
using Microsoft.Azure.WebJobs; | ||
using Microsoft.Azure.WebJobs.Extensions.Http; | ||
using Microsoft.AspNetCore.Http; | ||
using Microsoft.Extensions.Logging; | ||
using AzureCognitiveSearch.PowerSkills.Common; | ||
using System.Collections.Generic; | ||
using Microsoft.ML; | ||
using Microsoft.ML.Transforms.Text; | ||
using System; | ||
using Microsoft.ML.Data; | ||
|
||
namespace Tokenizer
{
    /// <summary>
    /// Custom skill that turns the <c>text</c> input of each request record into a
    /// <c>words</c> output: the text is lower-cased, stripped of punctuation and numbers,
    /// split on spaces, and filtered of English default stop words using ML.NET.
    /// </summary>
    public static class Tokenizer
    {
        // The ML.NET context and the fitted transformer do not depend on the request, so they
        // are built once and shared by every invocation instead of being rebuilt per call.
        private static readonly MLContext s_mlContext = new MLContext();

        // Lazy so the pipeline is only fitted when the first request arrives; the default
        // Lazy<T> mode guarantees the factory runs once even under concurrent first calls.
        private static readonly Lazy<TransformerChain<StopWordsRemovingTransformer>> s_textTransformer =
            new Lazy<TransformerChain<StopWordsRemovingTransformer>>(BuildTextTransformer);

        /// <summary>
        /// HTTP entry point of the skill.
        /// </summary>
        /// <param name="req">The incoming POST request carrying the skill's record array.</param>
        /// <param name="log">Logger used to trace the invocation.</param>
        /// <param name="executionContext">Function execution context; supplies the skill name used in messages.</param>
        /// <returns>
        /// A <see cref="BadRequestObjectResult"/> when no record array can be read from the request;
        /// otherwise an <see cref="OkObjectResult"/> wrapping the skill response.
        /// </returns>
        [FunctionName("tokenizer")]
        public static IActionResult RunTokenizer(
            [HttpTrigger(AuthorizationLevel.Function, "post", Route = null)] HttpRequest req,
            ILogger log,
            ExecutionContext executionContext)
        {
            log.LogInformation("Tokenizer Custom Skill: C# HTTP trigger function processed a request.");

            string skillName = executionContext.FunctionName;
            IEnumerable<WebApiRequestRecord> requestRecords = WebApiSkillHelpers.GetRequestRecords(req);
            if (requestRecords == null)
            {
                return new BadRequestObjectResult($"{skillName} - Invalid request record array.");
            }

            // A prediction engine is created per invocation on purpose: ML.NET documents
            // PredictionEngine as not thread-safe, so it must not be shared between requests.
            PredictionEngine<TextData, TransformedTextData> predictionEngine =
                s_mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(s_textTransformer.Value);

            WebApiSkillResponse response = WebApiSkillHelpers.ProcessRequestRecords(skillName, requestRecords,
                (inRecord, outRecord) =>
                {
                    // A non-string "text" value becomes null here and is passed to the engine as-is.
                    var text = new TextData { Text = inRecord.Data["text"] as string };
                    outRecord.Data["words"] = predictionEngine.Predict(text).Words;
                    return outRecord;
                });

            return new OkObjectResult(response);
        }

        /// <summary>
        /// Builds and fits the normalize / tokenize / stop-word-removal pipeline.
        /// </summary>
        /// <returns>The fitted transformer chain used to create prediction engines.</returns>
        private static TransformerChain<StopWordsRemovingTransformer> BuildTextTransformer()
        {
            // The pipeline is fitted against an empty data view: it only needs the input schema.
            IDataView emptyDataView = s_mlContext.Data.LoadFromEnumerable(new List<TextData>());
            EstimatorChain<StopWordsRemovingTransformer> textPipeline = s_mlContext.Transforms.Text
                .NormalizeText("Text", caseMode: TextNormalizingEstimator.CaseMode.Lower, keepDiacritics: true, keepPunctuations: false, keepNumbers: false)
                .Append(s_mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text", separators: new[] { ' ' }))
                .Append(s_mlContext.Transforms.Text.RemoveDefaultStopWords("Words", language: StopWordsRemovingEstimator.Language.English));
            return textPipeline.Fit(emptyDataView);
        }

        /// <summary>Input row for the ML.NET pipeline: the raw text of one record.</summary>
        private class TextData
        {
            public string Text { get; set; }
        }

        /// <summary>Output row of the ML.NET pipeline: the remaining words of one record.</summary>
        private class TransformedTextData
        {
            public string[] Words { get; set; }
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
<Project Sdk="Microsoft.NET.Sdk"> | ||
<PropertyGroup> | ||
<TargetFramework>netcoreapp2.1</TargetFramework> | ||
<AzureFunctionsVersion>v2</AzureFunctionsVersion> | ||
<RootNamespace>AzureCognitiveSearch.PowerSkills.Text.Tokenizer</RootNamespace> | ||
</PropertyGroup> | ||
<ItemGroup> | ||
<PackageReference Include="Microsoft.ML" Version="1.2.0" /> | ||
<PackageReference Include="Microsoft.NET.Sdk.Functions" Version="1.0.28" /> | ||
</ItemGroup> | ||
<ItemGroup> | ||
<None Update="host.json"> | ||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | ||
</None> | ||
<None Update="local.settings.json"> | ||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | ||
<CopyToPublishDirectory>Never</CopyToPublishDirectory> | ||
</None> | ||
</ItemGroup> | ||
<ItemGroup> | ||
<Compile Include="../../*.cs" /> | ||
</ItemGroup> | ||
</Project> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
{ | ||
"$schema": "http://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", | ||
"contentVersion": "1.0.0.0", | ||
"parameters": { | ||
"resourcePrefix": { | ||
"type": "string", | ||
"defaultValue": "psbes", | ||
"metadata": { | ||
"description": "Prefix for all resources created by this template" | ||
} | ||
}, | ||
"storageAccountType": { | ||
"type": "string", | ||
"defaultValue": "Standard_LRS", | ||
"allowedValues": [ | ||
"Standard_LRS", | ||
"Standard_GRS", | ||
"Standard_ZRS", | ||
"Premium_LRS" | ||
], | ||
"metadata": { | ||
"description": "Storage Account type" | ||
} | ||
} | ||
}, | ||
"variables": { | ||
"repoURL": "https://github.com/Azure-Samples/azure-search-power-skills", | ||
"functionProject": "Text\\Tokenizer\\Tokenizer.csproj", | ||
"websiteName": "[toLower(concat(parameters('resourcePrefix'), '-site-', uniqueString(resourceGroup().id)))]", | ||
"storageAccount": "[toLower(concat(parameters('resourcePrefix'), uniqueString(resourceGroup().id)))]", | ||
"functionAppName": "[toLower(concat(parameters('resourcePrefix'), '-function-app-', uniqueString(resourceGroup().id)))]" | ||
}, | ||
"resources": [ | ||
{ | ||
"apiVersion": "2017-10-01", | ||
"name": "[variables('storageAccount')]", | ||
"type": "Microsoft.Storage/storageAccounts", | ||
"location": "[resourceGroup().location]", | ||
"sku": { | ||
"name": "[parameters('storageAccountType')]" | ||
}, | ||
"kind": "Storage" | ||
}, | ||
{ | ||
"apiVersion": "2016-08-01", | ||
"name": "[variables('functionAppName')]", | ||
"type": "Microsoft.Web/sites", | ||
"kind": "functionapp", | ||
"location": "[resourceGroup().location]", | ||
"dependsOn": [ | ||
"[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccount'))]" | ||
], | ||
"properties": { | ||
"name": "[variables('functionAppName')]", | ||
"kind": "functionapp", | ||
"httpsOnly": true, | ||
"siteConfig": { | ||
"appSettings": [ | ||
{ | ||
"name": "WEBSITE_CONTENTAZUREFILECONNECTIONSTRING", | ||
"value": "[concat('DefaultEndpointsProtocol=https;AccountName=', variables('storageAccount'), ';AccountKey=', listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccount')), '2015-06-15').key1)]" | ||
}, | ||
{ | ||
"name": "WEBSITE_CONTENTSHARE", | ||
"value": "[toLower(variables('functionAppName'))]" | ||
}, | ||
{ | ||
"name": "AzureWebJobsDashboard", | ||
"value": "[concat('DefaultEndpointsProtocol=https;AccountName=', variables('storageAccount'), ';AccountKey=', listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccount')), '2015-06-15').key1)]" | ||
}, | ||
{ | ||
"name": "AzureWebJobsStorage", | ||
"value": "[concat('DefaultEndpointsProtocol=https;AccountName=', variables('storageAccount'), ';AccountKey=', listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccount')), '2015-06-15').key1)]" | ||
}, | ||
{ | ||
"name": "FUNCTIONS_EXTENSION_VERSION", | ||
"value": "~2" | ||
}, | ||
{ | ||
"name": "FUNCTIONS_WORKER_RUNTIME", | ||
"value": "dotnet" | ||
}, | ||
{ | ||
"name": "SCM_DO_BUILD_DURING_DEPLOYMENT", | ||
"value": true | ||
}, | ||
{ | ||
"name": "PROJECT", | ||
"value": "[variables('functionProject')]" | ||
} | ||
] | ||
} | ||
}, | ||
"resources": [ | ||
{ | ||
"apiVersion": "2015-08-01", | ||
"name": "web", | ||
"type": "sourcecontrols", | ||
"dependsOn": [ | ||
"[resourceId('Microsoft.Web/Sites', variables('functionAppName'))]" | ||
], | ||
"properties": { | ||
"RepoUrl": "[variables('repoURL')]", | ||
"branch": "master", | ||
"project": "[variables('functionProject')]", | ||
"IsManualIntegration": true | ||
} | ||
} | ||
] | ||
} | ||
], | ||
"outputs": { | ||
"Azure Function Site Name": { | ||
"type": "string", | ||
"value": "[variables('functionAppName')]" | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
{ | ||
"version": "2.0" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
{ | ||
"IsEncrypted": false, | ||
"Values": { | ||
"AzureWebJobsStorage": "", | ||
"FUNCTIONS_WORKER_RUNTIME": "dotnet" | ||
} | ||
} |