Skip to content

Commit

Permalink
Merge pull request #3 from Azure-Samples/tokenizer-skill
Browse files Browse the repository at this point in the history
Add a tokenizer skill
  • Loading branch information
bleroy authored Aug 2, 2019
2 parents c5faee6 + de32362 commit d1e20d1
Show file tree
Hide file tree
Showing 9 changed files with 322 additions and 1 deletion.
7 changes: 7 additions & 0 deletions PowerSkills.sln
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Tests", "Tests", "{F5F3F598
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CustomEntitySearchTest", "Tests\CustomEntitySearchTest\CustomEntitySearchTest.csproj", "{496441CA-21E9-46AE-B1D3-0F2AE2C89CD4}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Tokenizer", "Text\Tokenizer\Tokenizer.csproj", "{54AFF776-801F-4328-9EEE-CFFCD20B5497}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -93,6 +95,10 @@ Global
{496441CA-21E9-46AE-B1D3-0F2AE2C89CD4}.Debug|Any CPU.Build.0 = Debug|Any CPU
{496441CA-21E9-46AE-B1D3-0F2AE2C89CD4}.Release|Any CPU.ActiveCfg = Release|Any CPU
{496441CA-21E9-46AE-B1D3-0F2AE2C89CD4}.Release|Any CPU.Build.0 = Release|Any CPU
{54AFF776-801F-4328-9EEE-CFFCD20B5497}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{54AFF776-801F-4328-9EEE-CFFCD20B5497}.Debug|Any CPU.Build.0 = Debug|Any CPU
{54AFF776-801F-4328-9EEE-CFFCD20B5497}.Release|Any CPU.ActiveCfg = Release|Any CPU
{54AFF776-801F-4328-9EEE-CFFCD20B5497}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand All @@ -107,6 +113,7 @@ Global
{0D165CE1-66B6-47AE-ABA1-EF157C4E2884} = {10BE854F-F22A-4AE0-8283-688E10C76275}
{F0B21155-829F-4B26-853B-82ECAAB18D23} = {F98E15F9-D6BB-41CE-8B5B-84E624B86203}
{496441CA-21E9-46AE-B1D3-0F2AE2C89CD4} = {F5F3F598-2DCA-42EA-9797-D3A3E7CFA2CB}
{54AFF776-801F-4328-9EEE-CFFCD20B5497} = {F98E15F9-D6BB-41CE-8B5B-84E624B86203}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {3C22FE9E-7E11-429D-ADDE-0A220F285B90}
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ This project provides the following custom skills:
* [**HocrGenerator**](Vision/HocrGenerator/README.md): transforms the result of OCR into the hOCR format.
* [**AnalyzeForm**](Vision/AnalyzeForm/README.md): recognizes form fields in a document.
* [**CustomEntitySearch**](/Text/CustomEntitySearch): finds custom entity names in text.
* [**Tokenizer**](Text/Tokenizer/README.md): extracts non-stop words from a text.

## Getting Started

Expand Down
2 changes: 1 addition & 1 deletion Template/HelloWorld/HelloWorld.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

namespace AzureCognitiveSearch.PowerSkills.Template.HelloWorld
{
public static class Function1
public static class HelloWorld
{
[FunctionName("hello-world")]
public static async Task<IActionResult> RunHelloWorld(
Expand Down
99 changes: 99 additions & 0 deletions Text/Tokenizer/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
---
page_type: sample
languages:
- csharp
products:
- azure
- azure-cognitive-services
name: Tokenizer sample skill for cognitive search
description: This custom skill extracts normalized non-stop words from a text using the ML.NET library.
azureDeploy: https://raw.githubusercontent.com/Azure-Samples/azure-search-power-skills/master/Text/Tokenizer/azuredeploy.json
---

# Tokenizer

This custom skill extracts normalized non-stop words from a text using [the ML.NET library](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml?view=ml-dotnet).

## Requirements

This skill has no additional requirements beyond the ones described in [the root `README.md` file](../../README.md).

## Deployment

[![Deploy to Azure](https://azuredeploy.net/deploybutton.svg)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2FAzure-Samples%2Fazure-search-power-skills%2Fmaster%2FText%2FTokenizer%2Fazuredeploy.json)

## Settings

The language used for stop word removal can be changed directly in the code.

## tokenizer

### Sample Input:

```json
{
"values": [
{
"recordId": "record1",
"data": {
"text": "ML.NET's RemoveDefaultStopWords API removes stop words from tHe text/string. It requires the text/string to be tokenized beforehand."
}
}
]
}
```

### Sample Output:

```json
{
"values": [
{
"recordId": "record1",
"data": {
"words": [
"mlnets",
"removedefaultstopwords",
"api",
"removes",
"stop",
"words",
"textstring",
"requires",
"textstring",
"tokenized"
]
},
"errors": [],
"warnings": []
}
]
}
```

## Sample Skillset Integration

In order to use this skill in a cognitive search pipeline, you'll need to add a skill definition to your skillset.
Here's a sample skill definition for this example (inputs and outputs should be updated to reflect your particular scenario and skillset environment):

```json
{
"@odata.type": "#Microsoft.Skills.Custom.WebApiSkill",
"description": "Tokenizer",
"uri": "[AzureFunctionEndpointUrl]/api/tokenizer?code=[AzureFunctionDefaultHostKey]",
"batchSize": 1,
"context": "/document/content",
"inputs": [
{
"name": "text",
"source": "/document/content"
}
],
"outputs": [
{
"name": "words",
"targetName": "words"
}
]
}
```
63 changes: 63 additions & 0 deletions Text/Tokenizer/Tokenizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
using System.Threading.Tasks;
using Microsoft.AspNetCore.Mvc;
using Microsoft.Azure.WebJobs;
using Microsoft.Azure.WebJobs.Extensions.Http;
using Microsoft.AspNetCore.Http;
using Microsoft.Extensions.Logging;
using AzureCognitiveSearch.PowerSkills.Common;
using System.Collections.Generic;
using Microsoft.ML;
using Microsoft.ML.Transforms.Text;
using System;
using Microsoft.ML.Data;

namespace Tokenizer
{
    /// <summary>
    /// Azure Function custom skill that extracts normalized, non-stop words from
    /// input text using the ML.NET text transforms (lower-casing, word tokenization,
    /// and default English stop-word removal).
    /// </summary>
    public static class Tokenizer
    {
        /// <summary>
        /// HTTP-triggered entry point for the "tokenizer" custom skill.
        /// Expects the standard cognitive search skill payload with a "text" string
        /// per record and returns a "words" string array per record.
        /// </summary>
        /// <param name="req">Incoming HTTP request carrying the skill record batch.</param>
        /// <param name="log">Function logger.</param>
        /// <param name="executionContext">Provides the function name used in response messages.</param>
        /// <returns>200 with a <see cref="WebApiSkillResponse"/>, or 400 when the record array is invalid.</returns>
        [FunctionName("tokenizer")]
        public static IActionResult RunTokenizer(
            [HttpTrigger(AuthorizationLevel.Function, "post", Route = null)] HttpRequest req,
            ILogger log,
            ExecutionContext executionContext)
        {
            log.LogInformation("Tokenizer Custom Skill: C# HTTP trigger function processed a request.");

            string skillName = executionContext.FunctionName;
            IEnumerable<WebApiRequestRecord> requestRecords = WebApiSkillHelpers.GetRequestRecords(req);
            if (requestRecords == null)
            {
                return new BadRequestObjectResult($"{skillName} - Invalid request record array.");
            }

            // Build the ML.NET pipeline. NOTE(review): this is rebuilt on every request,
            // which is costly; PredictionEngine is not thread-safe, so a static cache would
            // need per-call locking or pooling — left per-request here to preserve behavior.
            var mlContext = new MLContext();
            IDataView emptyDataView = mlContext.Data.LoadFromEnumerable(new List<TextData>());
            EstimatorChain<StopWordsRemovingTransformer> textPipeline = mlContext.Transforms.Text
                .NormalizeText("Text", caseMode: TextNormalizingEstimator.CaseMode.Lower, keepDiacritics: true, keepPunctuations: false, keepNumbers: false)
                .Append(mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text", separators: new[] { ' ' }))
                .Append(mlContext.Transforms.Text.RemoveDefaultStopWords("Words", language: StopWordsRemovingEstimator.Language.English));
            TransformerChain<StopWordsRemovingTransformer> textTransformer = textPipeline.Fit(emptyDataView);
            PredictionEngine<TextData, TransformedTextData> predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

            WebApiSkillResponse response = WebApiSkillHelpers.ProcessRequestRecords(skillName, requestRecords,
                (inRecord, outRecord) =>
                {
                    // Guard against a missing "text" key or a non-string value: the
                    // original indexer access threw KeyNotFoundException and failed the
                    // whole batch. An absent/invalid input now yields an empty word list.
                    string text = inRecord.Data.TryGetValue("text", out object rawText)
                        ? rawText as string
                        : null;
                    var input = new TextData { Text = text ?? string.Empty };
                    outRecord.Data["words"] = predictionEngine.Predict(input).Words;
                    return outRecord;
                });

            return new OkObjectResult(response);
        }

        // Input schema for the ML.NET pipeline: the raw text to tokenize.
        private class TextData
        {
            public string Text { get; set; }
        }

        // Output schema: normalized, tokenized, stop-word-filtered words.
        private class TransformedTextData
        {
            public string[] Words { get; set; }
        }
    }
}
23 changes: 23 additions & 0 deletions Text/Tokenizer/Tokenizer.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>netcoreapp2.1</TargetFramework>
<AzureFunctionsVersion>v2</AzureFunctionsVersion>
<RootNamespace>AzureCognitiveSearch.PowerSkills.Text.Tokenizer</RootNamespace>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.ML" Version="1.2.0" />
<PackageReference Include="Microsoft.NET.Sdk.Functions" Version="1.0.28" />
</ItemGroup>
<ItemGroup>
<None Update="host.json">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="local.settings.json">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<CopyToPublishDirectory>Never</CopyToPublishDirectory>
</None>
</ItemGroup>
<ItemGroup>
<Compile Include="../../*.cs" />
</ItemGroup>
</Project>
118 changes: 118 additions & 0 deletions Text/Tokenizer/azuredeploy.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
{
"$schema": "http://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"resourcePrefix": {
"type": "string",
"defaultValue": "psbes",
"metadata": {
"description": "Prefix for all resources created by this template"
}
},
"storageAccountType": {
"type": "string",
"defaultValue": "Standard_LRS",
"allowedValues": [
"Standard_LRS",
"Standard_GRS",
"Standard_ZRS",
"Premium_LRS"
],
"metadata": {
"description": "Storage Account type"
}
}
},
"variables": {
"repoURL": "https://github.com/Azure-Samples/azure-search-power-skills",
"functionProject": "Text\\Tokenizer\\Tokenizer.csproj",
"websiteName": "[toLower(concat(parameters('resourcePrefix'), '-site-', uniqueString(resourceGroup().id)))]",
"storageAccount": "[toLower(concat(parameters('resourcePrefix'), uniqueString(resourceGroup().id)))]",
"functionAppName": "[toLower(concat(parameters('resourcePrefix'), '-function-app-', uniqueString(resourceGroup().id)))]"
},
"resources": [
{
"apiVersion": "2017-10-01",
"name": "[variables('storageAccount')]",
"type": "Microsoft.Storage/storageAccounts",
"location": "[resourceGroup().location]",
"sku": {
"name": "[parameters('storageAccountType')]"
},
"kind": "Storage"
},
{
"apiVersion": "2016-08-01",
"name": "[variables('functionAppName')]",
"type": "Microsoft.Web/sites",
"kind": "functionapp",
"location": "[resourceGroup().location]",
"dependsOn": [
"[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccount'))]"
],
"properties": {
"name": "[variables('functionAppName')]",
"kind": "functionapp",
"httpsOnly": true,
"siteConfig": {
"appSettings": [
{
"name": "WEBSITE_CONTENTAZUREFILECONNECTIONSTRING",
"value": "[concat('DefaultEndpointsProtocol=https;AccountName=', variables('storageAccount'), ';AccountKey=', listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccount')), '2015-06-15').key1)]"
},
{
"name": "WEBSITE_CONTENTSHARE",
"value": "[toLower(variables('functionAppName'))]"
},
{
"name": "AzureWebJobsDashboard",
"value": "[concat('DefaultEndpointsProtocol=https;AccountName=', variables('storageAccount'), ';AccountKey=', listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccount')), '2015-06-15').key1)]"
},
{
"name": "AzureWebJobsStorage",
"value": "[concat('DefaultEndpointsProtocol=https;AccountName=', variables('storageAccount'), ';AccountKey=', listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccount')), '2015-06-15').key1)]"
},
{
"name": "FUNCTIONS_EXTENSION_VERSION",
"value": "~2"
},
{
"name": "FUNCTIONS_EXTENSION_RUNTIME",
"value": "dotnet"
},
{
"name": "SCM_DO_BUILD_DURING_DEPLOYMENT",
"value": "true"
},
{
"name": "PROJECT",
"value": "[variables('functionProject')]"
}
]
}
},
"resources": [
{
"apiVersion": "2015-08-01",
"name": "web",
"type": "sourcecontrols",
"dependsOn": [
"[resourceId('Microsoft.Web/Sites', variables('functionAppName'))]"
],
"properties": {
"RepoUrl": "[variables('repoURL')]",
"branch": "master",
"project": "[variables('functionProject')]",
"IsManualIntegration": true
}
}
]
}
],
"outputs": {
"Azure Function Site Name": {
"type": "string",
"value": "[variables('functionAppName')]"
}
}
}
3 changes: 3 additions & 0 deletions Text/Tokenizer/host.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"version": "2.0"
}
7 changes: 7 additions & 0 deletions Text/Tokenizer/local.settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"IsEncrypted": false,
"Values": {
"AzureWebJobsStorage": "",
"FUNCTIONS_WORKER_RUNTIME": "dotnet"
}
}

0 comments on commit d1e20d1

Please sign in to comment.