Skip to content

Commit

Permalink
Merge pull request #8 from Azure-Samples/distinct-skill
Browse files Browse the repository at this point in the history
Add a distinct skill to de-duplicate a list of terms
  • Loading branch information
bleroy authored Sep 5, 2019
2 parents a5d3797 + d02a991 commit 1fb537c
Show file tree
Hide file tree
Showing 13 changed files with 485 additions and 2 deletions.
9 changes: 8 additions & 1 deletion PowerSkills.sln
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,9 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tokenizer", "Text\Tokenizer
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "PowerSkillTests", "Tests\PowerSkillTests\PowerSkillTests.csproj", "{6F3E1DE3-7C77-49ED-BF9C-1FED8B318386}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Common", "Common\Common.csproj", "{30F1ECE3-0B5D-4ACF-A375-B7C3721AF654}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Common", "Common\Common.csproj", "{30F1ECE3-0B5D-4ACF-A375-B7C3721AF654}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Distinct", "Text\Distinct\Distinct.csproj", "{63C364D1-FE52-45EC-B23E-20720201844A}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Expand Down Expand Up @@ -103,6 +105,10 @@ Global
{30F1ECE3-0B5D-4ACF-A375-B7C3721AF654}.Debug|Any CPU.Build.0 = Debug|Any CPU
{30F1ECE3-0B5D-4ACF-A375-B7C3721AF654}.Release|Any CPU.ActiveCfg = Release|Any CPU
{30F1ECE3-0B5D-4ACF-A375-B7C3721AF654}.Release|Any CPU.Build.0 = Release|Any CPU
{63C364D1-FE52-45EC-B23E-20720201844A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{63C364D1-FE52-45EC-B23E-20720201844A}.Debug|Any CPU.Build.0 = Debug|Any CPU
{63C364D1-FE52-45EC-B23E-20720201844A}.Release|Any CPU.ActiveCfg = Release|Any CPU
{63C364D1-FE52-45EC-B23E-20720201844A}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand All @@ -117,6 +123,7 @@ Global
{0D165CE1-66B6-47AE-ABA1-EF157C4E2884} = {10BE854F-F22A-4AE0-8283-688E10C76275}
{F0B21155-829F-4B26-853B-82ECAAB18D23} = {F98E15F9-D6BB-41CE-8B5B-84E624B86203}
{54AFF776-801F-4328-9EEE-CFFCD20B5497} = {F98E15F9-D6BB-41CE-8B5B-84E624B86203}
{63C364D1-FE52-45EC-B23E-20720201844A} = {F98E15F9-D6BB-41CE-8B5B-84E624B86203}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {3C22FE9E-7E11-429D-ADDE-0A220F285B90}
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ This project provides the following custom skills:
* [**AnalyzeForm**](Vision/AnalyzeForm/README.md): recognizes form fields in a document.
* [**CustomEntitySearch**](/Text/CustomEntitySearch): finds custom entity names in text.
* [**Tokenizer**](Text/Tokenizer/README.md): extracts non-stop words from a text.
* [**Distinct**](Text/Distinct/README.md): de-duplicates a list of terms.

## Getting Started

Expand Down
88 changes: 88 additions & 0 deletions Tests/PowerSkillTests/DistinctTests/DistinctTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT License. See LICENSE file in the project root for full license information.

using Microsoft.VisualStudio.TestTools.UnitTesting;
using AzureCognitiveSearch.PowerSkills.Text.Distinct;
using System.Collections.Generic;
using System.Linq;
using System;
using System.IO;

namespace AzureCognitiveSearch.PowerSkills.Tests.DistinctTests
{
/// <summary>
/// Unit tests for <see cref="Thesaurus"/>: normalization, canonical-form lookup,
/// de-duplication, and construction of the synonym dictionary.
/// </summary>
[TestClass]
public class DistinctTests
{
    /// <summary>
    /// Checks that <c>Thesaurus.Normalize</c> lower-cases its input and drops spaces and
    /// punctuation while leaving diacritics in place (see the data rows).
    /// </summary>
    /// <param name="word">The raw text to normalize.</param>
    /// <param name="expectedNormalized">The expected normalized text.</param>
    [DataTestMethod]
    [DataRow("Case insensitive díàcrïtiçs señsîtive AaBbCcÉéeÀàaÇçcÑñn", "caseinsensitivedíàcrïtiçsseñsîtiveaabbccééeààaççcññn")]
    [DataRow("Spaces are removed", "spacesareremoved")]
    [DataRow("Punctuation, as well; that is good...", "punctuationaswellthatisgood")]
    public void NormalizeLowersCaseRemovesPunctuationAndSpaces(string word, string expectedNormalized)
        // The expected literal is passed through string.Normalize() so both sides of the
        // comparison use the same Unicode normalization form.
        => Assert.AreEqual(expectedNormalized.Normalize(), Thesaurus.Normalize(word));

    // Shared synonym lists; the first entry of each list is the canonical form the tests expect back.
    private static readonly string[][] _synonyms
        = new[]
        {
            new[] { "A.C.R.O.N.Y.M", "acornym", "acronyms" },
            new[] { "Microsoft", "Microsoft Corporation", "Microsoft corp.", "MSFT" }
        };

    /// <summary>
    /// Checks that de-duplicating a single word yields the canonical form of its synonym list,
    /// or the word itself when the thesaurus has no entry for it.
    /// </summary>
    /// <param name="words">Words that are each expected to map to <paramref name="expectedCanonical"/>.</param>
    /// <param name="expectedCanonical">The expected result for every word.</param>
    [DataTestMethod]
    [DataRow(new[] { "Acronym", "acornym", "a cro:n y.m "}, "A.C.R.O.N.Y.M")]
    [DataRow(new[] { "miCrosoft", "microsoft Corp;", "M.S.F.T."}, "Microsoft")]
    [DataRow(new[] { "Not found" }, "Not found")]
    public void ThesaurusReturnsCanonicalFormOrWordIfNotFound(IEnumerable<string> words, string expectedCanonical)
    {
        foreach (string word in words)
        {
            // A fresh thesaurus per word keeps each assertion independent of the others.
            string actualCanonical = new Thesaurus(_synonyms).Dedupe(new[] { word }).First();

            // Name the offending word so a failure inside the loop can be traced to its input.
            Assert.AreEqual(expectedCanonical, actualCanonical, $"Unexpected canonical form for word '{word}'.");
        }
    }

    /// <summary>
    /// Checks that de-duplicating the space-separated terms of a text yields each distinct
    /// canonical form exactly once. Order is ignored: both sides are sorted before comparing.
    /// </summary>
    /// <param name="text">Space-separated input terms.</param>
    /// <param name="expectedDeduplicated">Space-separated expected distinct terms.</param>
    [DataTestMethod]
    [DataRow(
        "It is true that many acronyms are used at Microsoft. MSFT is no different as it's just an acronym for Microsoft.",
        "It is true that many A.C.R.O.N.Y.M are used at Microsoft no different as it's just an for")]
    public void DeduplicateYieldsListOfDistinctCanonicalForms(string text, string expectedDeduplicated)
    {
        // Materialize both sequences once: they are read by SequenceEqual and again by the
        // failure message, which is built eagerly even when the assertion passes.
        string[] expectedTerms = expectedDeduplicated.Split(' ').OrderBy(term => term).ToArray();
        string[] deduped = new Thesaurus(_synonyms).Dedupe(text.Split(' ')).OrderBy(term => term).ToArray();

        Assert.IsTrue(expectedTerms.SequenceEqual(deduped), $"Expected [{ string.Join(", ", expectedTerms) }] but was [{ string.Join(", ", deduped) }].");
    }

    /// <summary>
    /// Checks that the constructor maps every normalized synonym to the canonical (first) form
    /// of its list, and that an empty synonym list contributes no entries.
    /// </summary>
    [TestMethod]
    public void ThesaurusBuildsNormalizedSynonymToCanonicalFormDictionaryAndIgnoresEmptyLemmas()
    {
        const string canonicalAcronym = "acronym";
        const string canonicalMicrosoft = "Microsoft";
        var synonyms = new Thesaurus(new[]
        {
            new[] { canonicalAcronym, "acornym", "acronyms" },
            Array.Empty<string>(),
            new[] { canonicalMicrosoft, "Microsoft Corporation", "Microsoft corp.", "MSFT" }
        }).Synonyms;

        // 3 entries from the first list + 0 from the empty list + 4 from the last list.
        Assert.AreEqual(7, synonyms.Count());
        Assert.AreEqual(canonicalAcronym, synonyms["acronym"]);
        Assert.AreEqual(canonicalAcronym, synonyms["acornym"]);
        Assert.AreEqual(canonicalAcronym, synonyms["acronyms"]);
        Assert.AreEqual(canonicalMicrosoft, synonyms["microsoft"]);
        Assert.AreEqual(canonicalMicrosoft, synonyms["microsoftcorporation"]);
        Assert.AreEqual(canonicalMicrosoft, synonyms["microsoftcorp"]);
        Assert.AreEqual(canonicalMicrosoft, synonyms["msft"]);
    }

    /// <summary>
    /// Checks that the constructor throws <see cref="InvalidDataException"/> when the same
    /// lemma ("bar") appears in two different synonym lists.
    /// </summary>
    [TestMethod]
    public void ThesaurusConstructorThrowsForDuplicateLemmas()
    {
        Assert.ThrowsException<InvalidDataException>(() => {
            _ = new Thesaurus(new[]
            {
                new[] {"foo", "bar"},
                new[] {"baz", "bar"}
            });
        });
    }
}
}
1 change: 1 addition & 0 deletions Tests/PowerSkillTests/PowerSkillTests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
<ProjectReference Include="..\..\Text\AcronymLinker\AcronymLinker.csproj" />
<ProjectReference Include="..\..\Text\BingEntitySearch\BingEntitySearch.csproj" />
<ProjectReference Include="..\..\Text\CustomEntitySearch\CustomEntitySearch.csproj" />
<ProjectReference Include="..\..\Text\Distinct\Distinct.csproj" />
<ProjectReference Include="..\..\Text\Tokenizer\Tokenizer.csproj" />
<ProjectReference Include="..\..\Vision\AnalyzeForm\AnalyzeForm.csproj" />
<ProjectReference Include="..\..\Vision\HocrGenerator\HocrGenerator.csproj" />
Expand Down
1 change: 0 additions & 1 deletion Text/AcronymLinker/LinkAcronyms.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
using System.Collections.Generic;
using AzureCognitiveSearch.PowerSkills.Common;
using System.Linq;
using System;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;

Expand Down
59 changes: 59 additions & 0 deletions Text/Distinct/Distinct.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT License. See LICENSE file in the project root for full license information.

using System.Threading.Tasks;
using Microsoft.AspNetCore.Mvc;
using Microsoft.Azure.WebJobs;
using Microsoft.Azure.WebJobs.Extensions.Http;
using Microsoft.AspNetCore.Http;
using Microsoft.Extensions.Logging;
using System.Collections.Generic;
using AzureCognitiveSearch.PowerSkills.Common;
using Newtonsoft.Json.Linq;
using System;

namespace AzureCognitiveSearch.PowerSkills.Text.Distinct
{
/// <summary>
/// Azure Function host for the "distinct" custom skill, which de-duplicates a list of words
/// through a <see cref="Thesaurus"/>.
/// </summary>
public static class Distinct
{
    /// <summary>
    /// HTTP POST entry point of the skill. For each request record, reads the <c>words</c>
    /// array from the record data and writes the de-duplicated result to <c>distinct</c>.
    /// </summary>
    /// <param name="req">The incoming HTTP request carrying the skill's request records.</param>
    /// <param name="log">Logger used to trace the invocation.</param>
    /// <param name="executionContext">
    /// Function execution context; supplies the function name (used as the skill name) and the
    /// function app directory passed to the <see cref="Thesaurus"/> constructor.
    /// </param>
    /// <returns>
    /// A <see cref="BadRequestObjectResult"/> when no request records could be read from the
    /// request; otherwise an <see cref="OkObjectResult"/> wrapping the skill response.
    /// </returns>
    /// <exception cref="Exception">
    /// The thesaurus could not be constructed; the original failure is the inner exception.
    /// </exception>
    [FunctionName("distinct")]
    public static async Task<IActionResult> RunDistinct(
        [HttpTrigger(AuthorizationLevel.Function, "post", Route = null)] HttpRequest req,
        ILogger log,
        ExecutionContext executionContext)
    {
        log.LogInformation("Distinct Custom Skill: C# HTTP trigger function processed a request.");

        string skillName = executionContext.FunctionName;
        IEnumerable<WebApiRequestRecord> requestRecords = WebApiSkillHelpers.GetRequestRecords(req);
        if (requestRecords == null)
        {
            return new BadRequestObjectResult($"{skillName} - Invalid request record array.");
        }

        // Declared before the try block so it is still in scope for the record-processing
        // lambda below; the catch always throws, so it is definitely assigned afterwards.
        Thesaurus thesaurus;
        try
        {
            thesaurus = new Thesaurus(executionContext.FunctionAppDirectory);
        }
        catch (Exception e)
        {
            throw new Exception("Failed to read and parse thesaurus.json.", e);
        }

        WebApiSkillResponse response = WebApiSkillHelpers.ProcessRequestRecords(skillName, requestRecords,
            (inRecord, outRecord) =>
            {
                // `words` must be present and be a JSON array; anything else is rejected.
                JArray wordsParameter = inRecord.Data.TryGetValue("words", out object wordsParameterObject) ?
                    wordsParameterObject as JArray : null;
                if (wordsParameter is null)
                {
                    // NOTE(review): presumably ProcessRequestRecords reports this as a
                    // per-record error rather than failing the whole request; confirm in Common.
                    throw new ArgumentException("Input data is missing a `words` array of words to de-duplicate.", "words");
                }
                var words = wordsParameter.Values<string>();
                outRecord.Data["distinct"] = thesaurus.Dedupe(words);
                return outRecord;
            });

        return new OkObjectResult(response);
    }
}
}
29 changes: 29 additions & 0 deletions Text/Distinct/Distinct.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Build settings: targets .NET Core 2.1 with Azure Functions version v2. -->
<PropertyGroup>
<TargetFramework>netcoreapp2.1</TargetFramework>
<AzureFunctionsVersion>v2</AzureFunctionsVersion>
</PropertyGroup>
<!-- thesaurus.json is removed from the None items so it can be re-included as Content below. -->
<ItemGroup>
<None Remove="thesaurus.json" />
</ItemGroup>
<!-- Copy thesaurus.json to the output directory whenever it is newer than the existing copy. -->
<ItemGroup>
<Content Include="thesaurus.json">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
</ItemGroup>
<ItemGroup>
<PackageReference Include="Microsoft.NET.Sdk.Functions" Version="1.0.29" />
</ItemGroup>
<!-- Reference to the Common project in this repository. -->
<ItemGroup>
<ProjectReference Include="..\..\Common\Common.csproj" />
</ItemGroup>
<!-- host.json and local.settings.json are copied to the output directory; local.settings.json is never copied to the publish directory. -->
<ItemGroup>
<None Update="host.json">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="local.settings.json">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<CopyToPublishDirectory>Never</CopyToPublishDirectory>
</None>
</ItemGroup>
</Project>
102 changes: 102 additions & 0 deletions Text/Distinct/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
---
page_type: sample
languages:
- csharp
products:
- azure
- azure-search
name: Distinct sample skill for cognitive search
description: This custom skill removes duplicates from a list of terms.
azureDeploy: https://raw.githubusercontent.com/Azure-Samples/azure-search-power-skills/master/Text/Distinct/azuredeploy.json
---

# Distinct

This custom skill removes duplicates from a list of terms.

Terms are considered the same if they only differ by casing, separators such as spaces, or punctuation, or if they have a common entry in the thesaurus.

[![Deploy to Azure](https://azuredeploy.net/deploybutton.svg)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2FAzure-Samples%2Fazure-search-power-skills%2Fmaster%2FText%2FDistinct%2Fazuredeploy.json)

## Requirements

This skill has no requirements beyond those described in [the root `README.md` file](../../README.md).

## Settings

This function uses a JSON file called [`thesaurus.json`](./thesaurus.json) that can be found at the root of this project, and that will be deployed with the function. This file contains a simple list of lists of synonyms. For each list of synonyms, the first is considered the canonical form. Please replace this file with your own data.

## distinct

### Sample Input:

```json
{
"values": [
{
"recordId": "foobar2",
"data":
{
"words": [
"MSFT",
"U.S.A",
"word",
"United states",
"WOrD",
"Microsoft Corp."
]
}
}
]
}
```

### Sample Output:

```json
{
"values": [
{
"recordId": "foobar2",
"data": {
"distinct": {
"value": [
"Microsoft",
"USA",
"word"
]
}
},
"errors": [],
"warnings": []
}
]
}
```

## Sample Skillset Integration

In order to use this skill in a cognitive search pipeline, you'll need to add a skill definition to your skillset.
Here's a sample skill definition for this example (inputs and outputs should be updated to reflect your particular scenario and skillset environment):

```json
{
"@odata.type": "#Microsoft.Skills.Custom.WebApiSkill",
"description": "Distinct entities",
"uri": "[AzureFunctionEndpointUrl]/api/distinct?code=[AzureFunctionDefaultHostKey]",
"batchSize": 1,
"context": "/document/merged_content",
"inputs": [
{
"name": "words",
"source": "/document/merged_content/organizations"
}
],
"outputs": [
{
"name": "distinct",
"targetName": "distinct_organizations"
}
]
}
```
Loading

0 comments on commit 1fb537c

Please sign in to comment.