From 32678064600f98f4e5b44c73aa4e9eed25d4d44e Mon Sep 17 00:00:00 2001 From: Bertrand Le Roy Date: Tue, 3 Sep 2019 15:46:37 -0700 Subject: [PATCH 1/3] Add a distinct skill to de-duplicate a list of terms --- PowerSkills.sln | 9 +- README.md | 1 + .../DistinctTests/DistinctTests.cs | 52 ++++++++ Tests/PowerSkillTests/PowerSkillTests.csproj | 1 + Text/AcronymLinker/LinkAcronyms.cs | 1 - Text/Distinct/Distinct.cs | 45 +++++++ Text/Distinct/Distinct.csproj | 29 +++++ Text/Distinct/README.md | 100 +++++++++++++++ Text/Distinct/Thesaurus.cs | 61 +++++++++ Text/Distinct/azuredeploy.json | 118 ++++++++++++++++++ Text/Distinct/host.json | 3 + Text/Distinct/local.settings.json | 7 ++ Text/Distinct/thesaurus.json | 4 + 13 files changed, 429 insertions(+), 2 deletions(-) create mode 100644 Tests/PowerSkillTests/DistinctTests/DistinctTests.cs create mode 100644 Text/Distinct/Distinct.cs create mode 100644 Text/Distinct/Distinct.csproj create mode 100644 Text/Distinct/README.md create mode 100644 Text/Distinct/Thesaurus.cs create mode 100644 Text/Distinct/azuredeploy.json create mode 100644 Text/Distinct/host.json create mode 100644 Text/Distinct/local.settings.json create mode 100644 Text/Distinct/thesaurus.json diff --git a/PowerSkills.sln b/PowerSkills.sln index 3a437905..0548d2d5 100644 --- a/PowerSkills.sln +++ b/PowerSkills.sln @@ -51,7 +51,9 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tokenizer", "Text\Tokenizer EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "PowerSkillTests", "Tests\PowerSkillTests\PowerSkillTests.csproj", "{6F3E1DE3-7C77-49ED-BF9C-1FED8B318386}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Common", "Common\Common.csproj", "{30F1ECE3-0B5D-4ACF-A375-B7C3721AF654}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Common", "Common\Common.csproj", "{30F1ECE3-0B5D-4ACF-A375-B7C3721AF654}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Distinct", "Text\Distinct\Distinct.csproj", 
"{63C364D1-FE52-45EC-B23E-20720201844A}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -103,6 +105,10 @@ Global {30F1ECE3-0B5D-4ACF-A375-B7C3721AF654}.Debug|Any CPU.Build.0 = Debug|Any CPU {30F1ECE3-0B5D-4ACF-A375-B7C3721AF654}.Release|Any CPU.ActiveCfg = Release|Any CPU {30F1ECE3-0B5D-4ACF-A375-B7C3721AF654}.Release|Any CPU.Build.0 = Release|Any CPU + {63C364D1-FE52-45EC-B23E-20720201844A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {63C364D1-FE52-45EC-B23E-20720201844A}.Debug|Any CPU.Build.0 = Debug|Any CPU + {63C364D1-FE52-45EC-B23E-20720201844A}.Release|Any CPU.ActiveCfg = Release|Any CPU + {63C364D1-FE52-45EC-B23E-20720201844A}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -117,6 +123,7 @@ Global {0D165CE1-66B6-47AE-ABA1-EF157C4E2884} = {10BE854F-F22A-4AE0-8283-688E10C76275} {F0B21155-829F-4B26-853B-82ECAAB18D23} = {F98E15F9-D6BB-41CE-8B5B-84E624B86203} {54AFF776-801F-4328-9EEE-CFFCD20B5497} = {F98E15F9-D6BB-41CE-8B5B-84E624B86203} + {63C364D1-FE52-45EC-B23E-20720201844A} = {F98E15F9-D6BB-41CE-8B5B-84E624B86203} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {3C22FE9E-7E11-429D-ADDE-0A220F285B90} diff --git a/README.md b/README.md index 84680d22..f1700f88 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ This project provides the following custom skills: * [**AnalyzeForm**](Vision/AnalyzeForm/README.md): recognizes form fields in a document. * [**CustomEntitySearch**](/Text/CustomEntitySearch): finds custom entity names in text. * [**Tokenizer**](Text/Tokenizer/README.md): extracts non-stop words from a text. +* [**Distinct**](Text/Distinct/README.md): de-duplicates a list of terms. 
## Getting Started diff --git a/Tests/PowerSkillTests/DistinctTests/DistinctTests.cs b/Tests/PowerSkillTests/DistinctTests/DistinctTests.cs new file mode 100644 index 00000000..8b24beb9 --- /dev/null +++ b/Tests/PowerSkillTests/DistinctTests/DistinctTests.cs @@ -0,0 +1,52 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT License. See LICENSE file in the project root for full license information. + +using Microsoft.VisualStudio.TestTools.UnitTesting; +using AzureCognitiveSearch.PowerSkills.Text.Distinct; +using System.Collections.Generic; +using System.Linq; + +namespace AzureCognitiveSearch.PowerSkills.Tests.DistinctTests +{ + [TestClass] + public class DistinctTests + { + [DataTestMethod] + [DataRow("Case insensitive díàcrïtiçs señsîtive AaBbCcÉéeÀàaÇçcÑñn", "caseinsensitivedíàcrïtiçsseñsîtiveaabbccééeààaççcññn")] + [DataRow("Spaces are removed", "spacesareremoved")] + [DataRow("Punctuation, as well; that is good...", "punctuationaswellthatisgood")] + public void NormalizeLowersCaseRemovesPunctuationAndSpaces(string word, string expectedNormalized) + => Assert.AreEqual(expectedNormalized.Normalize(), Thesaurus.Normalize(word)); + + private static readonly string[][] _synonyms + = new[] + { + new[] { "A.C.R.O.N.Y.M", "acornym", "acronyms" }, + new[] { "Microsoft", "Microsoft Corporation", "Microsoft corp.", "MSFT" } + }; + + [DataTestMethod] + [DataRow(new[] { "Acronym", "acornym", "a cro:n y.m "}, "A.C.R.O.N.Y.M")] + [DataRow(new[] { "miCrosoft", "microsoft Corp;", "M.S.F.T."}, "Microsoft")] + [DataRow(new[] { "Not found" }, "Not found")] + public void ThesaurusReturnsCanonicalFormOrWordIfNotFound(IEnumerable words, string expectedCanonical) + { + foreach (string word in words) + { + Assert.AreEqual(expectedCanonical, new Thesaurus(_synonyms).Dedupe(new[] { word }).First()); + } + } + + [DataTestMethod] + [DataRow( + "It is true that many acronyms are used at Microsoft. 
MSFT is no different as it's just an acronym for Microsoft.", + "It is true that many A.C.R.O.N.Y.M are used at Microsoft no different as it's just an for")] + public void DeduplicateYieldsListOfDistinctCanonicalForms(string text, string expectedDeduplicated) + { + var expectedTerms = expectedDeduplicated.Split(' ').OrderBy(term => term); + var deduped = new Thesaurus(_synonyms).Dedupe(text.Split(' ')).OrderBy(term => term); + + Assert.IsTrue(expectedTerms.SequenceEqual(deduped), $"Expected [{ string.Join(", ", expectedTerms) }] but was [{ string.Join(", ", deduped) }]."); + } + } +} diff --git a/Tests/PowerSkillTests/PowerSkillTests.csproj b/Tests/PowerSkillTests/PowerSkillTests.csproj index 96a86e41..2eb1b659 100644 --- a/Tests/PowerSkillTests/PowerSkillTests.csproj +++ b/Tests/PowerSkillTests/PowerSkillTests.csproj @@ -23,6 +23,7 @@ + diff --git a/Text/AcronymLinker/LinkAcronyms.cs b/Text/AcronymLinker/LinkAcronyms.cs index 15e198dd..b3597d78 100644 --- a/Text/AcronymLinker/LinkAcronyms.cs +++ b/Text/AcronymLinker/LinkAcronyms.cs @@ -9,7 +9,6 @@ using System.Collections.Generic; using AzureCognitiveSearch.PowerSkills.Common; using System.Linq; -using System; using Newtonsoft.Json; using Newtonsoft.Json.Linq; diff --git a/Text/Distinct/Distinct.cs b/Text/Distinct/Distinct.cs new file mode 100644 index 00000000..a0a38466 --- /dev/null +++ b/Text/Distinct/Distinct.cs @@ -0,0 +1,45 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT License. See LICENSE file in the project root for full license information. 
+ +using System.Threading.Tasks; +using Microsoft.AspNetCore.Mvc; +using Microsoft.Azure.WebJobs; +using Microsoft.Azure.WebJobs.Extensions.Http; +using Microsoft.AspNetCore.Http; +using Microsoft.Extensions.Logging; +using System.Collections.Generic; +using AzureCognitiveSearch.PowerSkills.Common; +using Newtonsoft.Json.Linq; + +namespace AzureCognitiveSearch.PowerSkills.Text.Distinct +{ + public static class Distinct + { + [FunctionName("distinct")] + public static async Task RunDistinct( + [HttpTrigger(AuthorizationLevel.Function, "post", Route = null)] HttpRequest req, + ILogger log, + ExecutionContext executionContext) + { + log.LogInformation("Distinct Custom Skill: C# HTTP trigger function processed a request."); + + string skillName = executionContext.FunctionName; + IEnumerable requestRecords = WebApiSkillHelpers.GetRequestRecords(req); + if (requestRecords == null) + { + return new BadRequestObjectResult($"{skillName} - Invalid request record array."); + } + + Thesaurus thesaurus = new Thesaurus(executionContext.FunctionAppDirectory); + WebApiSkillResponse response = WebApiSkillHelpers.ProcessRequestRecords(skillName, requestRecords, + (inRecord, outRecord) => + { + var words = ((JArray)inRecord.Data["words"]).Values(); + outRecord.Data["distinct"] = thesaurus.Dedupe(words); + return outRecord; + }); + + return new OkObjectResult(response); + } + } +} diff --git a/Text/Distinct/Distinct.csproj b/Text/Distinct/Distinct.csproj new file mode 100644 index 00000000..f96878cd --- /dev/null +++ b/Text/Distinct/Distinct.csproj @@ -0,0 +1,29 @@ + + + netcoreapp2.1 + v2 + + + + + + + PreserveNewest + + + + + + + + + + + PreserveNewest + + + PreserveNewest + Never + + + \ No newline at end of file diff --git a/Text/Distinct/README.md b/Text/Distinct/README.md new file mode 100644 index 00000000..be416ec3 --- /dev/null +++ b/Text/Distinct/README.md @@ -0,0 +1,100 @@ +--- +page_type: sample +languages: +- csharp +products: +- azure +- azure-search +name: Distinct 
sample skill for cognitive search +description: This custom skill removes duplicates from a list of terms. +azureDeploy: https://raw.githubusercontent.com/Azure-Samples/azure-search-power-skills/master/Text/Distinct/azuredeploy.json +--- + +# Distinct + +This custom skill removes duplicates from a list of terms. + +[![Deploy to Azure](https://azuredeploy.net/deploybutton.svg)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2FAzure-Samples%2Fazure-search-power-skills%2Fmaster%2FText%2FDistinct%2Fazuredeploy.json) + +## Requirements + +This skill has no requirements beyond the ones described in [the root `README.md` file](../../README.md). + +## Settings + +This function uses a JSON file called [`thesaurus.json`](./thesaurus.json) that can be found at the root of this project, and that will be deployed with the function. This file contains a simple list of lists of synonyms. For each list of synonyms, the first is considered the canonical form. Please replace this file with your own data. + +## distinct + +### Sample Input: + +```json +{ + "values": [ + { + "recordId": "foobar2", + "data": + { + "words": [ + "MSFT", + "U.S.A", + "word", + "United states", + "WOrD", + "Microsoft Corp." + ] + } + } + ] +} +``` + +### Sample Output: + +```json +{ + "values": [ + { + "recordId": "foobar2", + "data": { + "distinct": { + "value": [ + "Microsoft", + "USA", + "word" + ] + } + }, + "errors": [], + "warnings": [] + } + ] +} +``` + +## Sample Skillset Integration + +In order to use this skill in a cognitive search pipeline, you'll need to add a skill definition to your skillset. 
+Here's a sample skill definition for this example (inputs and outputs should be updated to reflect your particular scenario and skillset environment): + +```json +{ + "@odata.type": "#Microsoft.Skills.Custom.WebApiSkill", + "description": "Distinct entities", + "uri": "[AzureFunctionEndpointUrl]/api/distinct?code=[AzureFunctionDefaultHostKey]", + "batchSize": 1, + "context": "/document/merged_content", + "inputs": [ + { + "name": "words", + "source": "/document/merged_content/organizations" + } + ], + "outputs": [ + { + "name": "distinct", + "targetName": "distinct_organizations" + } + ] +} +``` diff --git a/Text/Distinct/Thesaurus.cs b/Text/Distinct/Thesaurus.cs new file mode 100644 index 00000000..2d7dd3e1 --- /dev/null +++ b/Text/Distinct/Thesaurus.cs @@ -0,0 +1,61 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT License. See LICENSE file in the project root for full license information. + +using Newtonsoft.Json; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; + +namespace AzureCognitiveSearch.PowerSkills.Text.Distinct +{ + public class Thesaurus + { + public Thesaurus(string executingDirectoryPath) + : this(JsonConvert.DeserializeObject>>( + File.ReadAllText(Path.Combine(executingDirectoryPath, "thesaurus.json")))) + { } + + public Thesaurus(IEnumerable> dataset) + { + Synonyms = new Dictionary(StringComparer.InvariantCultureIgnoreCase); + foreach (IEnumerable lemma in dataset) + { + string canonicalForm = lemma.First(); + foreach (string form in lemma) + { + Synonyms.Add(Normalize(form), canonicalForm); + } + } + } + + public Dictionary Synonyms + { + get; private set; + } + + public IEnumerable Dedupe(IEnumerable words) + { + var normalizedToWord = new Dictionary(); + foreach (string word in words) + { + string normalized = Normalize(word); + string canonical = Synonyms.TryGetValue(normalized, out string canonicalFromThesaurus) ? 
+ canonicalFromThesaurus : + normalized; + if (!normalizedToWord.ContainsKey(canonical)) + { + normalizedToWord.Add(canonical, canonicalFromThesaurus ?? word); // Arbitrarily consider the first occurrence as canonical + } + } + return normalizedToWord.Values.Distinct(); + } + + public static string Normalize(string word) + => new string(word + .Normalize() + .ToLower() + .Where(c => !(char.IsPunctuation(c) || char.IsSeparator(c))) + .ToArray()); + } +} diff --git a/Text/Distinct/azuredeploy.json b/Text/Distinct/azuredeploy.json new file mode 100644 index 00000000..ddf46392 --- /dev/null +++ b/Text/Distinct/azuredeploy.json @@ -0,0 +1,118 @@ +{ + "$schema": "http://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "resourcePrefix": { + "type": "string", + "defaultValue": "psd", + "metadata": { + "description": "Prefix for all resources created by this template" + } + }, + "storageAccountType": { + "type": "string", + "defaultValue": "Standard_LRS", + "allowedValues": [ + "Standard_LRS", + "Standard_GRS", + "Standard_ZRS", + "Premium_LRS" + ], + "metadata": { + "description": "Storage Account type" + } + } + }, + "variables": { + "repoURL": "https://github.com/Azure-Samples/azure-search-power-skills", + "functionProject": "Text\\Distinct\\Distinct.csproj", + "websiteName": "[toLower(concat(parameters('resourcePrefix'), '-site-', uniqueString(resourceGroup().id)))]", + "storageAccount": "[toLower(concat(parameters('resourcePrefix'), uniqueString(resourceGroup().id)))]", + "functionAppName": "[toLower(concat(parameters('resourcePrefix'), '-function-app-', uniqueString(resourceGroup().id)))]" + }, + "resources": [ + { + "apiVersion": "2017-10-01", + "name": "[variables('storageAccount')]", + "type": "Microsoft.Storage/storageAccounts", + "location": "[resourceGroup().location]", + "sku": { + "name": "[parameters('storageAccountType')]" + }, + "kind": "Storage" + }, + { + "apiVersion": 
"2016-08-01", + "name": "[variables('functionAppName')]", + "type": "Microsoft.Web/sites", + "kind": "functionapp", + "location": "[resourceGroup().location]", + "dependsOn": [ + "[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccount'))]" + ], + "properties": { + "name": "[variables('functionAppName')]", + "kind": "functionapp", + "httpsOnly": true, + "siteConfig": { + "appSettings": [ + { + "name": "WEBSITE_CONTENTAZUREFILECONNECTIONSTRING", + "value": "[concat('DefaultEndpointsProtocol=https;AccountName=', variables('storageAccount'), ';AccountKey=', listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccount')), '2015-06-15').key1)]" + }, + { + "name": "WEBSITE_CONTENTSHARE", + "value": "[toLower(variables('functionAppName'))]" + }, + { + "name": "AzureWebJobsDashboard", + "value": "[concat('DefaultEndpointsProtocol=https;AccountName=', variables('storageAccount'), ';AccountKey=', listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccount')), '2015-06-15').key1)]" + }, + { + "name": "AzureWebJobsStorage", + "value": "[concat('DefaultEndpointsProtocol=https;AccountName=', variables('storageAccount'), ';AccountKey=', listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccount')), '2015-06-15').key1)]" + }, + { + "name": "FUNCTIONS_EXTENSION_VERSION", + "value": "~2" + }, + { + "name": "FUNCTIONS_EXTENSION_RUNTIME", + "value": "dotnet" + }, + { + "name": "SCM_DO_BUILD_DURING_DEPLOYMENT", + "value": true + }, + { + "name": "PROJECT", + "value": "[variables('functionProject')]" + } + ] + } + }, + "resources": [ + { + "apiVersion": "2015-08-01", + "name": "web", + "type": "sourcecontrols", + "dependsOn": [ + "[resourceId('Microsoft.Web/Sites', variables('functionAppName'))]" + ], + "properties": { + "RepoUrl": "[variables('repoURL')]", + "branch": "master", + "project": "[variables('functionProject')]", + "IsManualIntegration": true + } + } + ] + } + ], + "outputs": { + "Azure 
Function Site Name": { + "type": "string", + "value": "[variables('functionAppName')]" + } + } +} \ No newline at end of file diff --git a/Text/Distinct/host.json b/Text/Distinct/host.json new file mode 100644 index 00000000..b9f92c0d --- /dev/null +++ b/Text/Distinct/host.json @@ -0,0 +1,3 @@ +{ + "version": "2.0" +} \ No newline at end of file diff --git a/Text/Distinct/local.settings.json b/Text/Distinct/local.settings.json new file mode 100644 index 00000000..bf70960e --- /dev/null +++ b/Text/Distinct/local.settings.json @@ -0,0 +1,7 @@ +{ + "IsEncrypted": false, + "Values": { + "AzureWebJobsStorage": "", + "FUNCTIONS_WORKER_RUNTIME": "dotnet" + } +} \ No newline at end of file diff --git a/Text/Distinct/thesaurus.json b/Text/Distinct/thesaurus.json new file mode 100644 index 00000000..0ce16577 --- /dev/null +++ b/Text/Distinct/thesaurus.json @@ -0,0 +1,4 @@ +[ + ["Microsoft", "Microsoft Corporation", "Microsoft corp.", "MSFT"], + [ "USA", "United States", "United States of America" ] +] \ No newline at end of file From 539f8a61041924d611bc33bf1972bdbbdc8ad81f Mon Sep 17 00:00:00 2001 From: Bertrand Le Roy Date: Thu, 5 Sep 2019 16:51:51 -0700 Subject: [PATCH 2/3] Address CR comments. 
--- .../DistinctTests/DistinctTests.cs | 36 +++++++++++++++++++ Text/Distinct/Distinct.cs | 9 ++++- Text/Distinct/README.md | 2 ++ Text/Distinct/Thesaurus.cs | 18 ++++++---- 4 files changed, 57 insertions(+), 8 deletions(-) diff --git a/Tests/PowerSkillTests/DistinctTests/DistinctTests.cs b/Tests/PowerSkillTests/DistinctTests/DistinctTests.cs index 8b24beb9..eebdb34b 100644 --- a/Tests/PowerSkillTests/DistinctTests/DistinctTests.cs +++ b/Tests/PowerSkillTests/DistinctTests/DistinctTests.cs @@ -5,6 +5,8 @@ using AzureCognitiveSearch.PowerSkills.Text.Distinct; using System.Collections.Generic; using System.Linq; +using System; +using System.IO; namespace AzureCognitiveSearch.PowerSkills.Tests.DistinctTests { @@ -48,5 +50,39 @@ public void DeduplicateYieldsListOfDistinctCanonicalForms(string text, string ex Assert.IsTrue(expectedTerms.SequenceEqual(deduped), $"Expected [{ string.Join(", ", expectedTerms) }] but was [{ string.Join(", ", deduped) }]."); } + + [TestMethod] + public void ThesaurusBuildsNormalizedSynonymToCanonicalFormDictionaryAndIgnoresEmptyLemmas() + { + const string canonicalAcronym = "acronym"; + const string canonicalMicrosoft = "Microsoft"; + var synonyms = new Thesaurus(new[] + { + new[] { canonicalAcronym, "acornym", "acronyms" }, + Array.Empty(), + new[] { canonicalMicrosoft, "Microsoft Corporation", "Microsoft corp.", "MSFT" } + }).Synonyms; + + Assert.AreEqual(7, synonyms.Count()); + Assert.AreEqual(canonicalAcronym, synonyms["acronym"]); + Assert.AreEqual(canonicalAcronym, synonyms["acornym"]); + Assert.AreEqual(canonicalAcronym, synonyms["acronyms"]); + Assert.AreEqual(canonicalMicrosoft, synonyms["microsoft"]); + Assert.AreEqual(canonicalMicrosoft, synonyms["microsoftcorporation"]); + Assert.AreEqual(canonicalMicrosoft, synonyms["microsoftcorp"]); + Assert.AreEqual(canonicalMicrosoft, synonyms["msft"]); + } + + [TestMethod] + public void ThesaurusConstructorThrowsForDuplicateLemmas() + { + Assert.ThrowsException(() => { + _ = new 
Thesaurus(new[] + { + new[] {"foo", "bar"}, + new[] {"baz", "bar"} + }); + }); + } } } diff --git a/Text/Distinct/Distinct.cs b/Text/Distinct/Distinct.cs index a0a38466..522e0cd0 100644 --- a/Text/Distinct/Distinct.cs +++ b/Text/Distinct/Distinct.cs @@ -10,6 +10,7 @@ using System.Collections.Generic; using AzureCognitiveSearch.PowerSkills.Common; using Newtonsoft.Json.Linq; +using System; namespace AzureCognitiveSearch.PowerSkills.Text.Distinct { @@ -34,7 +35,13 @@ public static async Task RunDistinct( WebApiSkillResponse response = WebApiSkillHelpers.ProcessRequestRecords(skillName, requestRecords, (inRecord, outRecord) => { - var words = ((JArray)inRecord.Data["words"]).Values(); + JArray wordsParameter = inRecord.Data.TryGetValue("words", out object wordsParameterObject) ? + wordsParameterObject as JArray : null; + if (wordsParameter is null) + { + throw new ArgumentException("Input data is missing a `words` array of words to de-duplicate.", "words"); + } + var words = wordsParameter.Values(); outRecord.Data["distinct"] = thesaurus.Dedupe(words); return outRecord; }); diff --git a/Text/Distinct/README.md b/Text/Distinct/README.md index be416ec3..bb7598d4 100644 --- a/Text/Distinct/README.md +++ b/Text/Distinct/README.md @@ -14,6 +14,8 @@ azureDeploy: https://raw.githubusercontent.com/Azure-Samples/azure-search-power- This custom skill removes duplicates from a list of terms. +Terms are considered the same if they only differ by casing, separators such as spaces, or punctuation, or if they have a common entry in the thesaurus. 
+ [![Deploy to Azure](https://azuredeploy.net/deploybutton.svg)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2FAzure-Samples%2Fazure-search-power-skills%2Fmaster%2FText%2FDistinct%2Fazuredeploy.json) ## Requirements diff --git a/Text/Distinct/Thesaurus.cs b/Text/Distinct/Thesaurus.cs index 2d7dd3e1..f6623619 100644 --- a/Text/Distinct/Thesaurus.cs +++ b/Text/Distinct/Thesaurus.cs @@ -18,21 +18,25 @@ public Thesaurus(string executingDirectoryPath) public Thesaurus(IEnumerable> dataset) { - Synonyms = new Dictionary(StringComparer.InvariantCultureIgnoreCase); + Synonyms = new Dictionary(); foreach (IEnumerable lemma in dataset) { + if (!lemma.Any()) continue; string canonicalForm = lemma.First(); foreach (string form in lemma) { - Synonyms.Add(Normalize(form), canonicalForm); + string normalizedForm = Normalize(form); + if (Synonyms.TryGetValue(normalizedForm, out string existingCanonicalForm)) + { + throw new InvalidDataException( + $"Thesaurus parsing error: the form '{form}' of the lemma '{canonicalForm}' looks the same, once normalized, as one of the forms of '{existingCanonicalForm}'. Please disambiguate or merge lemmas."); + } + Synonyms.Add(normalizedForm, canonicalForm); } } } - public Dictionary Synonyms - { - get; private set; - } + public Dictionary Synonyms { get; } public IEnumerable Dedupe(IEnumerable words) { @@ -54,7 +58,7 @@ public IEnumerable Dedupe(IEnumerable words) public static string Normalize(string word) => new string(word .Normalize() - .ToLower() + .ToLowerInvariant() .Where(c => !(char.IsPunctuation(c) || char.IsSeparator(c))) .ToArray()); } From d02a991475cae82e43f7bc9ac90f7853fa8d553c Mon Sep 17 00:00:00 2001 From: Bertrand Le Roy Date: Thu, 5 Sep 2019 16:56:22 -0700 Subject: [PATCH 3/3] Handle exceptions while reading the thesaurus. 
--- Text/Distinct/Distinct.cs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Text/Distinct/Distinct.cs b/Text/Distinct/Distinct.cs index 522e0cd0..dc76df32 100644 --- a/Text/Distinct/Distinct.cs +++ b/Text/Distinct/Distinct.cs @@ -31,7 +31,16 @@ public static async Task RunDistinct( return new BadRequestObjectResult($"{skillName} - Invalid request record array."); } - Thesaurus thesaurus = new Thesaurus(executionContext.FunctionAppDirectory); + // Declared before the try block so it remains in scope for the record-processing lambda below. + Thesaurus thesaurus; + try + { + thesaurus = new Thesaurus(executionContext.FunctionAppDirectory); + } + catch(Exception e) + { + throw new Exception("Failed to read and parse thesaurus.json.", e); + } WebApiSkillResponse response = WebApiSkillHelpers.ProcessRequestRecords(skillName, requestRecords, (inRecord, outRecord) => {