Skip to content

Commit

Permalink
Address CR comments.
Browse files Browse the repository at this point in the history
  • Loading branch information
bleroy committed Sep 5, 2019
1 parent 3267806 commit 539f8a6
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 8 deletions.
36 changes: 36 additions & 0 deletions Tests/PowerSkillTests/DistinctTests/DistinctTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
using AzureCognitiveSearch.PowerSkills.Text.Distinct;
using System.Collections.Generic;
using System.Linq;
using System;
using System.IO;

namespace AzureCognitiveSearch.PowerSkills.Tests.DistinctTests
{
Expand Down Expand Up @@ -48,5 +50,39 @@ public void DeduplicateYieldsListOfDistinctCanonicalForms(string text, string ex

Assert.IsTrue(expectedTerms.SequenceEqual(deduped), $"Expected [{ string.Join(", ", expectedTerms) }] but was [{ string.Join(", ", deduped) }].");
}

/// <summary>
/// Builds a <see cref="Thesaurus"/> from two non-empty lemmas and one empty lemma, then checks
/// that every normalized form maps to the first form of its lemma and that the empty lemma
/// contributes no entry.
/// </summary>
[TestMethod]
public void ThesaurusBuildsNormalizedSynonymToCanonicalFormDictionaryAndIgnoresEmptyLemmas()
{
    const string canonicalAcronym = "acronym";
    const string canonicalMicrosoft = "Microsoft";
    var synonyms = new Thesaurus(new[]
    {
        new[] { canonicalAcronym, "acornym", "acronyms" },
        Array.Empty<string>(),
        new[] { canonicalMicrosoft, "Microsoft Corporation", "Microsoft corp.", "MSFT" }
    }).Synonyms;

    // 3 forms of "acronym" + 4 forms of "Microsoft"; the empty lemma adds nothing.
    // Synonyms is a Dictionary, so read its Count property rather than enumerating via LINQ Count().
    Assert.AreEqual(7, synonyms.Count);

    // Keys are the normalized forms: lower-cased, with separators and punctuation removed.
    Assert.AreEqual(canonicalAcronym, synonyms["acronym"]);
    Assert.AreEqual(canonicalAcronym, synonyms["acornym"]);
    Assert.AreEqual(canonicalAcronym, synonyms["acronyms"]);
    Assert.AreEqual(canonicalMicrosoft, synonyms["microsoft"]);
    Assert.AreEqual(canonicalMicrosoft, synonyms["microsoftcorporation"]);
    Assert.AreEqual(canonicalMicrosoft, synonyms["microsoftcorp"]);
    Assert.AreEqual(canonicalMicrosoft, synonyms["msft"]);
}

/// <summary>
/// Asserts that constructing a <see cref="Thesaurus"/> from two lemmas that share a form
/// throws <see cref="InvalidDataException"/>.
/// </summary>
[TestMethod]
public void ThesaurusConstructorThrowsForDuplicateLemmas()
{
    // "bar" is listed under both the "foo" lemma and the "baz" lemma.
    string[][] conflictingLemmas =
    {
        new[] { "foo", "bar" },
        new[] { "baz", "bar" }
    };

    Action buildThesaurus = () =>
    {
        _ = new Thesaurus(conflictingLemmas);
    };

    Assert.ThrowsException<InvalidDataException>(buildThesaurus);
}
}
}
9 changes: 8 additions & 1 deletion Text/Distinct/Distinct.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
using System.Collections.Generic;
using AzureCognitiveSearch.PowerSkills.Common;
using Newtonsoft.Json.Linq;
using System;

namespace AzureCognitiveSearch.PowerSkills.Text.Distinct
{
Expand All @@ -34,7 +35,13 @@ public static async Task<IActionResult> RunDistinct(
WebApiSkillResponse response = WebApiSkillHelpers.ProcessRequestRecords(skillName, requestRecords,
(inRecord, outRecord) =>
{
var words = ((JArray)inRecord.Data["words"]).Values<string>();
JArray wordsParameter = inRecord.Data.TryGetValue("words", out object wordsParameterObject) ?
wordsParameterObject as JArray : null;
if (wordsParameter is null)
{
throw new ArgumentException("Input data is missing a `words` array of words to de-duplicate.", "words");
}
var words = wordsParameter.Values<string>();
outRecord.Data["distinct"] = thesaurus.Dedupe(words);
return outRecord;
});
Expand Down
2 changes: 2 additions & 0 deletions Text/Distinct/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ azureDeploy: https://raw.githubusercontent.com/Azure-Samples/azure-search-power-

This custom skill removes duplicates from a list of terms.

Terms are considered the same if they only differ by casing, separators such as spaces, or punctuation, or if they have a common entry in the thesaurus.

[![Deploy to Azure](https://azuredeploy.net/deploybutton.svg)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2FAzure-Samples%2Fazure-search-power-skills%2Fmaster%2FText%2FDistinct%2Fazuredeploy.json)

## Requirements
Expand Down
18 changes: 11 additions & 7 deletions Text/Distinct/Thesaurus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,25 @@ public Thesaurus(string executingDirectoryPath)

/// <summary>
/// Builds the <see cref="Synonyms"/> lookup from a dataset of lemmas.
/// Each lemma is a sequence of forms; its first form is used as the canonical form,
/// and every form (including the first) is stored under its <see cref="Normalize"/>d key.
/// Lemmas with no forms are skipped.
/// </summary>
/// <param name="dataset">The lemmas, each one a sequence of equivalent forms.</param>
/// <exception cref="InvalidDataException">
/// Thrown when a form normalizes to a key that is already present in <see cref="Synonyms"/>.
/// </exception>
// NOTE(review): this listing appears to interleave the removed and added lines of a diff
// (two consecutive Synonyms assignments, two Add calls per form) — confirm against the actual file.
public Thesaurus(IEnumerable<IEnumerable<string>> dataset)
{
Synonyms = new Dictionary<string, string>(StringComparer.InvariantCultureIgnoreCase);
Synonyms = new Dictionary<string, string>();
foreach (IEnumerable<string> lemma in dataset)
{
// An empty lemma has no canonical form, so there is nothing to register.
if (!lemma.Any()) continue;
string canonicalForm = lemma.First();
foreach (string form in lemma)
{
Synonyms.Add(Normalize(form), canonicalForm);
string normalizedForm = Normalize(form);
// Look the key up first so a collision is reported with both lemmas named in the message.
if (Synonyms.TryGetValue(normalizedForm, out string existingCanonicalForm))
{
throw new InvalidDataException(
$"Thesaurus parsing error: the form '{form}' of the lemma '{canonicalForm}' looks the same, once normalized, as one of the forms of '{existingCanonicalForm}'. Please disambiguate or merge lemmas.");
}
Synonyms.Add(normalizedForm, canonicalForm);
}
}
}

public Dictionary<string, string> Synonyms
{
get; private set;
}
/// <summary>
/// Lookup from a normalized form to the canonical form of its lemma; assigned in the constructor.
/// </summary>
public Dictionary<string, string> Synonyms { get; }

public IEnumerable<string> Dedupe(IEnumerable<string> words)
{
Expand All @@ -54,7 +58,7 @@ public IEnumerable<string> Dedupe(IEnumerable<string> words)
/// <summary>
/// Produces the comparison key for a word: applies <see cref="string.Normalize()"/>,
/// lower-cases the result, and drops every character for which
/// <see cref="char.IsPunctuation(char)"/> or <see cref="char.IsSeparator(char)"/> is true.
/// </summary>
/// <param name="word">The word to normalize.</param>
/// <returns>A new string containing the remaining characters in their original order.</returns>
// NOTE(review): both .ToLower() and .ToLowerInvariant() appear below; this looks like the
// removed and added lines of a diff shown together — confirm against the actual file.
public static string Normalize(string word)
=> new string(word
.Normalize()
.ToLower()
.ToLowerInvariant()
.Where(c => !(char.IsPunctuation(c) || char.IsSeparator(c)))
.ToArray());
}
Expand Down

0 comments on commit 539f8a6

Please sign in to comment.