From ce8ed99dda25d8315d4117f2cce3e7e2d1a0d016 Mon Sep 17 00:00:00 2001 From: LionbridgeCS <50670696+LionbridgeCS@users.noreply.github.com> Date: Wed, 19 Jun 2019 07:53:49 +0300 Subject: [PATCH] Support for Italian Number in .NET (#1604) * Support for Italian Number in .NET * Skipping 3 test cases that currently don't pass * Fix review feedback * Removed commented out code, added Italian-specific comment --- .../Chinese/URLDefinitions.cs | 2 +- .../Italian/NumbersDefinitions.cs | 322 ++++--- .../ItalianNumberParserConfiguration.cs | 68 +- .../NumberRecognizer.cs | 3 +- Patterns/Italian/Italian-Numbers.yaml | 392 ++++---- Specs/Number/Italian/NumberModel.json | 834 ++++++++++++------ Specs/Number/Italian/OrdinalModel.json | 167 +++- Specs/Number/Italian/PercentModel.json | 53 +- 8 files changed, 1218 insertions(+), 623 deletions(-) diff --git a/.NET/Microsoft.Recognizers.Definitions.Common/Chinese/URLDefinitions.cs b/.NET/Microsoft.Recognizers.Definitions.Common/Chinese/URLDefinitions.cs index af47044c37..d2e6ec979f 100644 --- a/.NET/Microsoft.Recognizers.Definitions.Common/Chinese/URLDefinitions.cs +++ b/.NET/Microsoft.Recognizers.Definitions.Common/Chinese/URLDefinitions.cs @@ -26,4 +26,4 @@ public static class URLDefinitions public static readonly string UrlRegex = $@"{UrlPrefixRegex}(?[a-zA-Z]{{2,18}}){BaseURL.UrlSuffixRegex}"; public static readonly string IpUrlRegex = $@"(?({ExtractionRestrictionRegex}{BaseURL.ProtocolRegex}({BaseIp.Ipv4Regex}|localhost){BaseURL.UrlSuffixRegex}))"; } -} \ No newline at end of file +} diff --git a/.NET/Microsoft.Recognizers.Definitions.Common/Italian/NumbersDefinitions.cs b/.NET/Microsoft.Recognizers.Definitions.Common/Italian/NumbersDefinitions.cs index fd733c1634..a42aa15f43 100644 --- a/.NET/Microsoft.Recognizers.Definitions.Common/Italian/NumbersDefinitions.cs +++ b/.NET/Microsoft.Recognizers.Definitions.Common/Italian/NumbersDefinitions.cs @@ -24,68 +24,66 @@ public static class NumbersDefinitions public const string LangMarker = @"Ita"; public const bool CompoundNumberLanguage = true; public const bool MultiDecimalSeparatorCulture = false; - public const string RoundNumberIntegerRegex = @"(cento|mille|milione|milioni|miliardo|miliardi|trilione|trilioni)"; - public const string ZeroToNineIntegerRegex = @"(e uno|un|uno|una|due|tre|quattro|cinque|sei|sette|otto|nove|zero)"; - public const string TenToNineteenIntegerRegex = @"(diciassette|tredici|quattordici|diciotto|diciannove|quindici|sedici|undici|dodici|dieci)"; - public const string TensNumberIntegerRegex = @"(settanta|venti|trenta|ottanta|novanta|quaranta|cinquanta|sessanta)"; public const string DigitsNumberRegex = @"\d|\d{1,3}(\.\d{3})"; - public static readonly string HundredsNumberIntegerRegex = $@"(({ZeroToNineIntegerRegex}(\s+cento))|cento|((\s+cento\s)+{TensNumberIntegerRegex}))"; - public static readonly string BelowHundredsRegex = $@"(({TenToNineteenIntegerRegex}|({TensNumberIntegerRegex}(\W+{ZeroToNineIntegerRegex})?))|{ZeroToNineIntegerRegex})"; - public static readonly string BelowThousandsRegex = $@"(({HundredsNumberIntegerRegex}(\s+{BelowHundredsRegex})?|{BelowHundredsRegex}|{TenToNineteenIntegerRegex})|cento\s+{TenToNineteenIntegerRegex})"; - public static readonly string SupportThousandsRegex = $@"(({BelowThousandsRegex}|{BelowHundredsRegex})\s+{RoundNumberIntegerRegex}(\s+{RoundNumberIntegerRegex})?)"; - public const string NegativeNumberTermsRegex = @"^[.]"; - public static readonly string NegativeNumberSignRegex = $@"^({NegativeNumberTermsRegex}\s+).*"; - public static readonly string SeparaIntRegex = $@"({SupportThousandsRegex}(\s+{SupportThousandsRegex})*(\s+{BelowThousandsRegex})?|{BelowThousandsRegex})"; - public static readonly string AllIntRegex = $@"({SeparaIntRegex}|mille(\s+{BelowThousandsRegex})?)"; + public const string RoundNumberIntegerRegex = @"(cento?|mille?|mila|milion[ei]?|miliard[oi]?|bilion[ei]?|trilion[ei]?)"; + public const string ZeroToNineIntegerRegex = @"(un[oa]?|due?|tre?|quattro?|cinque?|sei|sette?|otto?|nove?|zero)"; + public const string TwoToNineIntegerRegex = @"(due?|tre?|quattro?|cinque?|sei|sette?|otto?|nove?)"; + public const string NegativeNumberTermsRegex = @"(meno\s+)"; + public static readonly string NegativeNumberSignRegex = $@"^{NegativeNumberTermsRegex}.*"; + public const string AnIntRegex = @"(un)(?=\s)"; + public const string TenToNineteenIntegerRegex = @"(diciassette?|tredici?|quattordici?|diciotto?|diciannove?|quindici?|sedici?|undici?|dodici?|dieci?)"; + public const string TensNumberIntegerRegex = @"(settanta?|venti?|trenta?|ottanta?|novanta?|quaranta?|cinquanta?|sessanta?)"; + public static readonly string SeparaIntRegex = $@"((({TenToNineteenIntegerRegex}|({TensNumberIntegerRegex}{ZeroToNineIntegerRegex})|{TensNumberIntegerRegex}|{ZeroToNineIntegerRegex})(\s*{RoundNumberIntegerRegex})*))|((({AnIntRegex})?(\s*{RoundNumberIntegerRegex})+))"; + public static readonly string AllIntRegex = $@"(((({TenToNineteenIntegerRegex}|({TensNumberIntegerRegex}{ZeroToNineIntegerRegex})|{TensNumberIntegerRegex}|{ZeroToNineIntegerRegex}|({AnIntRegex})?)(\s*{RoundNumberIntegerRegex})+)\s*(e\s+)?)*{SeparaIntRegex})"; + public const string PlaceHolderPureNumber = @"\b"; + public const string PlaceHolderDefault = @"\D|\b"; public static readonly Func NumbersWithPlaceHolder = (placeholder) => $@"(((?<=\W|^)-\s*)|(?<=\b))\d+(?!(,\d+[a-zA-Z]))(?={placeholder})"; public static readonly string NumbersWithSuffix = $@"(((?<=\W|^)-\s*)|(?<=\b))\d+\s*{BaseNumbers.NumberMultiplierRegex}(?=\b)"; public static readonly string RoundNumberIntegerRegexWithLocks = $@"(?<=\b)({DigitsNumberRegex})+\s+{RoundNumberIntegerRegex}(?=\b)"; - public const string NumbersWithDozenSuffix = @"(((?({AllIntRegex})|((?({AllIntRegex})|(\d+)(?!\.))(?=\b)"; + public static readonly string FractionNounRegex = $@"(?<=\b)({AllIntRegex}\s+(e\s+)?)?({AllIntRegex})(\s+|\s*-\s*)(?!\bprimo\b|\bsecondo\b)(mezzi|({AllOrdinalRegex})|({RoundNumberOrdinalRegex}))(?=\b)"; + public static readonly string FractionNounWithArticleRegex = $@"(?<=\b)({AllIntRegex}\s+(e\s+)?)?(un)(\s+|\s*-\s*)(?!\bprimo\b|\bsecondo\b)(mezzo|({AllOrdinalRegex})|({RoundNumberOrdinalRegex}))(?=\b)"; + public static readonly string FractionPrepositionRegex = $@"(?<=\b)(?({AllIntRegex})|((?({AllIntRegex})|(\d+)(?!\.))(?=\b)"; public static readonly string AllPointRegex = $@"((\s+{ZeroToNineIntegerRegex})+|(\s+{SeparaIntRegex}))"; - public static readonly string AllFloatRegex = $@"({AllIntRegex}(\s+(virgule|point)){AllPointRegex})"; + public static readonly string AllFloatRegex = $@"({AllIntRegex}(\s+(virgola|punto)){AllPointRegex})"; public static readonly Func DoubleDecimalPointRegex = (placeholder) => $@"(((? DoubleWithoutIntegralRegex = (placeholder) => $@"(?<=\s|^)(?e)"; public const string DoubleExponentialNotationRegex = @"(((? CardinalNumberMap = new Dictionary { { @"zero", 0 }, @@ -113,24 +111,32 @@ public static class NumbersDefinitions { @"diciotto", 18 }, { @"diciannove", 19 }, { @"venti", 20 }, + { @"vent", 20 }, { @"trenta", 30 }, + { @"trent", 30 }, { @"quaranta", 40 }, + { @"quarant", 40 }, { @"cinquanta", 50 }, + { @"cinquant", 50 }, { @"sessanta", 60 }, + { @"sessant", 60 }, { @"settanta", 70 }, + { @"settant", 70 }, { @"ottanta", 80 }, + { @"ottant", 80 }, { @"novanta", 90 }, + { @"novant", 90 }, { @"cento", 100 }, { @"mille", 1000 }, - { @"un milione", 1000000 }, + { @"mila", 1000 }, { @"milione", 1000000 }, { @"milioni", 1000000 }, - { @"un miliardo", 1000000000 }, { @"miliardo", 1000000000 }, { @"miliardi", 1000000000 }, - { @"un trilione", 1000000000000 }, - { @"trilione", 1000000000000 }, - { @"trilioni", 1000000000000 } + { @"bilione", 1000000000000 }, + { @"bilioni", 1000000000000 }, + { @"trilione", 1000000000000000000 }, + { @"trilioni", 1000000000000000000 } }; public static readonly Dictionary OrdinalNumberMap = new Dictionary { @@ -139,6 +145,8 @@ public static class NumbersDefinitions { @"secondo", 2 }, { @"seconda", 2 }, { @"metà", 2 }, + { @"mezzo", 2 }, + { @"mezza", 2 }, { @"terzo", 3 }, { @"terza", 3 }, { @"quarto", 4 }, @@ -163,8 +171,8 @@ public static class NumbersDefinitions { @"tredicesima", 13 }, { @"quattordicesimo", 14 }, { @"quattordicesima", 14 }, - { @"quindicisimo", 15 }, - { @"quindicisima", 15 }, + { @"quindicesimo", 15 }, + { @"quindicesima", 15 }, { @"sedicesimo", 16 }, { @"sedicesima", 16 }, { @"diciassettesimo", 17 }, @@ -199,12 +207,16 @@ public static class NumbersDefinitions { @"milionesima", 1000000 }, { @"miliardesimo", 1000000000 }, { @"miliardesima", 1000000000 }, - { @"trilionesimo", 1000000000000 }, - { @"trilionesima", 1000000000000 }, + { @"bilionesimo", 1000000000000 }, + { @"bilionesima", 1000000000000 }, + { @"trilionesimo", 1000000000000000000 }, + { @"trilionesima", 1000000000000000000 }, { @"primi", 1 }, { @"prime", 1 }, { @"secondi", 2 }, { @"seconde", 2 }, + { @"mezzi", 2 }, + { @"mezze", 2 }, { @"terzi", 3 }, { @"terze", 3 }, { @"quarti", 4 }, @@ -219,6 +231,42 @@ public static class NumbersDefinitions { @"ottave", 8 }, { @"noni", 9 }, { @"none", 9 }, + { @"unesimo", 1 }, + { @"unesima", 1 }, + { @"unesime", 1 }, + { @"unesimi", 1 }, + { @"duesimo", 2 }, + { @"duesima", 2 }, + { @"duesime", 2 }, + { @"duesimi", 2 }, + { @"treesimo", 3 }, + { @"treesima", 3 }, + { @"treesime", 3 }, + { @"treesimi", 3 }, + { @"quattresimo", 4 }, + { @"quattresima", 4 }, + { @"quattresime", 4 }, + { @"quattresimi", 4 }, + { @"cinquesimo", 5 }, + { @"cinquesima", 5 }, + { @"cinquesime", 5 }, + { @"cinquesimi", 5 }, + { @"seiesimo", 6 }, + { @"seiesima", 6 }, + { @"seiesime", 6 }, + { @"seiesimi", 6 }, + { @"settesimo", 7 }, + { @"settesima", 7 }, + { @"settesime", 7 }, + { @"settesimi", 7 }, + { @"ottesimo", 8 }, + { @"ottesima", 8 }, + { @"ottesime", 8 }, + { @"ottesimi", 8 }, + { @"novesimo", 9 }, + { @"novesima", 9 }, + { @"novesime", 9 }, + { @"novesimi", 9 }, { @"decimi", 10 }, { @"decime", 10 }, { @"undicesimi", 11 }, @@ -261,107 +309,55 @@ public static class NumbersDefinitions { @"millesime", 1000 }, { @"milionesimi", 1000000 }, { @"milionesime", 1000000 }, - { @"miliardersimi", 1000000000 }, - { @"miliardersime", 1000000000 }, - { @"trilioneisimi", 1000000000000 }, - { @"trilionesime", 1000000000000 } - }; - public static readonly Dictionary PrefixCardinalMap = new Dictionary - { - { @"due", 2 }, - { @"tre", 3 }, - { @"quattro", 4 }, - { @"cinque", 5 }, - { @"sei", 6 }, - { @"sette", 7 }, - { @"otto", 8 }, - { @"nove", 9 }, - { @"dieci", 10 }, - { @"undici", 11 }, - { @"dodici", 12 }, - { @"tredici", 13 }, - { @"quattordici", 14 }, - { @"quindici", 15 }, - { @"sedici", 16 }, - { @"diciassette", 17 }, - { @"diciotto", 18 }, - { @"diciannove", 19 }, - { @"venti", 20 }, - { @"ventuno", 21 }, - { @"ventidue", 22 }, - { @"ventitre", 23 }, - { @"ventiquattro", 24 }, - { @"venticinque", 25 }, - { @"ventisei", 26 }, - { @"ventisette", 27 }, - { @"ventotto", 28 }, - { @"ventinove", 29 }, - { @"trenta", 30 }, - { @"trentuno", 31 }, - { @"quaranta", 40 }, - { @"cinquanta", 50 }, - { @"sessanta", 60 }, - { @"settanta", 70 }, - { @"ottanta", 80 }, - { @"novanta", 90 }, - { @"cento", 100 }, - { @"due cento", 200 }, - { @"duecento", 200 }, - { @"tre cento", 300 }, - { @"trecento", 300 }, - { @"quattro cento", 400 }, - { @"quattrocento", 400 }, - { @"cinque cento", 500 }, - { @"cinquecento", 500 }, - { @"sei cento", 600 }, - { @"seicento", 600 }, - { @"sette cento", 700 }, - { @"settecento", 700 }, - { @"otto cento", 800 }, - { @"ottocento", 800 }, - { @"nove cento", 900 }, - { @"novecento", 900 } - }; - public static readonly Dictionary SuffixOrdinalMap = new Dictionary - { - { @"mille", 1000 }, - { @"milione", 1000000 }, - { @"miliardo", 1000000000 } + { @"miliardesimi", 1000000000 }, + { @"miliardesime", 1000000000 }, + { @"bilionesimi", 1000000000000 }, + { @"bilionesime", 1000000000000 }, + { @"trilionesimi", 1000000000000000000 }, + { @"trilionesime", 1000000000000000000 } }; public static readonly Dictionary RoundNumberMap = new Dictionary { { @"cento", 100 }, { @"mille", 1000 }, + { @"mila", 1000 }, { @"milione", 1000000 }, { @"milioni", 1000000 }, { @"miliardo", 1000000000 }, { @"miliardi", 1000000000 }, - { @"trilione", 1000000000000 }, - { @"trilioni", 1000000000000 }, + { @"bilione", 1000000000000 }, + { @"bilioni", 1000000000000 }, + { @"trilione", 1000000000000000000 }, + { @"trilioni", 1000000000000000000 }, { @"centinaio", 100 }, { @"centinai", 100 }, { @"centinaie", 100 }, { @"millesimo", 1000 }, { @"milionesimo", 1000000 }, - { @"miliardersimo", 1000000000 }, - { @"trilioneisimo", 1000000000000 }, + { @"miliardesimo", 1000000000 }, + { @"bilionesimo", 1000000000000 }, + { @"trilionesimo", 1000000000000000000 }, { @"millesima", 1000 }, { @"milionesima", 1000000 }, - { @"miliardersima", 1000000000 }, - { @"trilioneisima", 1000000000000 }, + { @"miliardesima", 1000000000 }, + { @"bilionesima", 1000000000000 }, + { @"trilionesima", 1000000000000000000 }, { @"millesimi", 1000 }, { @"milionesimi", 1000000 }, - { @"miliardersimi", 1000000000 }, - { @"trilioneisimi", 1000000000000 }, + { @"miliardesimi", 1000000000 }, + { @"bilionesimi", 1000000000000 }, + { @"trilionesimi", 1000000000000000000 }, { @"millesime", 1000 }, { @"milionesime", 1000000 }, - { @"miliardersime", 1000000000 }, - { @"trilioneisime", 1000000000000 }, + { @"miliardesime", 1000000000 }, + { @"bilionesime", 1000000000000 }, + { @"trilionesime", 1000000000000000000 }, { @"centinaia", 100 }, { @"migliaia", 1000 }, { @"milionata", 1000000 }, { @"miliardata", 1000000000 }, - { @"trilionata", 1000000000000 }, + { @"bilionata", 1000000000000 }, + { @"trilionata", 1000000000000000000 }, { @"dozzina", 12 }, { @"dozzine", 12 }, { @"k", 1000 }, @@ -372,11 +368,81 @@ public static class NumbersDefinitions }; public static readonly Dictionary RelativeReferenceOffsetMap = new Dictionary { - { @"", @"" } + { @"ultimo", @"0" }, + { @"ultima", @"0" }, + { @"ultimi", @"0" }, + { @"ultime", @"0" }, + { @"successivo", @"1" }, + { @"successiva", @"1" }, + { @"successivi", @"1" }, + { @"successive", @"1" }, + { @"prossimo", @"1" }, + { @"prossima", @"1" }, + { @"prossimi", @"1" }, + { @"prossime", @"1" }, + { @"seguente", @"1" }, + { @"seguenti", @"1" }, + { @"precedente", @"-1" }, + { @"precedenti", @"-1" }, + { @"penultimo", @"-1" }, + { @"penultima", @"-1" }, + { @"penultimi", @"-1" }, + { @"penultime", @"-1" }, + { @"terz'ultimo", @"-2" }, + { @"terz'ultima", @"-2" }, + { @"terz'ultimi", @"-2" }, + { @"terz'ultime", @"-2" }, + { @"terzultimo", @"-2" }, + { @"terzultima", @"-2" }, + { @"terzultimi", @"-2" }, + { @"terzultime", @"-2" }, + { @"quart'ultimo", @"-3" }, + { @"quart'ultima", @"-3" }, + { @"quart'ultimi", @"-3" }, + { @"quart'ultime", @"-3" }, + { @"quartultimo", @"-3" }, + { @"quartultima", @"-3" }, + { @"quartultimi", @"-3" }, + { @"quartultime", @"-3" } }; public static readonly Dictionary RelativeReferenceRelativeToMap = new Dictionary { - { @"", @"" } + { @"ultimo", @"end" }, + { @"ultima", @"end" }, + { @"ultimi", @"end" }, + { @"ultime", @"end" }, + { @"successivo", @"current" }, + { @"successiva", @"current" }, + { @"successivi", @"current" }, + { @"successive", @"current" }, + { @"prossimo", @"current" }, + { @"prossima", @"current" }, + { @"prossimi", @"current" }, + { @"prossime", @"current" }, + { @"seguente", @"current" }, + { @"seguenti", @"current" }, + { @"precedente", @"current" }, + { @"precedenti", @"current" }, + { @"penultimo", @"end" }, + { @"penultima", @"end" }, + { @"penultimi", @"end" }, + { @"penultime", @"end" }, + { @"terz'ultimo", @"end" }, + { @"terz'ultima", @"end" }, + { @"terz'ultimi", @"end" }, + { @"terz'ultime", @"end" }, + { @"terzultimo", @"end" }, + { @"terzultima", @"end" }, + { @"terzultimi", @"end" }, + { @"terzultime", @"end" }, + { @"quart'ultimo", @"end" }, + { @"quart'ultima", @"end" }, + { @"quart'ultimi", @"end" }, + { @"quart'ultime", @"end" }, + { @"quartultimo", @"end" }, + { @"quartultima", @"end" }, + { @"quartultimi", @"end" }, + { @"quartultime", @"end" } }; } } \ No newline at end of file diff --git a/.NET/Microsoft.Recognizers.Text.Number/Italian/Parsers/ItalianNumberParserConfiguration.cs b/.NET/Microsoft.Recognizers.Text.Number/Italian/Parsers/ItalianNumberParserConfiguration.cs index e673387deb..1194b4da57 100644 --- a/.NET/Microsoft.Recognizers.Text.Number/Italian/Parsers/ItalianNumberParserConfiguration.cs +++ b/.NET/Microsoft.Recognizers.Text.Number/Italian/Parsers/ItalianNumberParserConfiguration.cs @@ -1,6 +1,7 @@ using System.Collections.Generic; using System.Collections.Immutable; using System.Globalization; +using System.Linq; using System.Text; using System.Text.RegularExpressions; @@ -10,6 +11,12 @@ namespace Microsoft.Recognizers.Text.Number.Italian { public class ItalianNumberParserConfiguration : BaseNumberParserConfiguration { + public ItalianNumberParserConfiguration(NumberOptions options) + : this() + { + this.Options = options; + } + public ItalianNumberParserConfiguration() : this(new CultureInfo(Culture.Italian)) { @@ -34,20 +41,56 @@ public ItalianNumberParserConfiguration(CultureInfo ci) this.WrittenFractionSeparatorTexts = NumbersDefinitions.WrittenFractionSeparatorTexts; this.CardinalNumberMap = NumbersDefinitions.CardinalNumberMap.ToImmutableDictionary(); - this.OrdinalNumberMap = NumberMapGenerator.InitOrdinalNumberMap(NumbersDefinitions.OrdinalNumberMap, NumbersDefinitions.PrefixCardinalMap, NumbersDefinitions.SuffixOrdinalMap); - RelativeReferenceOffsetMap = NumbersDefinitions.RelativeReferenceOffsetMap.ToImmutableDictionary(); - RelativeReferenceRelativeToMap = NumbersDefinitions.RelativeReferenceRelativeToMap.ToImmutableDictionary(); + this.OrdinalNumberMap = NumbersDefinitions.OrdinalNumberMap.ToImmutableDictionary(); + this.RelativeReferenceOffsetMap = NumbersDefinitions.RelativeReferenceOffsetMap.ToImmutableDictionary(); + this.RelativeReferenceRelativeToMap = NumbersDefinitions.RelativeReferenceRelativeToMap.ToImmutableDictionary(); this.RoundNumberMap = NumbersDefinitions.RoundNumberMap.ToImmutableDictionary(); this.HalfADozenRegex = new Regex(NumbersDefinitions.HalfADozenRegex, RegexOptions.Singleline); this.DigitalNumberRegex = new Regex(NumbersDefinitions.DigitalNumberRegex, RegexOptions.Singleline); this.NegativeNumberSignRegex = new Regex(NumbersDefinitions.NegativeNumberSignRegex, RegexOptions.Singleline); + this.FractionPrepositionRegex = new Regex(NumbersDefinitions.FractionPrepositionRegex, RegexOptions.Singleline); + this.OneToNineOrdinalRegex = new Regex(NumbersDefinitions.OneToNineOrdinalRegex, RegexOptions.Singleline); } public string NonDecimalSeparatorText { get; private set; } + public Regex OneToNineOrdinalRegex { get; } + public override IEnumerable NormalizeTokenSet(IEnumerable tokens, ParseResult context) { - return tokens; + var fracWords = new List(); + var tokenList = tokens.ToList(); + var tokenLen = tokenList.Count; + + for (var i = 0; i < tokenLen; i++) + { + if ((i < tokenLen - 2) && tokenList[i + 1] == "-") + { + fracWords.Add(tokenList[i] + tokenList[i + 1] + tokenList[i + 2]); + i += 2; + } + else + { + fracWords.Add(tokenList[i]); + } + } + + /*The following piece of code is needed in Italian to correctly compute some fraction patterns + * e.g. 'due milioni duemiladuecento quinti' (=2002200/5) which is otherwise interpreted as + * 2000000/2205 (in Italian, isolated ordinals <10 have a different form respect to when + * they are concatenated to other numbers, so the following lines try to keep them isolated + * by concatenating the two previous numbers) */ + var fracLen = fracWords.Count; + if (fracLen > 2 && this.OneToNineOrdinalRegex.Match(fracWords[fracLen - 1]).Success) + { + if (fracWords[fracLen - 3] != "e" && fracWords[fracLen - 2] != "e") + { + fracWords[fracLen - 3] += fracWords[fracLen - 2]; + fracWords.RemoveAt(fracLen - 2); + } + } + + return fracWords; } public override long ResolveCompositeNumber(string numberStr) @@ -63,6 +106,8 @@ public override long ResolveCompositeNumber(string numberStr) } long value = 0; + long prevValue = 0; + long finalValue = 0; var strBuilder = new StringBuilder(); int lastGoodChar = 0; @@ -78,7 +123,22 @@ public override long ResolveCompositeNumber(string numberStr) if ((i + 1) == numberStr.Length) { + if (prevValue > 0 && value > prevValue) + { + value = (prevValue * value) - prevValue; + } + + if (prevValue < 1000) + { + prevValue = value + prevValue; + } + else + { + prevValue = value; + } + finalValue += value; + strBuilder.Clear(); i = lastGoodChar++; value = 0; diff --git a/.NET/Microsoft.Recognizers.Text.Number/NumberRecognizer.cs b/.NET/Microsoft.Recognizers.Text.Number/NumberRecognizer.cs index a9015ad287..fc44959ef3 100644 --- a/.NET/Microsoft.Recognizers.Text.Number/NumberRecognizer.cs +++ b/.NET/Microsoft.Recognizers.Text.Number/NumberRecognizer.cs @@ -5,6 +5,7 @@ using Microsoft.Recognizers.Text.Number.English; using Microsoft.Recognizers.Text.Number.French; using Microsoft.Recognizers.Text.Number.German; +using Microsoft.Recognizers.Text.Number.Italian; using Microsoft.Recognizers.Text.Number.Japanese; using Microsoft.Recognizers.Text.Number.Korean; using Microsoft.Recognizers.Text.Number.Portuguese; @@ -204,7 +205,6 @@ protected override void InitializeConfiguration() AgnosticNumberParserFactory.GetParser(AgnosticNumberParserType.Percentage, new GermanNumberParserConfiguration()), new German.PercentageExtractor())); - /* RegisterModel( Culture.Italian, (options) => new NumberModel( @@ -222,7 +222,6 @@ protected override void InitializeConfiguration() (options) => new PercentModel( AgnosticNumberParserFactory.GetParser(AgnosticNumberParserType.Percentage, new ItalianNumberParserConfiguration()), new Italian.PercentageExtractor())); - */ RegisterModel( Culture.Japanese, diff --git a/Patterns/Italian/Italian-Numbers.yaml b/Patterns/Italian/Italian-Numbers.yaml index 9fde158d4a..c6ccf7cc03 100644 --- a/Patterns/Italian/Italian-Numbers.yaml +++ b/Patterns/Italian/Italian-Numbers.yaml @@ -5,41 +5,35 @@ CompoundNumberLanguage: !bool true #Does this culture uses period and comma intercheangeably as decimal separator? MultiDecimalSeparatorCulture: !bool false #Integer Regex -RoundNumberIntegerRegex: !simpleRegex - def: (cento|mille|milione|milioni|miliardo|miliardi|trilione|trilioni) -ZeroToNineIntegerRegex: !simpleRegex - def: (e uno|un|uno|una|due|tre|quattro|cinque|sei|sette|otto|nove|zero) -TenToNineteenIntegerRegex: !simpleRegex - def: (diciassette|tredici|quattordici|diciotto|diciannove|quindici|sedici|undici|dodici|dieci) -TensNumberIntegerRegex: !simpleRegex - def: (settanta|venti|trenta|ottanta|novanta|quaranta|cinquanta|sessanta) DigitsNumberRegex: !simpleRegex def: \d|\d{1,3}(\.\d{3}) -HundredsNumberIntegerRegex: !nestedRegex - # TODO: Work on this one - def: (({ZeroToNineIntegerRegex}(\s+cento))|cento|((\s+cento\s)+{TensNumberIntegerRegex})) - references: [ ZeroToNineIntegerRegex, TensNumberIntegerRegex ] -BelowHundredsRegex: !nestedRegex - def: (({TenToNineteenIntegerRegex}|({TensNumberIntegerRegex}(\W+{ZeroToNineIntegerRegex})?))|{ZeroToNineIntegerRegex}) - references: [ TenToNineteenIntegerRegex, TensNumberIntegerRegex, ZeroToNineIntegerRegex ] -BelowThousandsRegex: !nestedRegex - def: (({HundredsNumberIntegerRegex}(\s+{BelowHundredsRegex})?|{BelowHundredsRegex}|{TenToNineteenIntegerRegex})|cento\s+{TenToNineteenIntegerRegex}) - references: [ HundredsNumberIntegerRegex, BelowHundredsRegex, TenToNineteenIntegerRegex ] -SupportThousandsRegex: !nestedRegex - def: (({BelowThousandsRegex}|{BelowHundredsRegex})\s+{RoundNumberIntegerRegex}(\s+{RoundNumberIntegerRegex})?) - references: [ BelowThousandsRegex, BelowHundredsRegex, RoundNumberIntegerRegex ] +RoundNumberIntegerRegex: !simpleRegex + def: (cento?|mille?|mila|milion[ei]?|miliard[oi]?|bilion[ei]?|trilion[ei]?) +ZeroToNineIntegerRegex: !simpleRegex + def: (un[oa]?|due?|tre?|quattro?|cinque?|sei|sette?|otto?|nove?|zero) +TwoToNineIntegerRegex: !simpleRegex + def: (due?|tre?|quattro?|cinque?|sei|sette?|otto?|nove?) NegativeNumberTermsRegex: !simpleRegex - # TODO: modify below regex according to the counterpart in English - def: ^[.] + def: (meno\s+) NegativeNumberSignRegex: !nestedRegex - def: ^({NegativeNumberTermsRegex}\s+).* + def: ^{NegativeNumberTermsRegex}.* references: [ NegativeNumberTermsRegex ] +AnIntRegex: !simpleRegex + def: (un)(?=\s) +TenToNineteenIntegerRegex: !simpleRegex + def: (diciassette?|tredici?|quattordici?|diciotto?|diciannove?|quindici?|sedici?|undici?|dodici?|dieci?) +TensNumberIntegerRegex: !simpleRegex + def: (settanta?|venti?|trenta?|ottanta?|novanta?|quaranta?|cinquanta?|sessanta?) SeparaIntRegex: !nestedRegex - def: ({SupportThousandsRegex}(\s+{SupportThousandsRegex})*(\s+{BelowThousandsRegex})?|{BelowThousandsRegex}) - references: [ SupportThousandsRegex, BelowThousandsRegex ] + def: ((({TenToNineteenIntegerRegex}|({TensNumberIntegerRegex}{ZeroToNineIntegerRegex})|{TensNumberIntegerRegex}|{ZeroToNineIntegerRegex})(\s*{RoundNumberIntegerRegex})*))|((({AnIntRegex})?(\s*{RoundNumberIntegerRegex})+)) + references: [ TenToNineteenIntegerRegex, TensNumberIntegerRegex, ZeroToNineIntegerRegex, RoundNumberIntegerRegex, AnIntRegex ] AllIntRegex: !nestedRegex - def: ({SeparaIntRegex}|mille(\s+{BelowThousandsRegex})?) - references: [ SeparaIntRegex, BelowThousandsRegex ] + def: (((({TenToNineteenIntegerRegex}|({TensNumberIntegerRegex}{ZeroToNineIntegerRegex})|{TensNumberIntegerRegex}|{ZeroToNineIntegerRegex}|({AnIntRegex})?)(\s*{RoundNumberIntegerRegex})+)\s*(e\s+)?)*{SeparaIntRegex}) + references: [ TenToNineteenIntegerRegex, TensNumberIntegerRegex, ZeroToNineIntegerRegex, AnIntRegex, RoundNumberIntegerRegex, SeparaIntRegex ] +PlaceHolderPureNumber: !simpleRegex + def: \b +PlaceHolderDefault: !simpleRegex + def: \D|\b NumbersWithPlaceHolder: !paramsRegex def: (((?<=\W|^)-\s*)|(?<=\b))\d+(?!(,\d+[a-zA-Z]))(?={placeholder}) params: [ placeholder ] @@ -50,76 +44,64 @@ RoundNumberIntegerRegexWithLocks: !nestedRegex def: (?<=\b)({DigitsNumberRegex})+\s+{RoundNumberIntegerRegex}(?=\b) references: [ DigitsNumberRegex, RoundNumberIntegerRegex ] NumbersWithDozenSuffix: !simpleRegex - def: (((?({AllIntRegex})|((?({AllIntRegex})|(\d+)(?!\.))(?=\b) + def: (?<=\b)(?({AllIntRegex})|((?({AllIntRegex})|(\d+)(?!\.))(?=\b) references: [ AllIntRegex ] #Double Regex AllPointRegex: !nestedRegex def: ((\s+{ZeroToNineIntegerRegex})+|(\s+{SeparaIntRegex})) references: [ ZeroToNineIntegerRegex, SeparaIntRegex ] AllFloatRegex: !nestedRegex - def: ({AllIntRegex}(\s+(virgule|point)){AllPointRegex}) + def: '({AllIntRegex}(\s+(virgola|punto)){AllPointRegex})' references: [ AllIntRegex, AllPointRegex ] DoubleDecimalPointRegex: !paramsRegex def: (((?e) DoubleExponentialNotationRegex: !simpleRegex def: (((?