Minor cleanup in QueryProcessor in C#, method to check Java version, …

…and code format in Python resource code writer. (#967)
microsoft · Nov 12, 2018 · e83d8e8 · e83d8e8
1 parent ba48160
commit e83d8e8
Show file tree

Hide file tree

Showing 3 changed files with 102 additions and 19 deletions.
diff --git a/.NET/Microsoft.Recognizers.Text/Utilities/QueryProcessor.cs b/.NET/Microsoft.Recognizers.Text/Utilities/QueryProcessor.cs
@@ -45,30 +45,32 @@ public static string Preprocess(string query, bool caseSensitive = false, bool r
             return query;
         }
 
-        static readonly string Tokens = @"(kB|K[Bb]|K|M[Bb]|M|G[Bb]|G|B)";
-        //static readonly string Expression = @"(?<=(\s|\b\d+))" + Tokens + @"\b";
-        static readonly string Expression = @"(?<=(\s|\d))" + Tokens + @"\b";
-        static readonly Regex SpecialTokensRegex = new Regex(Expression, RegexOptions.Compiled);
+        private static readonly string Expression = @"(?<=(\s|\d))(kB|K[Bb]|K|M[Bb]|M|G[Bb]|G|B)\b";
+        private static readonly Regex SpecialTokensRegex = new Regex(Expression, RegexOptions.Compiled);
 
-        private static void ApplyReverse(int idx, char[] str, string value)
+        private static void ReApplyValue(int idx, ref StringBuilder outString, string value)
         {
             for (int i = 0; i < value.Length; ++i)
             {
-                str[idx + i] = value[i];
+                outString[idx + i] = value[i];
             }
         }
 
         public static string ToLowerTermSensitive(string input)
         {
-            var result = input.ToLowerInvariant().ToCharArray();
+            var lowercase = input.ToLowerInvariant();
+            var buffer = new StringBuilder(lowercase);
+
+            var replaced = false;
 
             var matches = SpecialTokensRegex.Matches(input);
             foreach (Match m in matches)
             {
-                ApplyReverse(m.Index, result, m.Value);
+                ReApplyValue(m.Index, ref buffer, m.Value);
+                replaced = true;
             }
 
-            return new string(result);
+            return replaced ? buffer.ToString() : lowercase;
         }
 
         public static string RemoveDiacritics(string query)

diff --git a/...ecognizers-text/src/main/java/com/microsoft/recognizers/text/utilities/RegExpUtility.java b/...ecognizers-text/src/main/java/com/microsoft/recognizers/text/utilities/RegExpUtility.java
@@ -2,6 +2,7 @@
 
 import org.javatuples.Pair;
 
+import java.math.BigDecimal;
 import java.util.*;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -18,6 +19,15 @@ public abstract class RegExpUtility {
     private static final String groupNameIndexSep = "iii";
     private static final String groupNameIndexSepRegex = Pattern.quote(groupNameIndexSep);
 
+    private static final boolean unboundedLookBehindNotSupported = isRestrictedJavaVersion();
+
+    private static final Pattern lookBehindCheckRegex = Pattern.compile("(\\?<[!=][^)]+)([+*])");
+    private static final Map<String, String> bindings = new HashMap<String, String>(){{
+
+        put("+", "{1,10}");
+        put("*", "{0,10}");
+    }};
+
     public static Pattern getSafeRegExp(String source) {
         return getSafeRegExp(source, 0);
     }
@@ -45,7 +55,7 @@ public static Map<String, String> getNamedGroups(Matcher groupedMatcher, boolean
                 groupName = groupName.replace("ii", "_");
             }
 
-            //If matchedGroups previously contained a mapping for groupName, the old value is replaced.
+            // If matchedGroups previously contained a mapping for groupName, the old value is replaced.
             if (groupValue != null) {
                 matchedGroups.put(groupName, groupValue);
             }
@@ -131,8 +141,11 @@ public static Match[] getMatches(Pattern regex, String source) {
     }
 
     private static String sanitizeGroups(String source) {
+
+        String result = source;
+
         AtomicInteger index = new AtomicInteger(0);
-        String result = replace(source, matchGroup, (Matcher m) -> m.group(0).replace(m.group(1), m.group(1).replace("_", "ii") + groupNameIndexSep + index.getAndIncrement()));
+        result = replace(result, matchGroup, (Matcher m) -> m.group(0).replace(m.group(1), m.group(1).replace("_", "ii") + groupNameIndexSep + index.getAndIncrement()));
 
         index.set(0);
         result = replace(result, matchPositiveLookbehind, (Matcher m) -> String.format("(?<plb%s%s>", groupNameIndexSep, index.getAndIncrement()));
@@ -143,6 +156,30 @@ private static String sanitizeGroups(String source) {
         return result;
     }
 
+    /**
+     * Compiles {@code source} into a {@link Pattern} using the given flags.
+     *
+     * <p>The look-behind rewrite for restricted Java versions is currently disabled (the call
+     * inside the version check is commented out), so {@code source} is compiled unchanged.
+     *
+     * @param source the regular expression source
+     * @param flags match flags passed through to {@code Pattern.compile(String, int)}
+     * @return the compiled pattern
+     */
+    public static Pattern getSafeLookbehindRegExp(String source, int flags) {
+
+        String result = source;
+
+        // Java 8 (1.8) and earlier don't support unbounded look-behind lengths.
+        if (unboundedLookBehindNotSupported) {
+             //result = bindLookbehinds(result);
+        }
+
+        return Pattern.compile(result, flags);
+    }
+
+    /**
+     * Replaces the unbounded quantifier captured by {@code lookBehindCheckRegex} in each
+     * look-behind of {@code regex} with its bounded form from {@code bindings}
+     * ({@code +} becomes {@code {1,10}}, {@code *} becomes {@code {0,10}}).
+     *
+     * @param regex the regular expression source to rewrite
+     * @return the rewritten source; equal to {@code regex} when nothing matched
+     */
+    private static String bindLookbehinds(String regex) {
+
+        Matcher matcher = lookBehindCheckRegex.matcher(regex);
+        StringBuffer result = new StringBuffer();
+
+        // Build every replacement from its own match (not from the first one), and quote it
+        // so '\' and '$' in the matched regex text are not read as replacement syntax.
+        while (matcher.find()) {
+            String bounded = matcher.group(1) + bindings.get(matcher.group(2));
+            matcher.appendReplacement(result, Matcher.quoteReplacement(bounded));
+        }
+        matcher.appendTail(result);
+
+        return result.toString();
+    }
+
     private static Match[] getMatchesSimple(Pattern regex, String source) {
         List<Match> matches = new ArrayList<>();
 
@@ -263,4 +300,33 @@ public static String replace(String input, Pattern regex, StringReplacerCallback
 
         return resultString.toString();
     }
+
+    /**
+     * Checks if the running Java version is <= 8 (1.8), as those versions don't support
+     * look-behind groups with no maximum length.
+     *
+     * <p>The {@code java.specification.version} system property is tried first; only when it
+     * is missing or not a plain decimal number is {@code java.runtime.version} tried. If
+     * neither can be parsed the version is treated as not restricted.
+     *
+     * @return {@code true} when the detected version compares less than or equal to 1.8
+     */
+    private static boolean isRestrictedJavaVersion() {
+
+        BigDecimal targetVersion = new BigDecimal("1.8");
+        String[] versionProperties = {"java.specification.version", "java.runtime.version"};
+        boolean result = false;
+
+        for (String property : versionProperties) {
+            try {
+                // A missing property (null) or a value with build suffixes makes the
+                // BigDecimal constructor throw; fall through to the next property then.
+                result = new BigDecimal(System.getProperty(property)).compareTo(targetVersion) <= 0;
+                break;
+            } catch (RuntimeException ignored) {
+                // Best effort only: property unavailable or not a plain decimal number.
+            }
+        }
+
+        if (result) {
+            System.out.println("WARN: Look-behind groups with no maximum length not supported. Java version <= 8.");
+        }
+
+        return result;
+    }
 }
diff --git a/Python/libraries/resource-generator/lib/code_writer.py b/Python/libraries/resource-generator/lib/code_writer.py
@@ -1,6 +1,7 @@
 import abc, json, re
 from .yaml_parser import SimpleRegex, NestedRegex, ParamsRegex, Dictionary, List
 
+
 class CodeWriter:
     def __init__(self, name):
         self.name = name
@@ -9,42 +10,50 @@ def __init__(self, name):
     def write(self):
         pass
 
+
 class DefaultWriter(CodeWriter):
     def __init__(self, name, definition):
         CodeWriter.__init__(self, name)
         self.definition = sanitize(definition)
+
     def write(self):
         return f'{self.name} = \'{self.definition}\''
 
+
 class SimpleRegexWriter(CodeWriter):
     def __init__(self, name, definition):
         CodeWriter.__init__(self, name)
         self.definition = sanitize(definition)
+
     def write(self):
         return f'{self.name} = f\'{self.definition}\''
 
+
 class NestedRegexWriter(SimpleRegexWriter):
     def __init__(self, name, definition, references):
         CodeWriter.__init__(self, name)
         self.definition = sanitize(definition, None, references)
 
+
 class ParamsRegexWriter(SimpleRegexWriter):
     def __init__(self, name, definition, params):
         CodeWriter.__init__(self, name)
         self.definition = sanitize(definition, None, params)
         self.params = ', '.join(params)
+
     def write(self):
         return f'{self.name} = lambda {self.params}: f\'{self.definition}\''
 
+
 class DictionaryWriter(CodeWriter):
     def __init__(self, name, key_type, value_type, entries):
         CodeWriter.__init__(self, name)
         self.entries = []
         key_type = to_python_type(key_type)
         value_type = to_python_type(value_type)
 
-        key_quote = '\'' if key_type=='string' else ''
-        value_quote = '\'' if value_type=='string' else ''
+        key_quote = '\'' if key_type == 'string' else ''
+        value_quote = '\'' if value_type == 'string' else ''
         for key, value in entries.items():
             k = key.replace(r"\'", '\'').replace('\'', r"\'")
             if isinstance(value, list):
@@ -55,26 +64,30 @@ def __init__(self, name, key_type, value_type, entries):
             self.entries.append(f'({key_quote}{k}{key_quote}, {value_quote}{v}{value_quote})')
 
     def write(self):
-        spaces = ' ' * (len(f'{self.name} = dict([')+4)
+        spaces = ' ' * (len(f'{self.name} = dict([') + 4)
         joined_entries = f',\n{spaces}'.join(self.entries)
         return f'{self.name} = dict([{joined_entries}])'
 
+
 class ArrayWriter(CodeWriter):
     def __init__(self, name, value_type, entries):
         CodeWriter.__init__(self, name)
         self.entries = []
         value_type = to_python_type(value_type)
 
-        value_quote = '\'' if value_type=='string' else ''
+        value_quote = '\'' if value_type == 'string' else ''
+
         for value in entries:
+            value = value.replace('\'', '\\\'')
             self.entries.append(f'{value_quote}{value}{value_quote}')
-    
+
     def write(self):
         joined_entries = ', '.join(self.entries)
         return f'{self.name} = [{joined_entries}]'
 
-def sanitize(value: str, value_type = None, tokens = None):
-    value = value.replace('{','{{').replace('}','}}')
+
+def sanitize(value: str, value_type=None, tokens=None):
+    value = value.replace('{', '{{').replace('}', '}}')
     if tokens:
         for token in tokens:
             value = value.replace(f'{{{token}}}', token)
@@ -84,7 +97,8 @@ def sanitize(value: str, value_type = None, tokens = None):
     except:
         stringified = '"' + value + '"'
 
-    return stringified[1:len(stringified)-1].replace("'", r"\'")
+    return stringified[1:len(stringified) - 1].replace("'", r"\'")
+
 
 def to_python_type(type_: str) -> str:
     if type_ == 'long':
@@ -94,6 +108,7 @@ def to_python_type(type_: str) -> str:
     else:
         return type_
 
+
 def generate_code(root):
     lines = []
     for token_name in root: