Skip to content

Commit

Permalink
Minor cleanup in QueryProcessor in C#, method to check Java version, …
Browse files Browse the repository at this point in the history
…and code format in Python resource code writer. (#967)
  • Loading branch information
tellarin authored and wgx998877 committed Nov 12, 2018
1 parent ba48160 commit e83d8e8
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 19 deletions.
20 changes: 11 additions & 9 deletions .NET/Microsoft.Recognizers.Text/Utilities/QueryProcessor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -45,30 +45,32 @@ public static string Preprocess(string query, bool caseSensitive = false, bool r
return query;
}

static readonly string Tokens = @"(kB|K[Bb]|K|M[Bb]|M|G[Bb]|G|B)";
//static readonly string Expression = @"(?<=(\s|\b\d+))" + Tokens + @"\b";
static readonly string Expression = @"(?<=(\s|\d))" + Tokens + @"\b";
static readonly Regex SpecialTokensRegex = new Regex(Expression, RegexOptions.Compiled);
private static readonly string Expression = @"(?<=(\s|\d))(kB|K[Bb]|K|M[Bb]|M|G[Bb]|G|B)\b";
private static readonly Regex SpecialTokensRegex = new Regex(Expression, RegexOptions.Compiled);

private static void ApplyReverse(int idx, char[] str, string value)
private static void ReApplyValue(int idx, ref StringBuilder outString, string value)
{
for (int i = 0; i < value.Length; ++i)
{
str[idx + i] = value[i];
outString[idx + i] = value[i];
}
}

public static string ToLowerTermSensitive(string input)
{
var result = input.ToLowerInvariant().ToCharArray();
var lowercase = input.ToLowerInvariant();
var buffer = new StringBuilder(lowercase);

var replaced = false;

var matches = SpecialTokensRegex.Matches(input);
foreach (Match m in matches)
{
ApplyReverse(m.Index, result, m.Value);
ReApplyValue(m.Index, ref buffer, m.Value);
replaced = true;
}

return new string(result);
return replaced ? buffer.ToString() : lowercase;
}

public static string RemoveDiacritics(string query)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import org.javatuples.Pair;

import java.math.BigDecimal;
import java.util.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
Expand All @@ -18,6 +19,15 @@ public abstract class RegExpUtility {
private static final String groupNameIndexSep = "iii";
private static final String groupNameIndexSepRegex = Pattern.quote(groupNameIndexSep);

private static final boolean unboundedLookBehindNotSupported = isRestrictedJavaVersion();

private static final Pattern lookBehindCheckRegex = Pattern.compile("(\\?<[!=][^)]+)([+*])");
private static final Map<String, String> bindings = new HashMap<String, String>(){{

put("+", "{1,10}");
put("*", "{0,10}");
}};

public static Pattern getSafeRegExp(String source) {
return getSafeRegExp(source, 0);
}
Expand Down Expand Up @@ -45,7 +55,7 @@ public static Map<String, String> getNamedGroups(Matcher groupedMatcher, boolean
groupName = groupName.replace("ii", "_");
}

//If matchedGroups previously contained a mapping for groupName, the old value is replaced.
// If matchedGroups previously contained a mapping for groupName, the old value is replaced.
if (groupValue != null) {
matchedGroups.put(groupName, groupValue);
}
Expand Down Expand Up @@ -131,8 +141,11 @@ public static Match[] getMatches(Pattern regex, String source) {
}

private static String sanitizeGroups(String source) {

String result = source;

AtomicInteger index = new AtomicInteger(0);
String result = replace(source, matchGroup, (Matcher m) -> m.group(0).replace(m.group(1), m.group(1).replace("_", "ii") + groupNameIndexSep + index.getAndIncrement()));
result = replace(result, matchGroup, (Matcher m) -> m.group(0).replace(m.group(1), m.group(1).replace("_", "ii") + groupNameIndexSep + index.getAndIncrement()));

index.set(0);
result = replace(result, matchPositiveLookbehind, (Matcher m) -> String.format("(?<plb%s%s>", groupNameIndexSep, index.getAndIncrement()));
Expand All @@ -143,6 +156,30 @@ private static String sanitizeGroups(String source) {
return result;
}

// Compiles the given regex source with the given flags.
// On runtimes flagged by unboundedLookBehindNotSupported the look-behinds are
// meant to be bounded first, but that rewrite is currently commented out, so
// the source is compiled unchanged in both cases.
public static Pattern getSafeLookbehindRegExp(String source, int flags) {

    String pattern = source;

    // Java pre 1.9 doesn't support unbounded lookbehind lengths
    if (unboundedLookBehindNotSupported) {
        //pattern = bindLookbehinds(pattern);
    }

    return Pattern.compile(pattern, flags);
}

// Rewrites every look-behind in regex that ends in an unbounded quantifier
// ('+' or '*') so that it uses the bounded quantifier from bindings instead.
// Returns the input text unchanged when nothing matches.
private static String bindLookbehinds(String regex) {

    Matcher matcher = lookBehindCheckRegex.matcher(regex);

    // StringBuffer (not StringBuilder) is what Matcher.appendReplacement accepts on Java 8.
    StringBuffer result = new StringBuffer();

    while (matcher.find()) {
        // Build the replacement from the current match; a single replaceAll with a
        // precomputed string would stamp the first look-behind over all the others.
        String bounded = matcher.group(1) + bindings.get(matcher.group(2));

        // The captured text is regex source and routinely contains '\' (and may
        // contain '$'); quote it so it is inserted literally rather than being
        // interpreted as replacement-string escapes or group references.
        matcher.appendReplacement(result, Matcher.quoteReplacement(bounded));
    }
    matcher.appendTail(result);

    return result.toString();
}

private static Match[] getMatchesSimple(Pattern regex, String source) {
List<Match> matches = new ArrayList<>();

Expand Down Expand Up @@ -263,4 +300,33 @@ public static String replace(String input, Pattern regex, StringReplacerCallback

return resultString.toString();
}

// Checks if Java version is <= 8, as those versions don't support look-behind groups with no maximum length.
// Returns false when the version cannot be determined.
private static boolean isRestrictedJavaVersion() {

    BigDecimal targetVersion = new BigDecimal("1.8");
    boolean result = false;
    boolean resolved = false;

    try {
        // The specification version is "1.8" on Java 8 and "9", "10", ... from Java 9 on,
        // so "restricted" means the value is at most 1.8.
        String specVersion = System.getProperty("java.specification.version");
        result = new BigDecimal(specVersion).compareTo(targetVersion) <= 0;
        resolved = true;
    } catch (NumberFormatException | NullPointerException | SecurityException e) {
        // Property missing, unreadable or not numeric: try the runtime version below.
    }

    if (!resolved) {
        // Only a fallback, so it can never overwrite an answer already obtained above.
        try {
            // Could also be "java.version", but runtime has more info.
            String runtimeVersion = System.getProperty("java.runtime.version");

            // Runtime versions carry update/build suffixes (e.g. "1.8.0_181-b13"),
            // so compare only the leading "major" or "major.minor" part.
            Matcher leadingVersion = Pattern.compile("^\\d+(\\.\\d+)?").matcher(runtimeVersion);
            if (leadingVersion.find()) {
                result = new BigDecimal(leadingVersion.group()).compareTo(targetVersion) <= 0;
            }
        } catch (NumberFormatException | NullPointerException | SecurityException e) {
            // Version could not be determined; keep the default (not restricted).
        }
    }

    if (result) {
        System.out.println("WARN: Look-behind groups with no maximum length not supported. Java version <= 8.");
    }

    return result;
}
}
31 changes: 23 additions & 8 deletions Python/libraries/resource-generator/lib/code_writer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import abc, json, re
from .yaml_parser import SimpleRegex, NestedRegex, ParamsRegex, Dictionary, List


class CodeWriter:
def __init__(self, name):
self.name = name
Expand All @@ -9,42 +10,50 @@ def __init__(self, name):
def write(self):
pass


class DefaultWriter(CodeWriter):
    """Writes a definition as a plain single-quoted string assignment."""

    def __init__(self, name, definition):
        CodeWriter.__init__(self, name)
        # The definition is passed through sanitize() before it is stored.
        self.definition = sanitize(definition)

    def write(self):
        """Return the line ``<name> = '<definition>'``."""
        return "{} = '{}'".format(self.name, self.definition)


class SimpleRegexWriter(CodeWriter):
    """Writes a definition as an f-string assignment."""

    def __init__(self, name, definition):
        CodeWriter.__init__(self, name)
        # The definition is passed through sanitize() before it is stored.
        self.definition = sanitize(definition)

    def write(self):
        """Return the line ``<name> = f'<definition>'``."""
        return "{} = f'{}'".format(self.name, self.definition)


class NestedRegexWriter(SimpleRegexWriter):
    """SimpleRegexWriter variant whose definition is sanitized with ``references`` as tokens.

    ``write`` is inherited from SimpleRegexWriter.
    """

    def __init__(self, name, definition, references):
        # Initializes through CodeWriter directly, so SimpleRegexWriter.__init__
        # (which sanitizes without tokens) is not run.
        CodeWriter.__init__(self, name)
        sanitized = sanitize(definition, None, references)
        self.definition = sanitized


class ParamsRegexWriter(SimpleRegexWriter):
    """Writes a definition as a lambda that returns an f-string."""

    def __init__(self, name, definition, params):
        # Initializes through CodeWriter directly, so SimpleRegexWriter.__init__
        # (which sanitizes without tokens) is not run.
        CodeWriter.__init__(self, name)
        sanitized = sanitize(definition, None, params)
        self.definition = sanitized
        # Comma-separated parameter list used in the lambda signature.
        self.params = ', '.join(params)

    def write(self):
        """Return the line ``<name> = lambda <params>: f'<definition>'``."""
        return "{} = lambda {}: f'{}'".format(self.name, self.params, self.definition)


class DictionaryWriter(CodeWriter):
def __init__(self, name, key_type, value_type, entries):
CodeWriter.__init__(self, name)
self.entries = []
key_type = to_python_type(key_type)
value_type = to_python_type(value_type)

key_quote = '\'' if key_type=='string' else ''
value_quote = '\'' if value_type=='string' else ''
key_quote = '\'' if key_type == 'string' else ''
value_quote = '\'' if value_type == 'string' else ''
for key, value in entries.items():
k = key.replace(r"\'", '\'').replace('\'', r"\'")
if isinstance(value, list):
Expand All @@ -55,26 +64,30 @@ def __init__(self, name, key_type, value_type, entries):
self.entries.append(f'({key_quote}{k}{key_quote}, {value_quote}{v}{value_quote})')

def write(self):
spaces = ' ' * (len(f'{self.name} = dict([')+4)
spaces = ' ' * (len(f'{self.name} = dict([') + 4)
joined_entries = f',\n{spaces}'.join(self.entries)
return f'{self.name} = dict([{joined_entries}])'


class ArrayWriter(CodeWriter):
    """Writes a sequence of entries as a Python list literal assignment."""

    def __init__(self, name, value_type, entries):
        CodeWriter.__init__(self, name)
        self.entries = []

        # Entries are wrapped in single quotes only when the mapped type is 'string'.
        quote = "'" if to_python_type(value_type) == 'string' else ''

        for raw in entries:
            # Backslash-escape single quotes inside the entry.
            escaped = raw.replace("'", "\\'")
            self.entries.append(quote + escaped + quote)

    def write(self):
        """Return the line ``<name> = [<entry>, <entry>, ...]``."""
        return '{} = [{}]'.format(self.name, ', '.join(self.entries))

def sanitize(value: str, value_type = None, tokens = None):
value = value.replace('{','{{').replace('}','}}')

def sanitize(value: str, value_type=None, tokens=None):
value = value.replace('{', '{{').replace('}', '}}')
if tokens:
for token in tokens:
value = value.replace(f'{{{token}}}', token)
Expand All @@ -84,7 +97,8 @@ def sanitize(value: str, value_type = None, tokens = None):
except:
stringified = '"' + value + '"'

return stringified[1:len(stringified)-1].replace("'", r"\'")
return stringified[1:len(stringified) - 1].replace("'", r"\'")


def to_python_type(type_: str) -> str:
if type_ == 'long':
Expand All @@ -94,6 +108,7 @@ def to_python_type(type_: str) -> str:
else:
return type_


def generate_code(root):
lines = []
for token_name in root:
Expand Down

0 comments on commit e83d8e8

Please sign in to comment.