diff --git a/.gitignore b/.gitignore index 135dc4f..d786481 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,6 @@ hs_err_pid* .idea *.iml + +# Maven build output folder (also used by IntelliJ) +target diff --git a/pom.xml b/pom.xml index a124835..67b121a 100644 --- a/pom.xml +++ b/pom.xml @@ -210,4 +210,4 @@ - \ No newline at end of file + diff --git a/src/main/java/com/github/sisyphsu/dateparser/DateParser.java b/src/main/java/com/github/sisyphsu/dateparser/DateParser.java index 1c9c2af..875dd05 100644 --- a/src/main/java/com/github/sisyphsu/dateparser/DateParser.java +++ b/src/main/java/com/github/sisyphsu/dateparser/DateParser.java @@ -19,9 +19,15 @@ */ public final class DateParser { + private static final int MAXIMUM_NUMBER_OF_ERRORS = 10; + private final ReMatcher matcher; private final DateBuilder dt = new DateBuilder(); + private ReMatcher limitedRulesMatcher = null; + private final boolean optimizeForReuseSimilarFormatted; + private int encounteredErrorsCounter = 0; + private final List rules; private final Set standardRules; private final Map customizedRuleMap; @@ -29,16 +35,17 @@ public final class DateParser { private String input; private boolean preferMonthFirst; - DateParser(List rules, Set stdRules, Map cstRules, boolean preferMonthFirst) { + DateParser(List rules, Set stdRules, Map cstRules, boolean preferMonthFirst, boolean optimizeForReuseSimilarFormatted) { this.rules = rules; this.standardRules = stdRules; this.customizedRuleMap = cstRules; this.preferMonthFirst = preferMonthFirst; this.matcher = new ReMatcher(this.rules.toArray(new String[0])); + this.optimizeForReuseSimilarFormatted = optimizeForReuseSimilarFormatted; } /** - * Create an new DateParserBuilder which could be used for initialize DateParser. + * Create a new DateParserBuilder which can be used to initialize a DateParser. 
* * @return DateParserBuilder instance */ @@ -111,20 +118,73 @@ public OffsetDateTime parseOffsetDateTime(String str) { * Execute datetime's parsing */ private void parse(final CharArray input) { - matcher.reset(input); + // When the optimizeForReuseSimilarFormatted flag is set, we assume that the parser is + // used for multiple input strings in the same format + // * Remember which rules were used to parse the first input string + // * When parsing the second string, first try with the same rules as for the first input string + // * If this succeeds, we have a performance gain + // * If this fails, increment an error counter, and parse instead with all the rules + // * If the error counter passes a threshold, stop trying to parse input strings with the rules from the first string + // and fall back to the regular parsing code path that uses all the rules. + // The input strings were clearly not formatted in the same way + if (optimizeForReuseSimilarFormatted && encounteredErrorsCounter < MAXIMUM_NUMBER_OF_ERRORS) { + if (limitedRulesMatcher != null) { + try { + parse(input, limitedRulesMatcher); + } catch (DateTimeParseException e) { + dt.reset(); + encounteredErrorsCounter++; + //Parsing with our subset of rules failed, so fall back to the matcher which uses all the rules + parse(input, matcher); + } + return; + } + //Find the rules that are needed to parse the input, and create a matcher with that subset of rules + matcher.reset(input); + int offset = 0; + int oldEnd = -1; + List reducedAllRules = new ArrayList<>(); + while (matcher.find(offset)) { + if (oldEnd == matcher.end()) { + encounteredErrorsCounter = MAXIMUM_NUMBER_OF_ERRORS; + parse(input, matcher); + return; + } + String usedRule = matcher.re(); + reducedAllRules.add(usedRule); + offset = matcher.end(); + oldEnd = offset; + } + if (offset != input.length()) { + encounteredErrorsCounter = MAXIMUM_NUMBER_OF_ERRORS; + parse(input, matcher); + return; + } + //At this point, we could parse the input 
meaning we found the relevant rules + //Store it for the next time + limitedRulesMatcher = new ReMatcher(reducedAllRules.toArray(new String[0])); + + parse(input, matcher); + } else { + parse(input, matcher); + } + } + + private void parse(final CharArray input, ReMatcher m) throws DateTimeParseException{ + m.reset(input); int offset = 0; int oldEnd = -1; - while (matcher.find(offset)) { - if (oldEnd == matcher.end()) { + while (m.find(offset)) { + if (oldEnd == m.end()) { throw error(offset, "empty matching at " + offset); } - if (standardRules.contains(matcher.re())) { - this.parseStandard(input, offset); + if (standardRules.contains(m.re())) { + this.parseStandard(input, offset, m); } else { - RuleHandler handler = customizedRuleMap.get(matcher.re()); - handler.handle(input, matcher, dt); + RuleHandler handler = customizedRuleMap.get(m.re()); + handler.handle(input, m, dt); } - offset = matcher.end(); + offset = m.end(); oldEnd = offset; } if (offset != input.length()) { @@ -135,13 +195,13 @@ private void parse(final CharArray input) { /** * Parse datetime use standard rules. 
*/ - void parseStandard(CharArray input, int offset) { - for (int index = 1; index <= matcher.groupCount(); index++) { - final String groupName = matcher.groupName(index); - final int startOff = matcher.start(index); - final int endOff = matcher.end(index); + void parseStandard(CharArray input, int offset, ReMatcher m) { + for (int index = 1; index <= m.groupCount(); index++) { + final String groupName = m.groupName(index); + final int startOff = m.start(index); + final int endOff = m.end(index); if (groupName == null) { - throw error(offset, "Hit invalid standard rule: " + matcher.re()); + throw error(offset, "Hit invalid standard rule: " + m.re()); } if (startOff == -1 && endOff == -1) { continue; @@ -226,13 +286,13 @@ void parseStandard(CharArray input, int offset) { dt.ns = parseNum(input, endOff - 9, endOff); break; default: - throw error(offset, "Hit invalid standard rule: " + matcher.re()); + throw error(offset, "Hit invalid standard rule: " + m.re()); } } } /** - * Parse an subsequence which represent dd/mm or mm/dd, it should be more smart for different locales. + * Parse a subsequence which represents dd/mm or mm/dd; it should be smarter about different locales. 
*/ void parseDayOrMonth(CharArray input, int from, int to) { char next = input.data[from + 1]; @@ -257,7 +317,7 @@ void parseDayOrMonth(CharArray input, int from, int to) { } /** - * Parse an subsequence which represent year, like '2019', '19' etc + * Parse a subsequence which represents a year, like '2019', '19' etc */ int parseYear(CharArray input, int from, int to) { switch (to - from) { @@ -274,7 +334,7 @@ int parseYear(CharArray input, int from, int to) { } /** - * Parse an subsequence which represent the offset of timezone, like '+0800', '+08', '+8:00', '+08:00' etc + * Parse a subsequence which represents the timezone offset, like '+0800', '+08', '+8:00', '+08:00' etc */ int parseZoneOffset(CharArray input, int from, int to) { boolean neg = input.data[from] == '-'; @@ -301,7 +361,7 @@ int parseZoneOffset(CharArray input, int from, int to) { } /** - * Parse an subsequence which suffix second, like '.2000', '.3186369', '.257000000' etc + * Parse a subsequence which is the fractional suffix of the seconds, like '.2000', '.3186369', '.257000000' etc * It should be treated as ms/us/ns. 
*/ int parseNano(CharArray input, int from, int to) { @@ -314,7 +374,7 @@ int parseNano(CharArray input, int from, int to) { } /** - * Parse an subsequence which represent week, like 'Monday', 'mon' etc + * Parse a subsequence which represents the day of the week, like 'Monday', 'mon' etc */ int parseWeek(CharArray input, int from) { switch (input.data[from]) { @@ -345,7 +405,7 @@ int parseWeek(CharArray input, int from) { } /** - * Parse an subsequence which represent month, like '12', 'Feb' etc + * Parse a subsequence which represents a month, like '12', 'Feb' etc */ int parseMonth(CharArray input, int from, int to) { if (to - from <= 2) { diff --git a/src/main/java/com/github/sisyphsu/dateparser/DateParserBuilder.java b/src/main/java/com/github/sisyphsu/dateparser/DateParserBuilder.java index 474e75e..91ce341 100644 --- a/src/main/java/com/github/sisyphsu/dateparser/DateParserBuilder.java +++ b/src/main/java/com/github/sisyphsu/dateparser/DateParserBuilder.java @@ -145,6 +145,8 @@ static synchronized void register(String re, RuleHandler handler) { } private boolean preferMonthFirst = false; + private boolean optimizeForReuseSimilarFormatted = false; + private final List rules = new ArrayList<>(); private final Set standardRules = new HashSet<>(); private final Map customizedRuleMap = new HashMap<>(); @@ -158,6 +160,18 @@ static synchronized void register(String re, RuleHandler handler) { this.customizedRuleMap.putAll(DateParserBuilder.CUSTOMIZED_RULE_MAP); } + /** + * Set to {@code true} when the parser will be used to parse many date strings which all use the same format. + * An example use-case is parsing a timestamp column from a large CSV file. + * + * @param optimizeForReuseSimilarFormatted True means creating a parser optimized to parse many date strings in the same format. 
+ * @return This + */ + public DateParserBuilder optimizeForReuseSimilarFormatted(boolean optimizeForReuseSimilarFormatted){ + this.optimizeForReuseSimilarFormatted = optimizeForReuseSimilarFormatted; + return this; + } + /** * Mark this parser prefer mm/dd or not. * @@ -204,7 +218,7 @@ public DateParserBuilder addRule(String rule, RuleHandler handler) { * @return DateParser */ public DateParser build() { - return new DateParser(rules, standardRules, customizedRuleMap, preferMonthFirst); + return new DateParser(rules, standardRules, customizedRuleMap, preferMonthFirst, optimizeForReuseSimilarFormatted); } } diff --git a/src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java b/src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java index 5694fb2..32e270d 100644 --- a/src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java +++ b/src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java @@ -4,8 +4,11 @@ import java.time.*; import java.util.Date; +import java.util.Random; import java.util.TimeZone; +import static org.junit.jupiter.api.Assertions.assertEquals; + /** * @author sulin * @since 2019-09-14 16:48:50 @@ -127,4 +130,31 @@ public void testTimestamp() { assert date.getTime() == Long.valueOf(timestamp); } + @Test + public void testOptimizeForReuseSimilarFormatted(){ + Random random = new Random(123456789l); + String[] inputs = new String[500000]; + for (int i = 0; i < inputs.length; i++) { + inputs[i] = String.format("2020-0%d-1%d 00:%d%d:00 UTC", + random.nextInt(8) + 1, + random.nextInt(8) + 1, + random.nextInt(5), + random.nextInt(9)); + } + DateParser regular = DateParser.newBuilder().build(); + DateParser optimized = DateParser.newBuilder().optimizeForReuseSimilarFormatted(true).build(); + + for (int i = 0; i < inputs.length; i++) { + String input = inputs[i]; + assertEquals(regular.parseDate(input), optimized.parseDate(input)); + } + + //Now check if the parser can still deal with a date in a different format + String 
inputInDifferentFormat = String.format("1%d/0%d/2020 00:%d%d:00 UTC", + random.nextInt(8) + 1, + random.nextInt(8) + 1, + random.nextInt(5), + random.nextInt(9)); + assertEquals(regular.parseDate(inputInDifferentFormat), optimized.parseDate(inputInDifferentFormat)); + } } \ No newline at end of file diff --git a/src/test/java/com/github/sisyphsu/dateparser/benchmark/OptimizeForReuseSimilarFormattedBenchmark.java b/src/test/java/com/github/sisyphsu/dateparser/benchmark/OptimizeForReuseSimilarFormattedBenchmark.java new file mode 100644 index 0000000..8919f89 --- /dev/null +++ b/src/test/java/com/github/sisyphsu/dateparser/benchmark/OptimizeForReuseSimilarFormattedBenchmark.java @@ -0,0 +1,44 @@ +package com.github.sisyphsu.dateparser.benchmark; + +import com.github.sisyphsu.dateparser.DateParser; +import org.openjdk.jmh.annotations.*; + +import java.util.Random; +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 2, time = 2) +@BenchmarkMode(Mode.AverageTime) +@Fork(2) +@Measurement(iterations = 3, time = 3) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +public class OptimizeForReuseSimilarFormattedBenchmark { + private static final String[] TEXTS; + + static { + Random random = new Random(123456789l); + TEXTS = new String[500000]; + for (int i = 0; i < TEXTS.length; i++) { + TEXTS[i] = String.format("2020-0%d-1%d 00:%d%d:00 UTC", + random.nextInt(8) + 1, + random.nextInt(8) + 1, + random.nextInt(5), + random.nextInt(9)); + } + } + + @Benchmark + public void regularParser() { + DateParser parser = DateParser.newBuilder().build(); + for (String text : TEXTS) { + parser.parseDate(text); + } + } + + @Benchmark + public void optimizedForReuseParser() { + DateParser parser = DateParser.newBuilder().optimizeForReuseSimilarFormatted(true).build(); + for (String text : TEXTS) { + parser.parseDate(text); + } + } +}