From 725049de5cefb525b585f3472843504296110658 Mon Sep 17 00:00:00 2001 From: Robin Stevens Date: Wed, 11 Jan 2023 15:07:48 +0100 Subject: [PATCH 1/4] Improve performance when parsing many strings in the same format --- .gitignore | 3 + .../sisyphsu/dateparser/DateParser.java | 100 ++++++++++++++---- .../dateparser/DateParserBuilder.java | 16 ++- .../sisyphsu/dateparser/DateBuilderTest.java | 22 ++++ ...mizeForReuseSimilarFormattedBenchmark.java | 44 ++++++++ 5 files changed, 161 insertions(+), 24 deletions(-) create mode 100644 src/test/java/com/github/sisyphsu/dateparser/benchmark/OptimizeForReuseSimilarFormattedBenchmark.java diff --git a/.gitignore b/.gitignore index 135dc4f..d786481 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,6 @@ hs_err_pid* .idea *.iml + +# Output folder of IntelliJ +target diff --git a/src/main/java/com/github/sisyphsu/dateparser/DateParser.java b/src/main/java/com/github/sisyphsu/dateparser/DateParser.java index 056dd1f..3fc3db2 100644 --- a/src/main/java/com/github/sisyphsu/dateparser/DateParser.java +++ b/src/main/java/com/github/sisyphsu/dateparser/DateParser.java @@ -19,9 +19,15 @@ */ public final class DateParser { + private static final int MAXIMUM_NUMBER_OF_ERRORS = 10; + private final ReMatcher matcher; private final DateBuilder dt = new DateBuilder(); + private ReMatcher limitedRulesMatcher = null; + private final boolean optimizeForReuseSimilarFormatted; + private int encounteredErrorsCounter = 0; + private final List rules; private final Set standardRules; private final Map customizedRuleMap; @@ -29,16 +35,17 @@ public final class DateParser { private String input; private boolean preferMonthFirst; - DateParser(List rules, Set stdRules, Map cstRules, boolean preferMonthFirst) { + DateParser(List rules, Set stdRules, Map cstRules, boolean preferMonthFirst, boolean optimizeForReuseSimilarFormatted) { this.rules = rules; this.standardRules = stdRules; this.customizedRuleMap = cstRules; this.preferMonthFirst = 
preferMonthFirst; this.matcher = new ReMatcher(this.rules.toArray(new String[0])); + this.optimizeForReuseSimilarFormatted = optimizeForReuseSimilarFormatted; } /** - * Create an new DateParserBuilder which could be used for initialize DateParser. + * Create a new DateParserBuilder which could be used for initialize DateParser. * * @return DateParserBuilder instance */ @@ -111,20 +118,67 @@ public OffsetDateTime parseOffsetDateTime(String str) { * Execute datetime's parsing */ private void parse(final CharArray input) { - matcher.reset(input); + if (optimizeForReuseSimilarFormatted && encounteredErrorsCounter < MAXIMUM_NUMBER_OF_ERRORS) { + if (limitedRulesMatcher != null) { + //See if we can parse the input using the matcher which uses only a subset of the rules + try { + parse(input, limitedRulesMatcher); + return; + } catch (DateTimeParseException e) { + dt.reset(); + encounteredErrorsCounter++; + } + } else { + //Find the rules that are needed to parse the input, and create a matcher with that subset of rules + matcher.reset(input); + int offset = 0; + int oldEnd = -1; + List reducedAllRules = new ArrayList<>(); + while (matcher.find(offset)) { + if (oldEnd == matcher.end()) { + encounteredErrorsCounter = MAXIMUM_NUMBER_OF_ERRORS; + parse(input, matcher); + return; + } + String usedRule = matcher.re(); + if (standardRules.contains(usedRule)) { + reducedAllRules.add(usedRule); + } else { + reducedAllRules.add(usedRule); + } + offset = matcher.end(); + oldEnd = offset; + } + if (offset != input.length()) { + encounteredErrorsCounter = MAXIMUM_NUMBER_OF_ERRORS; + parse(input, matcher); + return; + } + //At this point, we could parse the input meaning we found the relevant rules + //Store it for the next time + limitedRulesMatcher = new ReMatcher(reducedAllRules.toArray(new String[0])); + } + parse(input, matcher); + } else { + parse(input, matcher); + } + } + + private void parse(final CharArray input, ReMatcher m) throws DateTimeParseException{ + m.reset(input); 
int offset = 0; int oldEnd = -1; - while (matcher.find(offset)) { - if (oldEnd == matcher.end()) { + while (m.find(offset)) { + if (oldEnd == m.end()) { throw error(offset, "empty matching at " + offset); } - if (standardRules.contains(matcher.re())) { - this.parseStandard(input, offset); + if (standardRules.contains(m.re())) { + this.parseStandard(input, offset, m); } else { - RuleHandler handler = customizedRuleMap.get(matcher.re()); - handler.handle(input, matcher, dt); + RuleHandler handler = customizedRuleMap.get(m.re()); + handler.handle(input, m, dt); } - offset = matcher.end(); + offset = m.end(); oldEnd = offset; } if (offset != input.length()) { @@ -135,13 +189,13 @@ private void parse(final CharArray input) { /** * Parse datetime use standard rules. */ - void parseStandard(CharArray input, int offset) { - for (int index = 1; index <= matcher.groupCount(); index++) { - final String groupName = matcher.groupName(index); - final int startOff = matcher.start(index); - final int endOff = matcher.end(index); + void parseStandard(CharArray input, int offset, ReMatcher m) { + for (int index = 1; index <= m.groupCount(); index++) { + final String groupName = m.groupName(index); + final int startOff = m.start(index); + final int endOff = m.end(index); if (groupName == null) { - throw error(offset, "Hit invalid standard rule: " + matcher.re()); + throw error(offset, "Hit invalid standard rule: " + m.re()); } if (startOff == -1 && endOff == -1) { continue; @@ -226,13 +280,13 @@ void parseStandard(CharArray input, int offset) { dt.ns = parseNum(input, endOff - 9, endOff); break; default: - throw error(offset, "Hit invalid standard rule: " + matcher.re()); + throw error(offset, "Hit invalid standard rule: " + m.re()); } } } /** - * Parse an subsequence which represent dd/mm or mm/dd, it should be more smart for different locales. + * Parse a subsequence which represent dd/mm or mm/dd, it should be more smart for different locales. 
 */ void parseDayOrMonth(CharArray input, int from, int to) { char next = input.data[from + 1]; @@ -257,7 +311,7 @@ void parseDayOrMonth(CharArray input, int from, int to) { } /** - * Parse an subsequence which represent year, like '2019', '19' etc + * Parse a subsequence which represents a year, like '2019', '19' etc */ int parseYear(CharArray input, int from, int to) { switch (to - from) { @@ -274,7 +328,7 @@ int parseYear(CharArray input, int from, int to) { } /** - * Parse an subsequence which represent the offset of timezone, like '+0800', '+08', '+8:00', '+08:00' etc + * Parse a subsequence which represents a timezone offset, like '+0800', '+08', '+8:00', '+08:00' etc */ int parseZoneOffset(CharArray input, int from, int to) { boolean neg = input.data[from] == '-'; @@ -301,7 +355,7 @@ int parseZoneOffset(CharArray input, int from, int to) { } /** - * Parse an subsequence which suffix second, like '.2000', '.3186369', '.257000000' etc + * Parse a subsequence which is the fractional suffix of the seconds, like '.2000', '.3186369', '.257000000' etc * It should be treated as ms/us/ns. 
 */ int parseNano(CharArray input, int from, int to) { @@ -314,7 +368,7 @@ int parseNano(CharArray input, int from, int to) { } /** - * Parse an subsequence which represent week, like 'Monday', 'mon' etc + * Parse a subsequence which represents a day of the week, like 'Monday', 'mon' etc */ int parseWeek(CharArray input, int from) { switch (input.data[from]) { @@ -345,7 +399,7 @@ int parseWeek(CharArray input, int from) { } /** - * Parse an subsequence which represent month, like '12', 'Feb' etc + * Parse a subsequence which represents a month, like '12', 'Feb' etc */ int parseMonth(CharArray input, int from, int to) { if (to - from <= 2) { diff --git a/src/main/java/com/github/sisyphsu/dateparser/DateParserBuilder.java b/src/main/java/com/github/sisyphsu/dateparser/DateParserBuilder.java index 474e75e..91ce341 100644 --- a/src/main/java/com/github/sisyphsu/dateparser/DateParserBuilder.java +++ b/src/main/java/com/github/sisyphsu/dateparser/DateParserBuilder.java @@ -145,6 +145,8 @@ static synchronized void register(String re, RuleHandler handler) { } private boolean preferMonthFirst = false; + private boolean optimizeForReuseSimilarFormatted = false; + private final List rules = new ArrayList<>(); private final Set standardRules = new HashSet<>(); private final Map customizedRuleMap = new HashMap<>(); @@ -158,6 +160,18 @@ static synchronized void register(String re, RuleHandler handler) { this.customizedRuleMap.putAll(DateParserBuilder.CUSTOMIZED_RULE_MAP); } + /** + * Set to {@code true} when the parser will be used to parse many date strings which all use the same format. + * An example use-case is parsing a timestamp column from a large CSV file. + * + * @param optimizeForReuseSimilarFormatted True means creating a parser optimized to parse many date strings in the same format. 
+ * @return This + */ + public DateParserBuilder optimizeForReuseSimilarFormatted(boolean optimizeForReuseSimilarFormatted){ + this.optimizeForReuseSimilarFormatted = optimizeForReuseSimilarFormatted; + return this; + } + /** * Mark this parser prefer mm/dd or not. * @@ -204,7 +218,7 @@ public DateParserBuilder addRule(String rule, RuleHandler handler) { * @return DateParser */ public DateParser build() { - return new DateParser(rules, standardRules, customizedRuleMap, preferMonthFirst); + return new DateParser(rules, standardRules, customizedRuleMap, preferMonthFirst, optimizeForReuseSimilarFormatted); } } diff --git a/src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java b/src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java index 857e0ee..b695e16 100644 --- a/src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java +++ b/src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java @@ -4,8 +4,11 @@ import java.time.*; import java.util.Date; +import java.util.Random; import java.util.TimeZone; +import static org.junit.jupiter.api.Assertions.assertEquals; + /** * @author sulin * @since 2019-09-14 16:48:50 @@ -122,4 +125,23 @@ public void testTimestamp() { assert date.getTime() == Long.valueOf(timestamp); } + @Test + public void testOptimizeForReuseSimilarFormatted(){ + Random random = new Random(123456789l); + String[] inputs = new String[500000]; + for (int i = 0; i < inputs.length; i++) { + inputs[i] = String.format("2020-0%d-1%d 00:%d%d:00 UTC", + random.nextInt(8) + 1, + random.nextInt(8) + 1, + random.nextInt(5), + random.nextInt(9)); + } + DateParser regular = DateParser.newBuilder().build(); + DateParser optimized = DateParser.newBuilder().optimizeForReuseSimilarFormatted(true).build(); + + for (int i = 0; i < inputs.length; i++) { + String input = inputs[i]; + assertEquals(regular.parseDate(input), optimized.parseDate(input)); + } + } } \ No newline at end of file diff --git 
a/src/test/java/com/github/sisyphsu/dateparser/benchmark/OptimizeForReuseSimilarFormattedBenchmark.java b/src/test/java/com/github/sisyphsu/dateparser/benchmark/OptimizeForReuseSimilarFormattedBenchmark.java new file mode 100644 index 0000000..8919f89 --- /dev/null +++ b/src/test/java/com/github/sisyphsu/dateparser/benchmark/OptimizeForReuseSimilarFormattedBenchmark.java @@ -0,0 +1,44 @@ +package com.github.sisyphsu.dateparser.benchmark; + +import com.github.sisyphsu.dateparser.DateParser; +import org.openjdk.jmh.annotations.*; + +import java.util.Random; +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 2, time = 2) +@BenchmarkMode(Mode.AverageTime) +@Fork(2) +@Measurement(iterations = 3, time = 3) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +public class OptimizeForReuseSimilarFormattedBenchmark { + private static final String[] TEXTS; + + static { + Random random = new Random(123456789l); + TEXTS = new String[500000]; + for (int i = 0; i < TEXTS.length; i++) { + TEXTS[i] = String.format("2020-0%d-1%d 00:%d%d:00 UTC", + random.nextInt(8) + 1, + random.nextInt(8) + 1, + random.nextInt(5), + random.nextInt(9)); + } + } + + @Benchmark + public void regularParser() { + DateParser parser = DateParser.newBuilder().build(); + for (String text : TEXTS) { + parser.parseDate(text); + } + } + + @Benchmark + public void optimizedForReuseParser() { + DateParser parser = DateParser.newBuilder().optimizeForReuseSimilarFormatted(true).build(); + for (String text : TEXTS) { + parser.parseDate(text); + } + } +} From b40d10fdca13fa58d6d5a29be2d3426ba44dcdf5 Mon Sep 17 00:00:00 2001 From: Robin Stevens Date: Wed, 11 Jan 2023 15:07:48 +0100 Subject: [PATCH 2/4] Improve performance when parsing many strings in the same format --- .gitignore | 3 + .../sisyphsu/dateparser/DateParser.java | 100 ++++++++++++++---- .../dateparser/DateParserBuilder.java | 16 ++- .../sisyphsu/dateparser/DateBuilderTest.java | 22 ++++ ...mizeForReuseSimilarFormattedBenchmark.java | 44 ++++++++ 5 
files changed, 161 insertions(+), 24 deletions(-) create mode 100644 src/test/java/com/github/sisyphsu/dateparser/benchmark/OptimizeForReuseSimilarFormattedBenchmark.java diff --git a/.gitignore b/.gitignore index 135dc4f..d786481 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,6 @@ hs_err_pid* .idea *.iml + +# Output folder of IntelliJ +target diff --git a/src/main/java/com/github/sisyphsu/dateparser/DateParser.java b/src/main/java/com/github/sisyphsu/dateparser/DateParser.java index 1c9c2af..d94158e 100644 --- a/src/main/java/com/github/sisyphsu/dateparser/DateParser.java +++ b/src/main/java/com/github/sisyphsu/dateparser/DateParser.java @@ -19,9 +19,15 @@ */ public final class DateParser { + private static final int MAXIMUM_NUMBER_OF_ERRORS = 10; + private final ReMatcher matcher; private final DateBuilder dt = new DateBuilder(); + private ReMatcher limitedRulesMatcher = null; + private final boolean optimizeForReuseSimilarFormatted; + private int encounteredErrorsCounter = 0; + private final List rules; private final Set standardRules; private final Map customizedRuleMap; @@ -29,16 +35,17 @@ public final class DateParser { private String input; private boolean preferMonthFirst; - DateParser(List rules, Set stdRules, Map cstRules, boolean preferMonthFirst) { + DateParser(List rules, Set stdRules, Map cstRules, boolean preferMonthFirst, boolean optimizeForReuseSimilarFormatted) { this.rules = rules; this.standardRules = stdRules; this.customizedRuleMap = cstRules; this.preferMonthFirst = preferMonthFirst; this.matcher = new ReMatcher(this.rules.toArray(new String[0])); + this.optimizeForReuseSimilarFormatted = optimizeForReuseSimilarFormatted; } /** - * Create an new DateParserBuilder which could be used for initialize DateParser. + * Create a new DateParserBuilder which could be used for initialize DateParser. 
* * @return DateParserBuilder instance */ @@ -111,20 +118,67 @@ public OffsetDateTime parseOffsetDateTime(String str) { * Execute datetime's parsing */ private void parse(final CharArray input) { - matcher.reset(input); + if (optimizeForReuseSimilarFormatted && encounteredErrorsCounter < MAXIMUM_NUMBER_OF_ERRORS) { + if (limitedRulesMatcher != null) { + //See if we can parse the input using the matcher which uses only a subset of the rules + try { + parse(input, limitedRulesMatcher); + return; + } catch (DateTimeParseException e) { + dt.reset(); + encounteredErrorsCounter++; + } + } else { + //Find the rules that are needed to parse the input, and create a matcher with that subset of rules + matcher.reset(input); + int offset = 0; + int oldEnd = -1; + List reducedAllRules = new ArrayList<>(); + while (matcher.find(offset)) { + if (oldEnd == matcher.end()) { + encounteredErrorsCounter = MAXIMUM_NUMBER_OF_ERRORS; + parse(input, matcher); + return; + } + String usedRule = matcher.re(); + if (standardRules.contains(usedRule)) { + reducedAllRules.add(usedRule); + } else { + reducedAllRules.add(usedRule); + } + offset = matcher.end(); + oldEnd = offset; + } + if (offset != input.length()) { + encounteredErrorsCounter = MAXIMUM_NUMBER_OF_ERRORS; + parse(input, matcher); + return; + } + //At this point, we could parse the input meaning we found the relevant rules + //Store it for the next time + limitedRulesMatcher = new ReMatcher(reducedAllRules.toArray(new String[0])); + } + parse(input, matcher); + } else { + parse(input, matcher); + } + } + + private void parse(final CharArray input, ReMatcher m) throws DateTimeParseException{ + m.reset(input); int offset = 0; int oldEnd = -1; - while (matcher.find(offset)) { - if (oldEnd == matcher.end()) { + while (m.find(offset)) { + if (oldEnd == m.end()) { throw error(offset, "empty matching at " + offset); } - if (standardRules.contains(matcher.re())) { - this.parseStandard(input, offset); + if (standardRules.contains(m.re())) { 
+ this.parseStandard(input, offset, m); } else { - RuleHandler handler = customizedRuleMap.get(matcher.re()); - handler.handle(input, matcher, dt); + RuleHandler handler = customizedRuleMap.get(m.re()); + handler.handle(input, m, dt); } - offset = matcher.end(); + offset = m.end(); oldEnd = offset; } if (offset != input.length()) { @@ -135,13 +189,13 @@ private void parse(final CharArray input) { /** * Parse datetime use standard rules. */ - void parseStandard(CharArray input, int offset) { - for (int index = 1; index <= matcher.groupCount(); index++) { - final String groupName = matcher.groupName(index); - final int startOff = matcher.start(index); - final int endOff = matcher.end(index); + void parseStandard(CharArray input, int offset, ReMatcher m) { + for (int index = 1; index <= m.groupCount(); index++) { + final String groupName = m.groupName(index); + final int startOff = m.start(index); + final int endOff = m.end(index); if (groupName == null) { - throw error(offset, "Hit invalid standard rule: " + matcher.re()); + throw error(offset, "Hit invalid standard rule: " + m.re()); } if (startOff == -1 && endOff == -1) { continue; @@ -226,13 +280,13 @@ void parseStandard(CharArray input, int offset) { dt.ns = parseNum(input, endOff - 9, endOff); break; default: - throw error(offset, "Hit invalid standard rule: " + matcher.re()); + throw error(offset, "Hit invalid standard rule: " + m.re()); } } } /** - * Parse an subsequence which represent dd/mm or mm/dd, it should be more smart for different locales. + * Parse a subsequence which represent dd/mm or mm/dd, it should be more smart for different locales. 
 */ void parseDayOrMonth(CharArray input, int from, int to) { char next = input.data[from + 1]; @@ -257,7 +311,7 @@ void parseDayOrMonth(CharArray input, int from, int to) { } /** - * Parse an subsequence which represent year, like '2019', '19' etc + * Parse a subsequence which represents a year, like '2019', '19' etc */ int parseYear(CharArray input, int from, int to) { switch (to - from) { @@ -274,7 +328,7 @@ int parseYear(CharArray input, int from, int to) { } /** - * Parse an subsequence which represent the offset of timezone, like '+0800', '+08', '+8:00', '+08:00' etc + * Parse a subsequence which represents a timezone offset, like '+0800', '+08', '+8:00', '+08:00' etc */ int parseZoneOffset(CharArray input, int from, int to) { boolean neg = input.data[from] == '-'; @@ -301,7 +355,7 @@ int parseZoneOffset(CharArray input, int from, int to) { } /** - * Parse an subsequence which suffix second, like '.2000', '.3186369', '.257000000' etc + * Parse a subsequence which is the fractional suffix of the seconds, like '.2000', '.3186369', '.257000000' etc * It should be treated as ms/us/ns. 
 */ int parseNano(CharArray input, int from, int to) { @@ -314,7 +368,7 @@ int parseNano(CharArray input, int from, int to) { } /** - * Parse an subsequence which represent week, like 'Monday', 'mon' etc + * Parse a subsequence which represents a day of the week, like 'Monday', 'mon' etc */ int parseWeek(CharArray input, int from) { switch (input.data[from]) { @@ -345,7 +399,7 @@ int parseWeek(CharArray input, int from) { } /** - * Parse an subsequence which represent month, like '12', 'Feb' etc + * Parse a subsequence which represents a month, like '12', 'Feb' etc */ int parseMonth(CharArray input, int from, int to) { if (to - from <= 2) { diff --git a/src/main/java/com/github/sisyphsu/dateparser/DateParserBuilder.java b/src/main/java/com/github/sisyphsu/dateparser/DateParserBuilder.java index 474e75e..91ce341 100644 --- a/src/main/java/com/github/sisyphsu/dateparser/DateParserBuilder.java +++ b/src/main/java/com/github/sisyphsu/dateparser/DateParserBuilder.java @@ -145,6 +145,8 @@ static synchronized void register(String re, RuleHandler handler) { } private boolean preferMonthFirst = false; + private boolean optimizeForReuseSimilarFormatted = false; + private final List rules = new ArrayList<>(); private final Set standardRules = new HashSet<>(); private final Map customizedRuleMap = new HashMap<>(); @@ -158,6 +160,18 @@ static synchronized void register(String re, RuleHandler handler) { this.customizedRuleMap.putAll(DateParserBuilder.CUSTOMIZED_RULE_MAP); } + /** + * Set to {@code true} when the parser will be used to parse many date strings which all use the same format. + * An example use-case is parsing a timestamp column from a large CSV file. + * + * @param optimizeForReuseSimilarFormatted True means creating a parser optimized to parse many date strings in the same format. 
+ * @return This + */ + public DateParserBuilder optimizeForReuseSimilarFormatted(boolean optimizeForReuseSimilarFormatted){ + this.optimizeForReuseSimilarFormatted = optimizeForReuseSimilarFormatted; + return this; + } + /** * Mark this parser prefer mm/dd or not. * @@ -204,7 +218,7 @@ public DateParserBuilder addRule(String rule, RuleHandler handler) { * @return DateParser */ public DateParser build() { - return new DateParser(rules, standardRules, customizedRuleMap, preferMonthFirst); + return new DateParser(rules, standardRules, customizedRuleMap, preferMonthFirst, optimizeForReuseSimilarFormatted); } } diff --git a/src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java b/src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java index 5694fb2..1f26c68 100644 --- a/src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java +++ b/src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java @@ -4,8 +4,11 @@ import java.time.*; import java.util.Date; +import java.util.Random; import java.util.TimeZone; +import static org.junit.jupiter.api.Assertions.assertEquals; + /** * @author sulin * @since 2019-09-14 16:48:50 @@ -127,4 +130,23 @@ public void testTimestamp() { assert date.getTime() == Long.valueOf(timestamp); } + @Test + public void testOptimizeForReuseSimilarFormatted(){ + Random random = new Random(123456789l); + String[] inputs = new String[500000]; + for (int i = 0; i < inputs.length; i++) { + inputs[i] = String.format("2020-0%d-1%d 00:%d%d:00 UTC", + random.nextInt(8) + 1, + random.nextInt(8) + 1, + random.nextInt(5), + random.nextInt(9)); + } + DateParser regular = DateParser.newBuilder().build(); + DateParser optimized = DateParser.newBuilder().optimizeForReuseSimilarFormatted(true).build(); + + for (int i = 0; i < inputs.length; i++) { + String input = inputs[i]; + assertEquals(regular.parseDate(input), optimized.parseDate(input)); + } + } } \ No newline at end of file diff --git 
a/src/test/java/com/github/sisyphsu/dateparser/benchmark/OptimizeForReuseSimilarFormattedBenchmark.java b/src/test/java/com/github/sisyphsu/dateparser/benchmark/OptimizeForReuseSimilarFormattedBenchmark.java new file mode 100644 index 0000000..8919f89 --- /dev/null +++ b/src/test/java/com/github/sisyphsu/dateparser/benchmark/OptimizeForReuseSimilarFormattedBenchmark.java @@ -0,0 +1,44 @@ +package com.github.sisyphsu.dateparser.benchmark; + +import com.github.sisyphsu.dateparser.DateParser; +import org.openjdk.jmh.annotations.*; + +import java.util.Random; +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 2, time = 2) +@BenchmarkMode(Mode.AverageTime) +@Fork(2) +@Measurement(iterations = 3, time = 3) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +public class OptimizeForReuseSimilarFormattedBenchmark { + private static final String[] TEXTS; + + static { + Random random = new Random(123456789l); + TEXTS = new String[500000]; + for (int i = 0; i < TEXTS.length; i++) { + TEXTS[i] = String.format("2020-0%d-1%d 00:%d%d:00 UTC", + random.nextInt(8) + 1, + random.nextInt(8) + 1, + random.nextInt(5), + random.nextInt(9)); + } + } + + @Benchmark + public void regularParser() { + DateParser parser = DateParser.newBuilder().build(); + for (String text : TEXTS) { + parser.parseDate(text); + } + } + + @Benchmark + public void optimizedForReuseParser() { + DateParser parser = DateParser.newBuilder().optimizeForReuseSimilarFormatted(true).build(); + for (String text : TEXTS) { + parser.parseDate(text); + } + } +} From 84fd1e3c5ddfd8bccbef5cf14f61b9c612e31acb Mon Sep 17 00:00:00 2001 From: Robin Stevens Date: Fri, 17 Feb 2023 16:44:47 +0100 Subject: [PATCH 3/4] Clarified the code some more * Some additional comments * Some small tweaks to the code in an attempt to improve readability * Extended the test case a bit --- pom.xml | 4 +- .../sisyphsu/dateparser/DateParser.java | 60 ++++++++++--------- .../sisyphsu/dateparser/DateBuilderTest.java | 8 +++ 3 files changed, 43 
insertions(+), 29 deletions(-) diff --git a/pom.xml b/pom.xml index a124835..f049f9c 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ 4.0.0 com.github.sisyphsu - dateparser + dateparser-xyzt-ai 1.0.11 dateparser @@ -210,4 +210,4 @@ - \ No newline at end of file + diff --git a/src/main/java/com/github/sisyphsu/dateparser/DateParser.java b/src/main/java/com/github/sisyphsu/dateparser/DateParser.java index d94158e..875dd05 100644 --- a/src/main/java/com/github/sisyphsu/dateparser/DateParser.java +++ b/src/main/java/com/github/sisyphsu/dateparser/DateParser.java @@ -118,46 +118,52 @@ public OffsetDateTime parseOffsetDateTime(String str) { * Execute datetime's parsing */ private void parse(final CharArray input) { + // When the optimizeForReuseSimilarFormatted flag is set, we assume that the parser is + // used for multiple input strings in the same format + // * Remember which rules were used to parse the first input string + // * When parsing the second string, first try with the same rules as for the first input string + // * If this succeeds, we have a performance gain + // * If this fails, increment an error counter, and parse instead with all the rules + // * If the error counter passes a threshold, stop trying to parse input strings with the rules from the first string + // and fall back to the regular parsing code path that uses all the rules. 
+ // The input strings were clearly not formatted in the same way if (optimizeForReuseSimilarFormatted && encounteredErrorsCounter < MAXIMUM_NUMBER_OF_ERRORS) { if (limitedRulesMatcher != null) { - //See if we can parse the input using the matcher which uses only a subset of the rules try { parse(input, limitedRulesMatcher); - return; } catch (DateTimeParseException e) { dt.reset(); encounteredErrorsCounter++; + //Parsing with our subset of rules failed, so fall back to the matcher which uses all the rules + parse(input, matcher); } - } else { - //Find the rules that are needed to parse the input, and create a matcher with that subset of rules - matcher.reset(input); - int offset = 0; - int oldEnd = -1; - List reducedAllRules = new ArrayList<>(); - while (matcher.find(offset)) { - if (oldEnd == matcher.end()) { - encounteredErrorsCounter = MAXIMUM_NUMBER_OF_ERRORS; - parse(input, matcher); - return; - } - String usedRule = matcher.re(); - if (standardRules.contains(usedRule)) { - reducedAllRules.add(usedRule); - } else { - reducedAllRules.add(usedRule); - } - offset = matcher.end(); - oldEnd = offset; - } - if (offset != input.length()) { + return; + } + //Find the rules that are needed to parse the input, and create a matcher with that subset of rules + matcher.reset(input); + int offset = 0; + int oldEnd = -1; + List reducedAllRules = new ArrayList<>(); + while (matcher.find(offset)) { + if (oldEnd == matcher.end()) { encounteredErrorsCounter = MAXIMUM_NUMBER_OF_ERRORS; parse(input, matcher); return; } - //At this point, we could parse the input meaning we found the relevant rules - //Store it for the next time - limitedRulesMatcher = new ReMatcher(reducedAllRules.toArray(new String[0])); + String usedRule = matcher.re(); + reducedAllRules.add(usedRule); + offset = matcher.end(); + oldEnd = offset; } + if (offset != input.length()) { + encounteredErrorsCounter = MAXIMUM_NUMBER_OF_ERRORS; + parse(input, matcher); + return; + } + //At this point, we could parse the 
input meaning we found the relevant rules + //Store it for the next time + limitedRulesMatcher = new ReMatcher(reducedAllRules.toArray(new String[0])); + parse(input, matcher); } else { parse(input, matcher); diff --git a/src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java b/src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java index 1f26c68..32e270d 100644 --- a/src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java +++ b/src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java @@ -148,5 +148,13 @@ public void testOptimizeForReuseSimilarFormatted(){ String input = inputs[i]; assertEquals(regular.parseDate(input), optimized.parseDate(input)); } + + //Now check if the parser can still deal with a date in a different format + String inputInDifferentFormat = String.format("1%d/0%d/2020 00:%d%d:00 UTC", + random.nextInt(8) + 1, + random.nextInt(8) + 1, + random.nextInt(5), + random.nextInt(9)); + assertEquals(regular.parseDate(inputInDifferentFormat), optimized.parseDate(inputInDifferentFormat)); } } \ No newline at end of file From 8a9da3b92650f1c47d32e9737fb45ef1e85ed3ae Mon Sep 17 00:00:00 2001 From: Robin Stevens Date: Fri, 17 Feb 2023 16:50:33 +0100 Subject: [PATCH 4/4] Restored the pom.xml to the original version --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index f049f9c..67b121a 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ 4.0.0 com.github.sisyphsu - dateparser-xyzt-ai + dateparser 1.0.11 dateparser