Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance when parsing many strings in the same format #28

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@
hs_err_pid*
.idea
*.iml

# Output folder of IntelliJ
target
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -210,4 +210,4 @@
</resources>
</build>

</project>
</project>
106 changes: 83 additions & 23 deletions src/main/java/com/github/sisyphsu/dateparser/DateParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,33 @@
*/
public final class DateParser {

private static final int MAXIMUM_NUMBER_OF_ERRORS = 10;

private final ReMatcher matcher;
private final DateBuilder dt = new DateBuilder();

private ReMatcher limitedRulesMatcher = null;
private final boolean optimizeForReuseSimilarFormatted;
private int encounteredErrorsCounter = 0;

private final List<String> rules;
private final Set<String> standardRules;
private final Map<String, RuleHandler> customizedRuleMap;

private String input;
private boolean preferMonthFirst;

DateParser(List<String> rules, Set<String> stdRules, Map<String, RuleHandler> cstRules, boolean preferMonthFirst) {
DateParser(List<String> rules, Set<String> stdRules, Map<String, RuleHandler> cstRules, boolean preferMonthFirst, boolean optimizeForReuseSimilarFormatted) {
this.rules = rules;
this.standardRules = stdRules;
this.customizedRuleMap = cstRules;
this.preferMonthFirst = preferMonthFirst;
this.matcher = new ReMatcher(this.rules.toArray(new String[0]));
this.optimizeForReuseSimilarFormatted = optimizeForReuseSimilarFormatted;
}

/**
* Create an new DateParserBuilder which could be used for initialize DateParser.
* Create a new DateParserBuilder which could be used for initialize DateParser.
*
* @return DateParserBuilder instance
*/
Expand Down Expand Up @@ -111,20 +118,73 @@ public OffsetDateTime parseOffsetDateTime(String str) {
* Execute datetime's parsing
*/
private void parse(final CharArray input) {
    // Regular path: the reuse optimization is switched off, or it has been abandoned
    // because the error counter reached MAXIMUM_NUMBER_OF_ERRORS.
    if (!optimizeForReuseSimilarFormatted || encounteredErrorsCounter >= MAXIMUM_NUMBER_OF_ERRORS) {
        parse(input, matcher);
        return;
    }

    // Optimized path: an earlier input already told us which rules are needed, so try
    // only those first.
    if (limitedRulesMatcher != null) {
        try {
            parse(input, limitedRulesMatcher);
        } catch (DateTimeParseException e) {
            // The reduced rule set could not handle this input. Reset the builder before
            // retrying (presumably clears fields the failed attempt filled in), count the
            // miss, and retry with the matcher that knows every rule.
            dt.reset();
            encounteredErrorsCounter++;
            parse(input, matcher);
        }
        return;
    }

    // First input seen with the optimization enabled: find out which rules it needs.
    final List<String> usedRules = collectMatchingRules(input);
    if (usedRules == null) {
        // The input cannot be tokenized with the known rules, so there is nothing to learn
        // from it. Give up on the optimization; the regular parse below reports the error.
        encounteredErrorsCounter = MAXIMUM_NUMBER_OF_ERRORS;
    } else {
        // Remember the subset of rules for subsequent inputs.
        limitedRulesMatcher = new ReMatcher(usedRules.toArray(new String[0]));
    }
    parse(input, matcher);
}

/**
 * Walks the input with the full matcher and records the rule that matched at each step.
 *
 * @param input the text to scan
 * @return the matched rules in match order (may contain repeats), or {@code null} when a
 *         match did not advance the end position or the rules did not cover the whole input
 */
private List<String> collectMatchingRules(final CharArray input) {
    matcher.reset(input);
    final List<String> usedRules = new ArrayList<>();
    int offset = 0;
    int oldEnd = -1;
    while (matcher.find(offset)) {
        if (oldEnd == matcher.end()) {
            // Empty match: the scan would never advance.
            return null;
        }
        usedRules.add(matcher.re());
        offset = matcher.end();
        oldEnd = offset;
    }
    return offset == input.length() ? usedRules : null;
}

private void parse(final CharArray input, ReMatcher m) throws DateTimeParseException{
m.reset(input);
int offset = 0;
int oldEnd = -1;
while (matcher.find(offset)) {
if (oldEnd == matcher.end()) {
while (m.find(offset)) {
if (oldEnd == m.end()) {
throw error(offset, "empty matching at " + offset);
}
if (standardRules.contains(matcher.re())) {
this.parseStandard(input, offset);
if (standardRules.contains(m.re())) {
this.parseStandard(input, offset, m);
} else {
RuleHandler handler = customizedRuleMap.get(matcher.re());
handler.handle(input, matcher, dt);
RuleHandler handler = customizedRuleMap.get(m.re());
handler.handle(input, m, dt);
}
offset = matcher.end();
offset = m.end();
oldEnd = offset;
}
if (offset != input.length()) {
Expand All @@ -135,13 +195,13 @@ private void parse(final CharArray input) {
/**
* Parse datetime use standard rules.
*/
void parseStandard(CharArray input, int offset) {
for (int index = 1; index <= matcher.groupCount(); index++) {
final String groupName = matcher.groupName(index);
final int startOff = matcher.start(index);
final int endOff = matcher.end(index);
void parseStandard(CharArray input, int offset, ReMatcher m) {
for (int index = 1; index <= m.groupCount(); index++) {
final String groupName = m.groupName(index);
final int startOff = m.start(index);
final int endOff = m.end(index);
if (groupName == null) {
throw error(offset, "Hit invalid standard rule: " + matcher.re());
throw error(offset, "Hit invalid standard rule: " + m.re());
}
if (startOff == -1 && endOff == -1) {
continue;
Expand Down Expand Up @@ -226,13 +286,13 @@ void parseStandard(CharArray input, int offset) {
dt.ns = parseNum(input, endOff - 9, endOff);
break;
default:
throw error(offset, "Hit invalid standard rule: " + matcher.re());
throw error(offset, "Hit invalid standard rule: " + m.re());
}
}
}

/**
* Parse an subsequence which represent dd/mm or mm/dd, it should be more smart for different locales.
* Parse a subsequence which represent dd/mm or mm/dd, it should be more smart for different locales.
*/
void parseDayOrMonth(CharArray input, int from, int to) {
char next = input.data[from + 1];
Expand All @@ -257,7 +317,7 @@ void parseDayOrMonth(CharArray input, int from, int to) {
}

/**
* Parse an subsequence which represent year, like '2019', '19' etc
* Parse a subsequence which represent year, like '2019', '19' etc
*/
int parseYear(CharArray input, int from, int to) {
switch (to - from) {
Expand All @@ -274,7 +334,7 @@ int parseYear(CharArray input, int from, int to) {
}

/**
* Parse an subsequence which represent the offset of timezone, like '+0800', '+08', '+8:00', '+08:00' etc
* Parse a subsequence which represent the offset of timezone, like '+0800', '+08', '+8:00', '+08:00' etc
*/
int parseZoneOffset(CharArray input, int from, int to) {
boolean neg = input.data[from] == '-';
Expand All @@ -301,7 +361,7 @@ int parseZoneOffset(CharArray input, int from, int to) {
}

/**
* Parse an subsequence which suffix second, like '.2000', '.3186369', '.257000000' etc
* Parse a subsequence which suffix second, like '.2000', '.3186369', '.257000000' etc
* It should be treated as ms/us/ns.
*/
int parseNano(CharArray input, int from, int to) {
Expand All @@ -314,7 +374,7 @@ int parseNano(CharArray input, int from, int to) {
}

/**
* Parse an subsequence which represent week, like 'Monday', 'mon' etc
* Parse a subsequence which represent week, like 'Monday', 'mon' etc
*/
int parseWeek(CharArray input, int from) {
switch (input.data[from]) {
Expand Down Expand Up @@ -345,7 +405,7 @@ int parseWeek(CharArray input, int from) {
}

/**
* Parse an subsequence which represent month, like '12', 'Feb' etc
* Parse a subsequence which represent month, like '12', 'Feb' etc
*/
int parseMonth(CharArray input, int from, int to) {
if (to - from <= 2) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ static synchronized void register(String re, RuleHandler handler) {
}

private boolean preferMonthFirst = false;
private boolean optimizeForReuseSimilarFormatted = false;

private final List<String> rules = new ArrayList<>();
private final Set<String> standardRules = new HashSet<>();
private final Map<String, RuleHandler> customizedRuleMap = new HashMap<>();
Expand All @@ -158,6 +160,18 @@ static synchronized void register(String re, RuleHandler handler) {
this.customizedRuleMap.putAll(DateParserBuilder.CUSTOMIZED_RULE_MAP);
}

/**
 * Enable or disable the optimization for parsing many date strings that share one format,
 * for example a timestamp column of a large CSV file.
 * <p>
 * The value is only stored on this builder; {@link #build()} hands it to the created
 * {@code DateParser}. When enabled, the parser remembers which rules matched its first
 * input and tries only those rules on later inputs, falling back to the complete rule set
 * whenever that attempt fails. The flag is {@code false} unless this method is called.
 *
 * @param optimizeForReuseSimilarFormatted {@code true} to create a parser optimized for
 *                                         many inputs in the same format
 * @return this builder, to allow call chaining
 */
public DateParserBuilder optimizeForReuseSimilarFormatted(boolean optimizeForReuseSimilarFormatted){
this.optimizeForReuseSimilarFormatted = optimizeForReuseSimilarFormatted;
return this;
}

/**
* Mark this parser prefer mm/dd or not.
*
Expand Down Expand Up @@ -204,7 +218,7 @@ public DateParserBuilder addRule(String rule, RuleHandler handler) {
* @return DateParser
*/
public DateParser build() {
return new DateParser(rules, standardRules, customizedRuleMap, preferMonthFirst);
return new DateParser(rules, standardRules, customizedRuleMap, preferMonthFirst, optimizeForReuseSimilarFormatted);
}

}
30 changes: 30 additions & 0 deletions src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@

import java.time.*;
import java.util.Date;
import java.util.Random;
import java.util.TimeZone;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
* @author sulin
* @since 2019-09-14 16:48:50
Expand Down Expand Up @@ -127,4 +130,31 @@ public void testTimestamp() {
assert date.getTime() == Long.valueOf(timestamp);
}

/**
 * Checks that a parser built with {@code optimizeForReuseSimilarFormatted(true)} returns the
 * same results as a default parser, both for a long run of identically formatted inputs and
 * for a later input in a different format.
 */
@Test
public void testOptimizeForReuseSimilarFormatted() {
    // Fixed seed keeps the generated inputs identical from run to run.
    Random random = new Random(123456789L);
    String[] inputs = new String[500000];
    for (int i = 0; i < inputs.length; i++) {
        // Plain concatenation instead of String.format("%d"): the formatter localizes digits
        // according to the default locale, which could turn the fixture into non-ASCII text.
        // The four values are drawn in the same order as before, so the inputs are unchanged.
        int month = random.nextInt(8) + 1;
        int day = random.nextInt(8) + 1;
        int minuteTens = random.nextInt(5);
        int minuteOnes = random.nextInt(9);
        inputs[i] = "2020-0" + month + "-1" + day + " 00:" + minuteTens + minuteOnes + ":00 UTC";
    }
    DateParser regular = DateParser.newBuilder().build();
    DateParser optimized = DateParser.newBuilder().optimizeForReuseSimilarFormatted(true).build();

    for (String input : inputs) {
        assertEquals(regular.parseDate(input), optimized.parseDate(input));
    }

    // Now check if the parser can still deal with a date in a different format
    int firstField = random.nextInt(8) + 1;
    int secondField = random.nextInt(8) + 1;
    int minuteTens = random.nextInt(5);
    int minuteOnes = random.nextInt(9);
    String inputInDifferentFormat =
            "1" + firstField + "/0" + secondField + "/2020 00:" + minuteTens + minuteOnes + ":00 UTC";
    assertEquals(regular.parseDate(inputInDifferentFormat), optimized.parseDate(inputInDifferentFormat));
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package com.github.sisyphsu.dateparser.benchmark;

import com.github.sisyphsu.dateparser.DateParser;
import org.openjdk.jmh.annotations.*;

import java.util.Random;
import java.util.concurrent.TimeUnit;

/**
 * JMH benchmark comparing a default {@link DateParser} with one built with
 * {@code optimizeForReuseSimilarFormatted(true)} on a large batch of timestamps that all share
 * the same format.
 */
@Warmup(iterations = 2, time = 2)
@BenchmarkMode(Mode.AverageTime)
@Fork(2)
@Measurement(iterations = 3, time = 3)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public class OptimizeForReuseSimilarFormattedBenchmark {

    /** Number of timestamps parsed per benchmark invocation. */
    private static final int TEXT_COUNT = 500000;

    /** Identically formatted timestamps shared by both benchmarks. */
    private static final String[] TEXTS = generateTexts();

    /**
     * Builds the benchmark input with a fixed seed so every fork parses the same strings.
     *
     * @return {@code TEXT_COUNT} strings shaped like {@code 2020-0M-1D 00:mm:00 UTC}
     */
    private static String[] generateTexts() {
        Random random = new Random(123456789L);
        String[] texts = new String[TEXT_COUNT];
        for (int i = 0; i < texts.length; i++) {
            // Concatenation rather than String.format("%d"): the formatter localizes digits
            // according to the default locale, which could silently change the fixture.
            // The random values are drawn in the same order as the format arguments were.
            int month = random.nextInt(8) + 1;
            int day = random.nextInt(8) + 1;
            int minuteTens = random.nextInt(5);
            int minuteOnes = random.nextInt(9);
            texts[i] = "2020-0" + month + "-1" + day + " 00:" + minuteTens + minuteOnes + ":00 UTC";
        }
        return texts;
    }

    /** Baseline: parses every text with a parser that always uses the full rule set. */
    @Benchmark
    public void regularParser() {
        // NOTE(review): the parse results are discarded; consider feeding them to a JMH
        // Blackhole so the JIT cannot treat the work as dead code.
        DateParser parser = DateParser.newBuilder().build();
        for (String text : TEXTS) {
            parser.parseDate(text);
        }
    }

    /** Candidate: parses every text with a parser optimized for same-format inputs. */
    @Benchmark
    public void optimizedForReuseParser() {
        DateParser parser = DateParser.newBuilder().optimizeForReuseSimilarFormatted(true).build();
        for (String text : TEXTS) {
            parser.parseDate(text);
        }
    }
}