Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance when parsing many strings in the same format #28

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@
hs_err_pid*
.idea
*.iml

# Output folder of IntelliJ
target
100 changes: 77 additions & 23 deletions src/main/java/com/github/sisyphsu/dateparser/DateParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,33 @@
*/
public final class DateParser {

private static final int MAXIMUM_NUMBER_OF_ERRORS = 10;

private final ReMatcher matcher;
private final DateBuilder dt = new DateBuilder();

private ReMatcher limitedRulesMatcher = null;
private final boolean optimizeForReuseSimilarFormatted;
private int encounteredErrorsCounter = 0;

private final List<String> rules;
private final Set<String> standardRules;
private final Map<String, RuleHandler> customizedRuleMap;

private String input;
private boolean preferMonthFirst;

/**
 * Creates a parser over the given rule set.
 *
 * @param rules                            all rules; the full matcher is compiled from this list
 * @param stdRules                         the subset of rules that is parsed by the built-in standard handling
 * @param cstRules                         rule-to-handler mapping used for every rule that is not a standard rule
 * @param preferMonthFirst                 whether ambiguous dd/mm vs mm/dd input should be read as mm/dd
 * @param optimizeForReuseSimilarFormatted whether the parser should learn the rules used by the first input
 *                                         and try only those rules on later inputs
 */
DateParser(List<String> rules, Set<String> stdRules, Map<String, RuleHandler> cstRules,
           boolean preferMonthFirst, boolean optimizeForReuseSimilarFormatted) {
    this.rules = rules;
    this.standardRules = stdRules;
    this.customizedRuleMap = cstRules;
    this.preferMonthFirst = preferMonthFirst;
    this.matcher = new ReMatcher(this.rules.toArray(new String[0]));
    this.optimizeForReuseSimilarFormatted = optimizeForReuseSimilarFormatted;
}

/**
* Create an new DateParserBuilder which could be used for initialize DateParser.
* Create a new DateParserBuilder which can be used to initialize a DateParser.
*
* @return DateParserBuilder instance
*/
Expand Down Expand Up @@ -111,20 +118,67 @@ public OffsetDateTime parseOffsetDateTime(String str) {
* Execute datetime's parsing
*/
private void parse(final CharArray input) {
    // Optimization switched off, or abandoned after too many misses: always use the full rule set.
    if (!optimizeForReuseSimilarFormatted || encounteredErrorsCounter >= MAXIMUM_NUMBER_OF_ERRORS) {
        parse(input, matcher);
        return;
    }
    if (limitedRulesMatcher != null) {
        // Fast path: try the matcher that only knows the rules learned from an earlier input.
        try {
            parse(input, limitedRulesMatcher);
            return;
        } catch (DateTimeParseException e) {
            // The reduced rule set did not fit this input. Discard whatever was already written
            // to dt, count the miss, and fall through to the full rule set below.
            dt.reset();
            encounteredErrorsCounter++;
        }
    } else {
        learnLimitedRules(input);
    }
    // Either the fast path failed or the rules have just been learned: parse with every rule.
    parse(input, matcher);
}

/**
 * Runs the full matcher over {@code input} without building a date, records which rules matched,
 * and stores a matcher restricted to those rules in {@code limitedRulesMatcher}.
 * <p>
 * If the input cannot be matched completely (empty match or unmatched tail), the optimization is
 * disabled for this parser by pushing the error counter to its maximum; the caller's subsequent
 * full parse is then expected to report the actual error.
 */
private void learnLimitedRules(final CharArray input) {
    matcher.reset(input);
    final List<String> usedRules = new ArrayList<>();
    int offset = 0;
    int oldEnd = -1;
    while (matcher.find(offset)) {
        if (oldEnd == matcher.end()) {
            // Empty match: the input is not parseable, so nothing can be learned from it.
            encounteredErrorsCounter = MAXIMUM_NUMBER_OF_ERRORS;
            return;
        }
        final String usedRule = matcher.re();
        // Standard and customized rules are both kept; a rule hit several times is stored once.
        if (!usedRules.contains(usedRule)) {
            usedRules.add(usedRule);
        }
        offset = matcher.end();
        oldEnd = offset;
    }
    if (offset != input.length()) {
        // Part of the input stayed unmatched: give up on the optimization.
        encounteredErrorsCounter = MAXIMUM_NUMBER_OF_ERRORS;
        return;
    }
    // An empty input matches no rule; keep the slot empty so a later input can be learned from
    // instead of storing a matcher without rules.
    if (!usedRules.isEmpty()) {
        limitedRulesMatcher = new ReMatcher(usedRules.toArray(new String[0]));
    }
}

private void parse(final CharArray input, ReMatcher m) throws DateTimeParseException{
m.reset(input);
int offset = 0;
int oldEnd = -1;
while (matcher.find(offset)) {
if (oldEnd == matcher.end()) {
while (m.find(offset)) {
if (oldEnd == m.end()) {
throw error(offset, "empty matching at " + offset);
}
if (standardRules.contains(matcher.re())) {
this.parseStandard(input, offset);
if (standardRules.contains(m.re())) {
this.parseStandard(input, offset, m);
} else {
RuleHandler handler = customizedRuleMap.get(matcher.re());
handler.handle(input, matcher, dt);
RuleHandler handler = customizedRuleMap.get(m.re());
handler.handle(input, m, dt);
}
offset = matcher.end();
offset = m.end();
oldEnd = offset;
}
if (offset != input.length()) {
Expand All @@ -135,13 +189,13 @@ private void parse(final CharArray input) {
/**
* Parse datetime use standard rules.
*/
void parseStandard(CharArray input, int offset) {
for (int index = 1; index <= matcher.groupCount(); index++) {
final String groupName = matcher.groupName(index);
final int startOff = matcher.start(index);
final int endOff = matcher.end(index);
void parseStandard(CharArray input, int offset, ReMatcher m) {
for (int index = 1; index <= m.groupCount(); index++) {
final String groupName = m.groupName(index);
final int startOff = m.start(index);
final int endOff = m.end(index);
if (groupName == null) {
throw error(offset, "Hit invalid standard rule: " + matcher.re());
throw error(offset, "Hit invalid standard rule: " + m.re());
}
if (startOff == -1 && endOff == -1) {
continue;
Expand Down Expand Up @@ -226,13 +280,13 @@ void parseStandard(CharArray input, int offset) {
dt.ns = parseNum(input, endOff - 9, endOff);
break;
default:
throw error(offset, "Hit invalid standard rule: " + matcher.re());
throw error(offset, "Hit invalid standard rule: " + m.re());
}
}
}

/**
* Parse an subsequence which represent dd/mm or mm/dd, it should be more smart for different locales.
* Parse a subsequence which represents dd/mm or mm/dd; it should be smarter about different locales.
*/
void parseDayOrMonth(CharArray input, int from, int to) {
char next = input.data[from + 1];
Expand All @@ -257,7 +311,7 @@ void parseDayOrMonth(CharArray input, int from, int to) {
}

/**
* Parse an subsequence which represent year, like '2019', '19' etc
* Parse a subsequence which represents a year, like '2019', '19' etc.
*/
int parseYear(CharArray input, int from, int to) {
switch (to - from) {
Expand All @@ -274,7 +328,7 @@ int parseYear(CharArray input, int from, int to) {
}

/**
* Parse an subsequence which represent the offset of timezone, like '+0800', '+08', '+8:00', '+08:00' etc
* Parse a subsequence which represents the offset of a timezone, like '+0800', '+08', '+8:00', '+08:00' etc.
*/
int parseZoneOffset(CharArray input, int from, int to) {
boolean neg = input.data[from] == '-';
Expand All @@ -301,7 +355,7 @@ int parseZoneOffset(CharArray input, int from, int to) {
}

/**
* Parse an subsequence which suffix second, like '.2000', '.3186369', '.257000000' etc
* Parse a subsequence which is the fractional suffix of the second, like '.2000', '.3186369', '.257000000' etc.
* It should be treated as ms/us/ns.
*/
int parseNano(CharArray input, int from, int to) {
Expand All @@ -314,7 +368,7 @@ int parseNano(CharArray input, int from, int to) {
}

/**
* Parse an subsequence which represent week, like 'Monday', 'mon' etc
* Parse a subsequence which represents a day of the week, like 'Monday', 'mon' etc.
*/
int parseWeek(CharArray input, int from) {
switch (input.data[from]) {
Expand Down Expand Up @@ -345,7 +399,7 @@ int parseWeek(CharArray input, int from) {
}

/**
* Parse an subsequence which represent month, like '12', 'Feb' etc
* Parse a subsequence which represents a month, like '12', 'Feb' etc.
*/
int parseMonth(CharArray input, int from, int to) {
if (to - from <= 2) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ static synchronized void register(String re, RuleHandler handler) {
}

private boolean preferMonthFirst = false;
private boolean optimizeForReuseSimilarFormatted = false;

private final List<String> rules = new ArrayList<>();
private final Set<String> standardRules = new HashSet<>();
private final Map<String, RuleHandler> customizedRuleMap = new HashMap<>();
Expand All @@ -158,6 +160,18 @@ static synchronized void register(String re, RuleHandler handler) {
this.customizedRuleMap.putAll(DateParserBuilder.CUSTOMIZED_RULE_MAP);
}

/**
 * Switches the "many inputs, one format" optimization on or off.
 * <p>
 * Enable it when the parser will be fed a large number of date strings that all share the same
 * format, for example the timestamp column of a large CSV file.
 *
 * @param optimizeForReuseSimilarFormatted {@code true} to build a parser optimized for parsing many
 *                                         date strings in the same format
 * @return this builder
 */
public DateParserBuilder optimizeForReuseSimilarFormatted(boolean optimizeForReuseSimilarFormatted) {
    this.optimizeForReuseSimilarFormatted = optimizeForReuseSimilarFormatted;
    return this;
}

/**
* Mark this parser prefer mm/dd or not.
*
Expand Down Expand Up @@ -204,7 +218,7 @@ public DateParserBuilder addRule(String rule, RuleHandler handler) {
* @return DateParser
*/
public DateParser build() {
    // Hand every configured option, including the reuse optimization flag, to the parser.
    return new DateParser(rules, standardRules, customizedRuleMap, preferMonthFirst, optimizeForReuseSimilarFormatted);
}

}
22 changes: 22 additions & 0 deletions src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@

import java.time.*;
import java.util.Date;
import java.util.Random;
import java.util.TimeZone;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
* @author sulin
* @since 2019-09-14 16:48:50
Expand Down Expand Up @@ -127,4 +130,23 @@ public void testTimestamp() {
assert date.getTime() == Long.valueOf(timestamp);
}

/**
 * Checks that a parser built with {@code optimizeForReuseSimilarFormatted(true)} returns the same
 * result as a default parser for a large batch of inputs that all share one format.
 */
@Test
public void testOptimizeForReuseSimilarFormatted() {
    // Fixed seed keeps the generated inputs identical between runs.
    final Random random = new Random(123456789L);
    final String[] inputs = new String[500000];
    for (int i = 0; i < inputs.length; i++) {
        inputs[i] = String.format("2020-0%d-1%d 00:%d%d:00 UTC",
                random.nextInt(8) + 1,
                random.nextInt(8) + 1,
                random.nextInt(5),
                random.nextInt(9));
    }
    final DateParser regular = DateParser.newBuilder().build();
    final DateParser optimized = DateParser.newBuilder().optimizeForReuseSimilarFormatted(true).build();

    for (final String input : inputs) {
        // The message is supplied lazily so it is only built when an assertion actually fails.
        assertEquals(regular.parseDate(input), optimized.parseDate(input),
                () -> "optimized parser disagrees with regular parser for input: " + input);
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package com.github.sisyphsu.dateparser.benchmark;

import com.github.sisyphsu.dateparser.DateParser;
import org.openjdk.jmh.annotations.*;

import java.util.Random;
import java.util.concurrent.TimeUnit;

/**
 * JMH benchmark comparing a default {@link DateParser} with one built using
 * {@code optimizeForReuseSimilarFormatted(true)} on a large batch of inputs that share one format.
 * <p>
 * Each benchmark method builds a fresh parser, so parser construction is part of the measured time.
 */
@Warmup(iterations = 2, time = 2)
@BenchmarkMode(Mode.AverageTime)
@Fork(2)
@Measurement(iterations = 3, time = 3)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public class OptimizeForReuseSimilarFormattedBenchmark {
    private static final int TEXT_COUNT = 500000;
    private static final String[] TEXTS = generateTexts();

    /**
     * Builds the benchmark inputs from a fixed seed so every fork parses the same strings.
     */
    private static String[] generateTexts() {
        final Random random = new Random(123456789L);
        final String[] texts = new String[TEXT_COUNT];
        for (int i = 0; i < texts.length; i++) {
            texts[i] = String.format("2020-0%d-1%d 00:%d%d:00 UTC",
                    random.nextInt(8) + 1,
                    random.nextInt(8) + 1,
                    random.nextInt(5),
                    random.nextInt(9));
        }
        return texts;
    }

    /**
     * Parses every input with a parser that always uses the full rule set.
     *
     * @return the last parsed value, returned so JMH consumes it and the parsing work cannot be
     *         treated as dead code
     */
    @Benchmark
    public Object regularParser() {
        final DateParser parser = DateParser.newBuilder().build();
        Object last = null;
        for (String text : TEXTS) {
            last = parser.parseDate(text);
        }
        return last;
    }

    /**
     * Parses every input with a parser built with the reuse optimization enabled.
     *
     * @return the last parsed value, returned so JMH consumes it and the parsing work cannot be
     *         treated as dead code
     */
    @Benchmark
    public Object optimizedForReuseParser() {
        final DateParser parser = DateParser.newBuilder().optimizeForReuseSimilarFormatted(true).build();
        Object last = null;
        for (String text : TEXTS) {
            last = parser.parseDate(text);
        }
        return last;
    }
}