Skip to content

Commit 81d7b61

Browse files
Merge pull request #23 from intuit/linear-cost-optimization
Linear cost optimization
2 parents 6a81cf3 + 316f95f commit 81d7b61

38 files changed

+1049
-1436
lines changed

CHANGELOG.md

+21
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,27 @@ All notable changes to this project will be documented in this file.
44
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
55
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
66

7+
## 1.0.0 - 2020-03-25
8+
### Added
9+
- In Element ability to set MatchType (this replaces similarityMatchFunction)
10+
- In Element ability to set NeighborhoodRange for NEAREST_NEIGHBOR MatchType
11+
- New classes added - TokenRepo (replaces TokenMatch) and MatchType (replaces SimilarityMatchFunction)
12+
13+
### Removed
14+
- Document and Element classes no longer allow a ScoringFunction to be defined externally. It now defaults to
15+
SimpleAverage (in Element) and ExponentialWeightedAverage (in Document)
16+
- Element no longer allows similarityMatchFunction to be defined externally. It is replaced by MatchType
17+
- Element no longer allows matchOptimizerFunction to be defined externally. All of these changes allow the library to guarantee its performance
18+
- These classes are removed - TokenMatch, NGram, MatchOptimizerFunction, SimilarityMatchFunction
19+
20+
### Changed
21+
- Significant performance improvements along with reduced memory utilization
22+
- Soundex match is no longer a matching function; it is now a tokenization function instead, where encoded Soundex tokens are used.
23+
- Element is now generic, so its `value` has a generic type instead of Object.
24+
- ElementType of TEXT is matched by word equality instead of Soundex matching function by default
25+
- ElementTypes NUMBER and DATE are matched using the NEAREST_NEIGHBOR MatchType. This gives similar results, but they are
26+
controlled by NeighborhoodRange attribute defined in Element instead of Threshold
27+
728
## 0.4.4 - 2019-12-23
829
### Fixed
930
- Ability to configure scoring function in Element https://github.com/intuit/fuzzy-matcher/issues/19

CONTRIBUTING.md

+18
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,24 @@ Thanks for your interest in fuzzy-matcher.
66

77
Readme.md file gives a good overview of the architecture. Make sure to review the readme.
88

9+
## Building the Library
10+
### Prerequisite
11+
You need Java SDK v1.8 or higher. Before you begin, you should check your current Java installation by using the following command:
12+
``` java -version ```
13+
14+
fuzzy-matcher is compatible with Apache Maven 4.0 or above. If you do not already have Maven installed, you can follow the instructions at maven.apache.org.
15+
```
16+
On many operating systems, Maven can be installed with a package manager.
17+
If you use OSX Homebrew, try brew install maven.
18+
Ubuntu users can run sudo apt-get install maven.
19+
Windows users with Chocolatey can run choco install maven from an elevated (administrator) prompt.
20+
```
21+
### Compiling and installing locally
22+
After cloning the project locally, run this command to compile, test and install the project
23+
```
24+
mvn clean install
25+
```
26+
927
## Contributions
1028

1129
fuzzy-matcher welcomes contributions from everyone.

README.md

+188-182
Large diffs are not rendered by default.

fuzzy-match.png

-8.49 KB
Loading

perf.png

965 Bytes
Loading

pom.xml

-7
Original file line numberDiff line numberDiff line change
@@ -79,13 +79,6 @@
7979
<version>1.11</version>
8080
</dependency>
8181

82-
<!-- https://mvnrepository.com/artifact/com.googlecode.libphonenumber/libphonenumber -->
83-
<dependency>
84-
<groupId>com.googlecode.libphonenumber</groupId>
85-
<artifactId>libphonenumber</artifactId>
86-
<version>3.5</version>
87-
</dependency>
88-
8982
<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-core -->
9083
<dependency>
9184
<groupId>org.apache.lucene</groupId>

src/main/java/com/intuit/fuzzymatcher/component/DocumentMatch.java

+40-31
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,18 @@
88
import java.util.stream.Collectors;
99
import java.util.stream.Stream;
1010

11-
1211
/**
1312
* <p>
1413
* Starts the Matching process by element level matching and aggregates the results back
1514
* This uses the ScoringFunction defined at each Document to get the aggregated Document score for matched Elements
1615
*/
1716
public class DocumentMatch {
1817

19-
private static ElementMatch elementMatch = new ElementMatch();
18+
private final ElementMatch elementMatch;
19+
20+
public DocumentMatch() {
21+
this.elementMatch = new ElementMatch();
22+
}
2023

2124
/**
2225
* Executes matching of a document stream
@@ -25,39 +28,45 @@ public class DocumentMatch {
2528
* @return Stream of Match of Document type objects
2629
*/
2730
public Stream<Match<Document>> matchDocuments(Stream<Document> documents) {
28-
Stream<Element> elements = documents.flatMap(d -> d.getPreProcessedElement().stream());
29-
Map<ElementClassification, List<Element>> elementMap = elements.collect(Collectors.groupingBy(Element::getElementClassification));
3031

31-
List<Match<Element>> matchedElements = new ArrayList<>();
32-
elementMap.forEach((key, value) -> {
33-
List<Match<Element>> result = elementMatch.matchElements(key, value.parallelStream()).collect(Collectors.toList());
34-
matchedElements.addAll(result);
32+
Stream<Match<Document>> documentMatch = documents.flatMap(document -> {
33+
Set<Element> elements = document.getPreProcessedElement();
34+
Set<Match<Element>> eleMatches = elements.stream()
35+
.flatMap(element -> elementMatch.matchElement(element).stream())
36+
.collect(Collectors.toSet());
37+
return documentThresholdMatching(document, eleMatches);
3538
});
3639

37-
return rollupDocumentScore(matchedElements.parallelStream());
40+
return documentMatch;
3841
}
3942

40-
private Stream<Match<Document>> rollupDocumentScore(Stream<Match<Element>> matchElementStream) {
41-
42-
Map<Document, Map<Document, List<Match<Element>>>> groupBy = matchElementStream
43-
.collect(Collectors.groupingBy(matchElement -> matchElement.getData().getDocument(),
44-
Collectors.groupingBy(matchElement -> matchElement.getMatchedWith().getDocument())));
45-
46-
return groupBy.entrySet().parallelStream().flatMap(leftDocumentEntry ->
47-
leftDocumentEntry.getValue().entrySet()
48-
.parallelStream()
49-
.flatMap(rightDocumentEntry -> {
50-
List<Score> childScoreList = rightDocumentEntry.getValue()
51-
.stream()
52-
.map(d -> d.getScore())
53-
.collect(Collectors.toList());
54-
Match<Document> leftMatch = new Match<Document>(leftDocumentEntry.getKey(), rightDocumentEntry.getKey(), childScoreList);
55-
if (BooleanUtils.isNotFalse(rightDocumentEntry.getKey().isSource())) {
56-
Match<Document> rightMatch = new Match<Document>(rightDocumentEntry.getKey(), leftDocumentEntry.getKey(), childScoreList);
57-
return Stream.of(leftMatch, rightMatch);
58-
}
59-
return Stream.of(leftMatch);
60-
}))
61-
.filter(match -> match.getResult() > match.getData().getThreshold());
43+
private Stream<Match<Document>> documentThresholdMatching(Document document, Set<Match<Element>> matchingElements) {
44+
Map<Document, List<Match<Element>>> mathes = matchingElements.stream()
45+
.collect(Collectors.groupingBy(matchElement -> matchElement.getMatchedWith().getDocument()));
46+
47+
Stream<Match<Document>> result = mathes.entrySet().stream().flatMap(matchEntry -> {
48+
49+
List<Score> childScoreList = matchEntry.getValue()
50+
.stream()
51+
.map(d -> d.getScore())
52+
.collect(Collectors.toList());
53+
//System.out.println(Arrays.toString(childScoreList.toArray()));
54+
Match<Document> leftMatch = new Match<Document>(document, matchEntry.getKey(), childScoreList);
55+
56+
// Document match Found
57+
if (leftMatch.getScore().getResult() > leftMatch.getData().getThreshold()) {
58+
59+
if (BooleanUtils.isNotFalse(matchEntry.getKey().isSource())) {
60+
Match<Document> rightMatch = new Match<Document>(matchEntry.getKey(), document, childScoreList);
61+
return Stream.of(leftMatch, rightMatch);
62+
}
63+
return Stream.of(leftMatch);
64+
} else {
65+
return Stream.empty();
66+
}
67+
});
68+
69+
return result;
6270
}
71+
6372
}
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,55 @@
11
package com.intuit.fuzzymatcher.component;
22

3-
import com.intuit.fuzzymatcher.domain.*;
3+
import com.intuit.fuzzymatcher.domain.Element;
4+
import com.intuit.fuzzymatcher.domain.Match;
5+
import com.intuit.fuzzymatcher.domain.Token;
6+
import org.apache.commons.lang3.BooleanUtils;
47

5-
import java.util.List;
6-
import java.util.Map;
7-
import java.util.stream.Collectors;
8-
import java.util.stream.Stream;
8+
import java.util.*;
99

10-
/**
11-
* Matches at element level with aggregated results from token.
12-
* This uses the ScoringFunction defined at each element to get the aggregated Element score for matched tokens
13-
*/
1410
public class ElementMatch {
1511

16-
private static TokenMatch tokenMatch = new TokenMatch();
12+
private final TokenRepo tokenRepo;
1713

18-
public Stream<Match<Element>> matchElements(ElementClassification elementClassification, Stream<Element> elements) {
19-
Stream<Token> tokenStream = elements.flatMap(Element::getTokens);
20-
Stream<Match<Token>> matchedTokens = tokenMatch.matchTokens(elementClassification, tokenStream);
21-
return rollupElementScore(matchedTokens);
14+
public ElementMatch() {
15+
this.tokenRepo = new TokenRepo();
2216
}
2317

24-
private Stream<Match<Element>> rollupElementScore(Stream<Match<Token>> matchedTokenStream) {
18+
public Set<Match<Element>> matchElement(Element element) {
19+
Set<Match<Element>> matchElements = new HashSet<>();
20+
Map<Element, Integer> elementTokenScore = new HashMap<>();
2521

26-
Map<Element, Map<Element, List<Match<Token>>>> groupBy = matchedTokenStream
27-
.collect(Collectors.groupingBy((matchToken -> matchToken.getData().getElement()),
28-
Collectors.groupingBy(matchToken -> matchToken.getMatchedWith().getElement())));
22+
List<Token> tokens = element.getTokens();
23+
tokens.stream()
24+
.filter(token -> BooleanUtils.isNotFalse(element.getDocument().isSource()))
25+
.forEach(token -> {
26+
elementThresholdMatching(token, elementTokenScore, matchElements);
27+
});
2928

30-
return groupBy.entrySet().parallelStream().flatMap(leftElementEntry ->
31-
leftElementEntry.getValue().entrySet().parallelStream().map(rightElementEntry -> {
32-
List<Score> childScoreList = rightElementEntry.getValue()
33-
.stream().map(d -> d.getScore())
34-
.collect(Collectors.toList());
29+
tokens.forEach(token -> tokenRepo.put(token));
3530

36-
return new Match<Element>(leftElementEntry.getKey(), rightElementEntry.getKey(), childScoreList);
37-
}).filter(match -> match.getResult() > match.getData().getThreshold()));
31+
return matchElements;
3832
}
3933

40-
34+
private void elementThresholdMatching(Token token, Map<Element, Integer> elementTokenScore, Set<Match<Element>> matchingElements) {
35+
Set<Element> matchElements = tokenRepo.get(token);
36+
Element element = token.getElement();
37+
38+
// Token Match Found
39+
if (matchElements != null) {
40+
matchElements.forEach(matchElement -> {
41+
int score = elementTokenScore.getOrDefault(matchElement, 0) + 1;
42+
elementTokenScore.put(matchElement, score);
43+
// Element Score above threshold
44+
double elementScore = element.getScore(score, matchElement);
45+
46+
// Element match Found
47+
if (elementScore > element.getThreshold()) {
48+
Match<Element> elementMatch = new Match<>(element, matchElement, elementScore);
49+
matchingElements.remove(elementMatch);
50+
matchingElements.add(elementMatch);
51+
}
52+
});
53+
}
54+
}
4155
}

src/main/java/com/intuit/fuzzymatcher/component/MatchService.java

+19-14
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,6 @@
2020
*/
2121
public class MatchService {
2222

23-
private static DocumentMatch documentMatch = new DocumentMatch();
24-
2523
/**
2624
* Use this for De-duplication of data, where for a given list of documents it finds duplicates
2725
* Data is aggregated by a given Document
@@ -30,7 +28,8 @@ public class MatchService {
3028
* @return a map containing the grouping of each document and its corresponding matches
3129
*/
3230
public Map<Document, List<Match<Document>>> applyMatch(List<Document> documents) {
33-
return documentMatch.matchDocuments(documents.parallelStream())
31+
DocumentMatch documentMatch = new DocumentMatch();
32+
return documentMatch.matchDocuments(documents.stream())
3433
.collect(Collectors.groupingBy(Match::getData));
3534
}
3635

@@ -43,13 +42,15 @@ public Map<Document, List<Match<Document>>> applyMatch(List<Document> documents)
4342
* @return a map containing the grouping of each document and its corresponding matches
4443
*/
4544
public Map<Document, List<Match<Document>>> applyMatch(List<Document> documents, List<Document> matchWith) {
45+
DocumentMatch documentMatch = new DocumentMatch();
4646
return documentMatch.matchDocuments(Stream.concat(
47-
documents.parallelStream().map(document -> {
48-
document.setSource(true);
49-
return document;
50-
}), matchWith.parallelStream().map(document -> {
47+
matchWith.stream().map(document -> {
5148
document.setSource(false);
5249
return document;
50+
}),
51+
documents.stream().map(document -> {
52+
document.setSource(true);
53+
return document;
5354
})))
5455
.collect(Collectors.groupingBy(Match::getData));
5556
}
@@ -58,23 +59,25 @@ public Map<Document, List<Match<Document>>> applyMatch(List<Document> documents,
5859
* Use this to check duplicate for a new record, where it checks whether a new Document is a duplicate in existing list
5960
* Data is aggregated by a given Document
6061
*
61-
* @param document the document to match
62+
* @param document the document to match
6263
* @param matchWith the list of documents to match against
6364
* @return a map containing the grouping of each document and its corresponding matches
6465
*/
6566
public Map<Document, List<Match<Document>>> applyMatch(Document document, List<Document> matchWith) {
67+
DocumentMatch documentMatch = new DocumentMatch();
6668
return applyMatch(Arrays.asList(document), matchWith);
6769
}
6870

6971
/**
7072
* Use this to check duplicate for a new record, where it checks whether a new Document is a duplicate in existing list
7173
* Data is aggregated by a given Document Id
7274
*
73-
* @param document the document to match
75+
* @param document the document to match
7476
* @param matchWith the list of documents to match against
7577
* @return a map containing the grouping of each document id and its corresponding matches
7678
*/
7779
public Map<String, List<Match<Document>>> applyMatchByDocId(Document document, List<Document> matchWith) {
80+
DocumentMatch documentMatch = new DocumentMatch();
7881
return applyMatchByDocId(Arrays.asList(document), matchWith);
7982
}
8083

@@ -86,7 +89,8 @@ public Map<String, List<Match<Document>>> applyMatchByDocId(Document document, L
8689
* @return a map containing the grouping of each document id and its corresponding matches
8790
*/
8891
public Map<String, List<Match<Document>>> applyMatchByDocId(List<Document> documents) {
89-
return documentMatch.matchDocuments(documents.parallelStream())
92+
DocumentMatch documentMatch = new DocumentMatch();
93+
return documentMatch.matchDocuments(documents.stream())
9094
.collect(Collectors.groupingBy(match -> match.getData().getKey()));
9195
}
9296

@@ -99,13 +103,14 @@ public Map<String, List<Match<Document>>> applyMatchByDocId(List<Document> docum
99103
* @return a map containing the grouping of each document id and its corresponding matches
100104
*/
101105
public Map<String, List<Match<Document>>> applyMatchByDocId(List<Document> documents, List<Document> matchWith) {
106+
DocumentMatch documentMatch = new DocumentMatch();
102107
return documentMatch.matchDocuments(Stream.concat(
103-
documents.parallelStream().map(document -> {
104-
document.setSource(true);
105-
return document;
106-
}), matchWith.parallelStream().map(document -> {
108+
matchWith.stream().map(document -> {
107109
document.setSource(false);
108110
return document;
111+
}), documents.stream().map(document -> {
112+
document.setSource(true);
113+
return document;
109114
})))
110115
.collect(Collectors.groupingBy(match -> match.getData().getKey()));
111116
}

src/main/java/com/intuit/fuzzymatcher/component/TokenMatch.java

-21
This file was deleted.

0 commit comments

Comments
 (0)