Skip to content

Commit

Permalink
refactor: Improve string comparison robustness
Browse files Browse the repository at this point in the history
* Fix undefined token inclusion caused by token lists with differing lengths
* Always compare longer to shorter string so sameness parameter order is invariant
* Add comments to make logic easier to understand
* Add tests to test new functionality
  • Loading branch information
FoxxMD committed Jan 2, 2024
1 parent 064a435 commit f126bf0
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 18 deletions.
1 change: 0 additions & 1 deletion src/backend/tests/scrobbler/scrobblers.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,6 @@ describe('Detects duplicate and unique scrobbles from client recent history', fu

const sonDiffPlay = clone(son);
sonDiffPlay.data.playDate = sonDiffPlay.data.playDate.subtract(son.data.duration + 1, 's');
sonDiffPlay.data.artists = [sonDiffPlay.data.artists[1]]
assert.isTrue(await testScrobbler.alreadyScrobbled(sonDiffPlay));
});

Expand Down
21 changes: 21 additions & 0 deletions src/backend/tests/strings.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,4 +73,25 @@ describe('String Comparisons', function () {
assert.isAtMost( result.highScore, 99, `Comparing: '${test[0]}' | '${test[1]}'`);
}
});

it('should handle strings with different lengths', async function () {
const tests = [
['The Amazing Bongo Hop', 'The Bongo Hop'],
]

for(const test of tests) {
const result = compareNormalizedStrings(test[0], test[1]);
assert.isAtMost( result.highScore, 53, `Comparing: '${test[0]}' | '${test[1]}'`);
}
});

it('should be string parameter order invariant', async function () {
const longerString = 'Nidia Gongora TEST';
const shorterString = 'Nidia Gongora'

const result1 = compareNormalizedStrings(longerString, shorterString);
const result2 = compareNormalizedStrings(shorterString, longerString);

assert.equal( result1.highScore, result2.highScore, `Comparing: '${longerString}' | '${shorterString}'`);
});
});
76 changes: 59 additions & 17 deletions src/backend/utils/StringUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -167,23 +167,55 @@ export const compareScrobbleArtists = (existing: PlayObject, candidate: PlayObje
return compareNormalizedStrings(existingArtists.reduce((acc, curr) => `${acc} ${curr}`, ''), candidateArtists.reduce((acc, curr) => `${acc} ${curr}`, '')).highScore;
}

/**
* Compare the sameness of two strings after making them token-order independent
*
* Transform two strings before comparing in order to have as little difference between them as possible:
*
* * First, normalize (lower case, remove extraneous whitespace, remove punctuation, make all characters standard ANSI) strings and split into tokens
* * Second, reorder tokens in the shorter list so that they mirror order of tokens in longer list as closely as possible
* * Finally, concat back to strings and compare with sameness strategies
*
* */
export const compareNormalizedStrings = (existing: string, candidate: string): StringSamenessResult => {

// there may be scenarios where a track differs in *ordering* of ancillary information between sources
// EX My Track (feat. Art1, Art2) -- My Track (feat. Art2 Art1)

// first remove lower case, extraneous whitespace, punctuation, and replace non-ansi with ansi characters
const normalExisting = normalizeStr(existing, {keepSingleWhitespace: true});
const normalCandidate = normalizeStr(candidate, {keepSingleWhitespace: true});

// there may be scenarios where a track differs in *ordering* of ancillary information between sources
// EX My Track (feat. Art1, Art2) -- My Track (feat. Art2 Art1)
// so instead of naively comparing the entire track string against the candidate we
// * first try to match up all white-space separated tokens
// * recombine with closest tokens in order
// * then check sameness
// split by "token"
const eTokens = normalExisting.split(' ');
const cTokens = normalCandidate.split(' ');

const orderedCandidateTokens = eTokens.reduce((acc: { ordered: string[], remaining: string[] }, curr) => {

let longerTokens: string[],
shorterTokens: string[];

if (eTokens.length > cTokens.length) {
longerTokens = eTokens;
shorterTokens = cTokens;
} else {
longerTokens = cTokens;
shorterTokens = eTokens;
}

// we will use longest string (token list) as the reducer and order the shorter list to match it
// so we don't have to deal with undefined positions in the shorter list

const orderedCandidateTokens = longerTokens.reduce((acc: { ordered: string[], remaining: string[] }, curr) => {
// if we've run out of tokens in the shorter list just return
if (acc.remaining.length === 0) {
return acc;
}

// on each iteration of tokens in the long list
// we iterate through remaining tokens from the shorter list and find the token with the most sameness

let highScore = 0;
let highIndex = undefined;
let highIndex = 0;
let index = 0;
for (const token of acc.remaining) {
const result = stringSameness(curr, token);
Expand All @@ -194,18 +226,28 @@ export const compareNormalizedStrings = (existing: string, candidate: string): S
index++;
}

// then remove the most same token from the remaining short list tokens
const splicedRemaining = [...acc.remaining];
splicedRemaining.splice(highIndex, 1);

return {ordered: acc.ordered.concat(acc.remaining[highIndex]), remaining: splicedRemaining};
}, {ordered: [], remaining: cTokens});

const allOrderedCandidateTokens = orderedCandidateTokens.ordered.concat(orderedCandidateTokens.remaining);
const orderedCandidateString = allOrderedCandidateTokens.join(' ');

// since we have already "matched" up words by order we don't want to use cosine strat
return {
// finally add the most same token to the ordered short list
ordered: acc.ordered.concat(acc.remaining[highIndex]),
// and return the remaining short list tokens
remaining: splicedRemaining
};
}, {
// "ordered" is the result of ordering tokens in the shorter list to match longer token order
ordered: [],
// remaining is the initial shorter list
remaining: shorterTokens
});

// since we have already "matched" up tokens by order we don't want to use cosine strat
// bc it only does comparisons between whole words in a sentence (instead of all letters in a string)
// which makes it inaccurate for small-n sentences and typos

return stringSameness(normalExisting, orderedCandidateString, {transforms: [], strategies: [levenStrategy, diceStrategy]});
return stringSameness(longerTokens.join(' '), orderedCandidateTokens.ordered.join(' '), {
transforms: [],
strategies: [levenStrategy, diceStrategy]
})
}

0 comments on commit f126bf0

Please sign in to comment.