Commit

Deduplicate tags
tedspare committed Nov 8, 2024
1 parent ccf3d10 commit ba4f543
Showing 3 changed files with 47 additions and 9 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -1,3 +1,4 @@
- [2024-11-08] [Deduplicate tags](https://github.com/RubricLab/memory/commit/f310bf6ca22d794764fe470e54687025be81137b)
- [2024-10-28] [Loosen requirement on tag match](https://github.com/RubricLab/memory/commit/691545e6a012c6f0a7263dbc4d2ceff9837dc3fd)
- [2024-10-24] [add notify-monorepo action](https://github.com/RubricLab/memory/commit/53d286bb680f8785ebd63806f5bc2a4514ec9bbc)
- [2024-10-22] [Bun x kysely codegen](https://github.com/RubricLab/memory/commit/d0dfa75f9bc720454301e751d91bf8410e7d60f5)
2 changes: 1 addition & 1 deletion package.json
@@ -2,7 +2,7 @@
"name": "@rubriclab/memory",
"module": "src/index.ts",
"main": "src/index.ts",
"version": "0.0.44",
"version": "0.0.45",
"private": false,
"type": "module",
"devDependencies": {
53 changes: 45 additions & 8 deletions src/index.ts
@@ -220,21 +220,59 @@ export class Memory {
console.log(`extract completed: ${(performance.now() - start).toFixed(2)}ms`)

const uniqueTags = Array.from(new Set(tags))

const similarTagSearchRes = await Promise.all(uniqueTags.map(tag => this.search(tag, { userId })))
const similarTags = similarTagSearchRes.flat()

console.log(`search completed: ${(performance.now() - start).toFixed(2)}ms`)

const uniqueSimilarTagIds = Array.from(new Set(similarTags.map(s => s.tagId)))
const similarTags = similarTagSearchRes.flat()
console.log('similarTags', similarTags)

const uniqueSimilarTagIds = Array.from(new Set(similarTags.map(s => s.tagId)))

// Add duplicate detection
const {
object: { duplicates }
} = await generateObject({
model: openai(this.model),
schema: z.object({
duplicates: z.array(
z.object({
newTag: z.string(),
existingTag: z.string(),
confidence: z.number().min(0).max(1)
})
)
}),
prompt: clean`Please identify any duplicate tags from the following lists, accounting for variations in spelling, nicknames, or formatting.
Only mark as duplicates if you are highly confident they refer to the same entity.
Existing tags:
${similarTags.map(s => s.tagBody).join('\n')}
New tags:
${uniqueTags.join('\n')}
Return pairs of duplicates with a confidence score.
A confidence of 0.9+ means very likely match (e.g., "NYC" vs "New York City")`
})

const CONFIDENCE_THRESHOLD = 0.5

// Filter out duplicates, keeping existing tags when there's a match
const netNewTags = uniqueTags.filter(
t => !similarTags.some(s => s?.tagBody.toLowerCase() === t.toLowerCase())
t =>
!similarTags.some(s => s.tagBody.toLowerCase() === t.toLowerCase()) && // exact match check
!duplicates.some(d => d.newTag === t && d.confidence >= CONFIDENCE_THRESHOLD) // AI-detected duplicate check
)
console.log('netNewTags', netNewTags)

const tagsInserted = await this.insert({ tags: netNewTags }, { userId })
// Replace any remaining new tags with their existing versions if they're duplicates
const finalTags = netNewTags.map(tag => {
const duplicate = duplicates.find(d => d.newTag === tag && d.confidence >= CONFIDENCE_THRESHOLD)
return duplicate ? duplicate.existingTag : tag
})

console.log('netNewTags', finalTags)

const tagsInserted = await this.insert({ tags: finalTags }, { userId })
const netNewTagIds = tagsInserted.map(t => t.id)

console.log(`insert completed: ${(performance.now() - start).toFixed(2)}ms`)
@@ -268,7 +306,6 @@ export class Memory
)
.default([])
}),
// TODO: this prompt/schema is not evalling well
prompt: clean`Given the following facts and some new information, please identify any existing facts that have been proven wrong by the new information.
You should only delete facts that have been overwritten by the new facts.
This means it is common to not delete anything.
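For orientation, here is a minimal standalone sketch of the deduplication flow this commit introduces, assuming the Vercel AI SDK's generateObject with @ai-sdk/openai and zod as used in the diff. The dedupeTags function, its searchSimilar parameter, and the model name are illustrative stand-ins for the Memory class's own search method and configured model, not the repo's actual API:

import { openai } from '@ai-sdk/openai'
import { generateObject } from 'ai'
import { z } from 'zod'

const CONFIDENCE_THRESHOLD = 0.5

type ExistingTag = { tagId: string; tagBody: string }

// Illustrative sketch: `searchSimilar` stands in for Memory.search(), and the
// model name is a placeholder for the instance's configured model.
export async function dedupeTags(
	newTags: string[],
	searchSimilar: (tag: string) => Promise<ExistingTag[]>
): Promise<string[]> {
	const uniqueTags = Array.from(new Set(newTags))

	// Gather existing tags that look similar to each incoming tag.
	const similarTags = (await Promise.all(uniqueTags.map(searchSimilar))).flat()

	// Ask the model for (newTag, existingTag) pairs it believes are duplicates.
	const {
		object: { duplicates }
	} = await generateObject({
		model: openai('gpt-4o-mini'),
		schema: z.object({
			duplicates: z.array(
				z.object({
					newTag: z.string(),
					existingTag: z.string(),
					confidence: z.number().min(0).max(1)
				})
			)
		}),
		prompt: `Identify duplicate tags across the two lists, accounting for spelling, nicknames, and formatting.
Existing tags:
${similarTags.map(s => s.tagBody).join('\n')}
New tags:
${uniqueTags.join('\n')}
Return pairs of duplicates with a confidence score between 0 and 1.`
	})

	// Keep only tags that are neither exact matches nor high-confidence duplicates.
	return uniqueTags.filter(
		t =>
			!similarTags.some(s => s.tagBody.toLowerCase() === t.toLowerCase()) &&
			!duplicates.some(d => d.newTag === t && d.confidence >= CONFIDENCE_THRESHOLD)
	)
}

In the commit itself, the surviving tags are then passed to this.insert({ tags: finalTags }, { userId }) and their ids are used downstream.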
