-
Notifications
You must be signed in to change notification settings - Fork 1
/
embeddings-transformers.js
85 lines (69 loc) · 2.66 KB
/
embeddings-transformers.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import * as fs from 'fs';
import { pipeline } from '@xenova/transformers';
// Load the embeddings model once; the same extractor is reused for every chunk.
const extractor = await pipeline(
  'feature-extraction',
  'Xenova/bge-small-en-v1.5'
);

// Every { text, embedding } record from every transcript, in processing order.
const fullOutput = [];

/**
 * Embed every JSON transcript found in `transcripts/`.
 *
 * For each transcript, the `text` field is split with `calculateChunks`, each
 * chunk is embedded, and the resulting records are written to
 * `embeddings/<same file name>`. Once all transcripts are processed, the
 * combined records are written to `embeddings.json`.
 *
 * @returns {Promise<void>} Resolves once all files have been written.
 */
async function main() {
  // Only .json entries are transcripts; anything else in the directory
  // (dotfiles, subdirectories) would fail to read or parse, so skip it.
  const files = fs
    .readdirSync('transcripts')
    .filter((file) => file.endsWith('.json'));

  // writeFileSync does not create missing parent directories.
  fs.mkdirSync('embeddings', { recursive: true });

  for (const [index, file] of files.entries()) {
    const rawContents = fs.readFileSync(`transcripts/${file}`, 'utf-8');
    const { text } = JSON.parse(rawContents);

    // Calculate chunks based on this text
    const chunks = calculateChunks(text);

    // Extract embeddings for each chunk. This runs sequentially on purpose:
    // each call is awaited before the next chunk is submitted.
    const output = [];
    for (const chunk of chunks) {
      const embeddingOutput = await extractor(chunk, {
        pooling: 'mean',
        normalize: true,
      });
      // tolist() presumably yields one row per input; a single chunk is
      // passed, so the first row is taken as its embedding.
      const embedding = embeddingOutput.tolist()[0];
      const record = { text: chunk, embedding };
      output.push(record);
      fullOutput.push(record);
    }

    // Save the embeddings for this transcript to its own file
    const fileOut = `embeddings/${file}`;
    fs.writeFileSync(fileOut, JSON.stringify(output));
    console.log(
      `Embeddings saved for ${file} to ${fileOut} (${output.length} chunks) (${
        index + 1
      }/${files.length})`
    );
  }

  // Save the full output to a single file
  const fileOut = `embeddings.json`;
  fs.writeFileSync(fileOut, JSON.stringify(fullOutput));
  console.log(`Complete embeddings saved to ${fileOut}`);
}

// Awaited at top level so a failure rejects the module instead of being left
// on a floating promise.
await main();
/**
 * Split text into small chunks, preferring sentence boundaries.
 *
 * A chunk is closed at the first '.', '?' or '!' encountered once it holds at
 * least 100 characters. If it reaches 150 characters without such a boundary,
 * it is cut at its last space (or at its full length when it contains no
 * space) and whatever follows the cut starts the next chunk. Text still
 * pending when the input ends is emitted as a final chunk, so no input is
 * dropped. Chunks are trimmed, and chunks that are empty after trimming are
 * discarded.
 *
 * @param {string} text - Text to split.
 * @returns {string[]} Trimmed, non-empty chunks in input order. Returns an
 *   empty array when `text` is not a string.
 */
function calculateChunks(text) {
  const minChunkLength = 100;
  const maxChunkLength = 150;
  const sentenceEndings = new Set(['.', '?', '!']);

  const chunks = [];
  if (typeof text !== 'string') {
    return chunks;
  }

  // Store a chunk only if something is left after trimming.
  const pushChunk = (candidate) => {
    const trimmed = candidate.trim();
    if (trimmed.length > 0) {
      chunks.push(trimmed);
    }
  };

  let chunk = '';
  // Iterate by code point so a forced cut never splits a surrogate pair.
  for (const char of text) {
    chunk += char;

    // Long enough and sitting on a sentence boundary: close the chunk here.
    if (chunk.length >= minChunkLength && sentenceEndings.has(char)) {
      pushChunk(chunk);
      chunk = '';
      continue;
    }

    // Too long without a sentence boundary: cut at the last space, or at the
    // end of the chunk when there is no space to cut at.
    if (chunk.length >= maxChunkLength) {
      const lastSpace = chunk.lastIndexOf(' ');
      const cut = lastSpace === -1 ? chunk.length : lastSpace;
      pushChunk(chunk.slice(0, cut));
      chunk = chunk.slice(cut).trim();
    }
  }

  // Emit whatever is left; without this the tail of the text would be lost.
  pushChunk(chunk);
  return chunks;
}