-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathembeddings-transformers.js
45 lines (35 loc) · 1.46 KB
/
embeddings-transformers.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import * as fs from 'fs';
import { pipeline } from '@huggingface/transformers';
// Progress handler passed to pipeline(): rewrites the current terminal line
// with the rounded value of `event.progress` whenever that field is truthy.
const reportProgress = (event) => {
  if (!event.progress) return;
  const percent = Math.round(event.progress);
  process.stdout.write(`\r${percent}% loaded`);
};
// Create the feature-extraction pipeline used to embed each chunk
const extractor = await pipeline('feature-extraction', 'mixedbread-ai/mxbai-embed-large-v1', {
  progress_callback: reportProgress,
});
// Load the source text from disk
console.log('\nReading source file...');
const raw = fs.readFileSync('data/p5.txt', 'utf-8');
// Build the chunk list: split on runs of newlines, trim surrounding
// whitespace from each piece, and keep only the pieces that are non-empty.
const chunks = [];
for (const piece of raw.split(/\n+/)) {
  const trimmed = piece.trim();
  if (trimmed !== '') {
    chunks.push(trimmed);
  }
}
console.log(`Total chunks to process: ${chunks.length}`);
// Announce the start of embedding generation
console.log('Starting embedding generation...');
// Accumulates one { text, embedding } record per chunk; serialized to JSON later
const outputJSON = { embeddings: [] };
// Embed the chunks one at a time, in their original order
const total = chunks.length;
for (const [index, text] of chunks.entries()) {
  // Overwrite the same terminal line with a 1-based progress counter
  process.stdout.write(`\rProcessing ${index + 1}/${total}`);
  // Run the extractor on this chunk with the 'cls' pooling option
  const tensor = await extractor(text, { pooling: 'cls' });
  // Keep the first entry of the list form of the output
  // (presumably the single row for the single input string — confirm)
  const rows = tensor.tolist();
  const embedding = rows[0];
  // Store the chunk text alongside its embedding
  outputJSON.embeddings.push({ text, embedding });
}
// Write the embeddings to a JSON file
const dirOut = 'embeddings';
const fileOut = `${dirOut}/p5-embeddings-tf.json`;
// Make sure the output directory exists before writing. Without this,
// writeFileSync throws ENOENT when the directory is missing and every
// embedding computed so far is lost. With `recursive: true` the call does
// nothing if the directory is already there.
fs.mkdirSync(dirOut, { recursive: true });
fs.writeFileSync(fileOut, JSON.stringify(outputJSON));
console.log(`\nEmbeddings saved to ${fileOut}`);