5 SDD batches archived:
- Batch 1: UI Polish (10 features, 14 tasks)
- Batch 2: Study System (8 features, 23 tasks)
- Batch 3: Infrastructure (5 features, 22 tasks)
- Batch 4: AI Advanced (5 features, 30 tasks) — RAG with @xenova/transformers
- Batch 5: Core Features (5 features, 19 tasks)

37 bugs fixed from comprehensive code review (11 CRITICAL, 12 HIGH, 14 MEDIUM/LOW):
- SSE streaming now works (event.token check)
- API keys no longer exposed via GET /api/models
- FTS5 injection sanitized
- DB backup/restore with admin auth
- Buddy mode wired (buddy_meta column)
- Exam auto-submit stale closure fixed
- CSS variables aligned with design tokens
- Progress data corruption fixed
- WebSocket protocol auto-detection
- Tests infrastructure completed (vitest + node:test)
88 lines
2.2 KiB
JavaScript
88 lines
2.2 KiB
JavaScript
const db = require('../db');
|
|
const embeddings = require('./embeddings');
|
|
|
|
/**
 * Split text into overlapping chunks using a sliding window.
 *
 * The window starts at offset 0 and advances by `size - overlap` code units
 * per chunk. Chunking stops as soon as a window reaches the end of the text,
 * so no trailing chunk consisting solely of already-covered overlap is emitted.
 *
 * @param {string} text - Text to split. Empty or non-string input yields `[]`.
 * @param {number} [size=500] - Maximum chunk length in UTF-16 code units; must be > 0.
 * @param {number} [overlap=50] - Code units shared by consecutive chunks;
 *   must satisfy `0 <= overlap < size` so the window always moves forward.
 * @param {number} [maxChunks=200] - Upper bound on the number of chunks
 *   returned; any text past the last permitted chunk is dropped.
 * @returns {string[]} The chunks, in document order.
 * @throws {RangeError} If `size`, `overlap` or `maxChunks` is out of range.
 */
function chunkText(text, size = 500, overlap = 50, maxChunks = 200) {
  if (!text || typeof text !== 'string') return [];

  if (typeof size !== 'number' || !(size > 0)) {
    throw new RangeError(`chunkText: size must be a positive number, got ${size}`);
  }
  // overlap >= size would give a zero or negative step: the window would never
  // advance and the output would be duplicated or garbage chunks.
  if (typeof overlap !== 'number' || !(overlap >= 0) || !(overlap < size)) {
    throw new RangeError(`chunkText: overlap must satisfy 0 <= overlap < size, got ${overlap}`);
  }
  if (typeof maxChunks !== 'number' || !(maxChunks >= 0)) {
    throw new RangeError(`chunkText: maxChunks must be a non-negative number, got ${maxChunks}`);
  }

  const step = size - overlap;
  const chunks = [];
  for (let start = 0; start < text.length && chunks.length < maxChunks; start += step) {
    chunks.push(text.slice(start, start + size));
    // This window already covers the tail of the text; a further chunk would
    // contain nothing but the overlap region.
    if (start + size >= text.length) break;
  }
  return chunks;
}
|
|
|
|
/**
 * Cosine similarity between two equal-length numeric vectors.
 *
 * Accepts any indexable numeric sequence with a `length` (e.g. Float32Array
 * or a plain array of numbers).
 *
 * @param {ArrayLike<number>} a - First vector.
 * @param {ArrayLike<number>} b - Second vector; must have the same length as `a`.
 * @returns {number} Similarity in [-1, 1]. Returns 0 when either vector has
 *   zero magnitude (including two empty vectors). NaN components propagate
 *   to a NaN result.
 * @throws {Error} If the two vectors differ in length.
 */
function cosineSimilarity(a, b) {
  if (a.length !== b.length) {
    throw new Error(`cosineSimilarity: length mismatch ${a.length} vs ${b.length}`);
  }

  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    const ai = a[i];
    const bi = b[i];
    dot += ai * bi;
    normA += ai * ai;
    normB += bi * bi;
  }

  // A zero vector has no direction; report "no similarity" instead of 0/0 = NaN.
  if (normA === 0 || normB === 0) return 0;

  const similarity = dot / (Math.sqrt(normA) * Math.sqrt(normB));
  // Floating-point rounding can push (anti-)parallel vectors a hair outside
  // [-1, 1]; clamp so the documented range holds (e.g. for a later Math.acos).
  return Math.max(-1, Math.min(1, similarity));
}
|
|
|
|
/**
 * Embed a query string.
 *
 * Thin wrapper that forwards `text` unchanged to `embeddings.embed` and
 * resolves with whatever that call produces.
 *
 * @param {string} text - Query text to embed.
 * @returns {Promise<*>} Result of `embeddings.embed(text)` — presumably a
 *   Float32Array usable as `queryVec` for `topK`; confirm against ./embeddings.
 */
async function embedQuery(text) {
  const embedded = await embeddings.embed(text);
  return embedded;
}
|
|
|
|
/**
 * Find the K stored chunks most similar to a query vector.
 *
 * Loads every row of the `embeddings` table whose `pdf_id` is in `pdfIds`,
 * scores each stored vector against `queryVec` with `cosineSimilarity`, and
 * returns the best `k` rows ordered from most to least similar.
 *
 * @param {Float32Array} queryVec - Query embedding; must have the same
 *   dimension as the stored vectors, otherwise `cosineSimilarity` throws.
 * @param {number[]} pdfIds - PDF ids to search. Missing or empty yields `[]`.
 * @param {number} [k=3] - Maximum number of results. Zero, negative or NaN
 *   yields `[]` without querying the database.
 * @returns {Promise<{pdf_id, chunk_index, content, similarity}[]>}
 */
async function topK(queryVec, pdfIds, k = 3) {
  if (!pdfIds || pdfIds.length === 0) return [];
  // Without this guard a negative k reaches slice(0, k), which returns
  // "all but the last |k|" rows rather than nothing.
  if (!(k > 0)) return [];

  // One bound parameter per id keeps the IN (...) list parameterized.
  const placeholders = pdfIds.map(() => '?').join(',');
  const rows = db.prepare(
    `SELECT pdf_id, chunk_index, vector, content FROM embeddings WHERE pdf_id IN (${placeholders})`
  ).all(...pdfIds);

  if (rows.length === 0) return [];

  const scored = rows.map((row) => {
    // Reinterpret the stored bytes as 32-bit floats.
    // NOTE(review): assumes row.vector is a BLOB of raw Float32 data in
    // platform byte order — confirm against the code that writes embeddings.
    const buf = Buffer.from(row.vector);
    const chunkVec = new Float32Array(
      buf.buffer,
      buf.byteOffset,
      buf.byteLength / Float32Array.BYTES_PER_ELEMENT
    );
    const similarity = cosineSimilarity(queryVec, chunkVec);
    return {
      pdf_id: row.pdf_id,
      chunk_index: row.chunk_index,
      content: row.content,
      similarity,
    };
  });

  // Highest similarity first.
  scored.sort((a, b) => b.similarity - a.similarity);
  return scored.slice(0, k);
}
|
|
|
|
// Public API of this module: text chunking, vector similarity, query embedding
// and top-K chunk retrieval.
module.exports = { chunkText, cosineSimilarity, embedQuery, topK };
|