const db = require('../db'); const embeddings = require('./embeddings'); /** * Split text into chunks using a sliding window. * Default: 500 chars per chunk, 50 char overlap. * Cap at 200 chunks per PDF. */ function chunkText(text, size = 500, overlap = 50) { if (!text || typeof text !== 'string') return []; const step = size - overlap; const chunks = []; for (let i = 0; i < text.length; i += step) { chunks.push(text.slice(i, i + size)); if (chunks.length >= 200) break; } return chunks; } /** * Cosine similarity between two Float32Arrays. * Returns a value in [-1, 1]. */ function cosineSimilarity(a, b) { if (a.length !== b.length) { throw new Error(`cosineSimilarity: length mismatch ${a.length} vs ${b.length}`); } let dot = 0; let normA = 0; let normB = 0; for (let i = 0; i < a.length; i++) { const ai = a[i]; const bi = b[i]; dot += ai * bi; normA += ai * ai; normB += bi * bi; } if (normA === 0 || normB === 0) return 0; return dot / (Math.sqrt(normA) * Math.sqrt(normB)); } /** * Re-export embed for clarity. */ async function embedQuery(text) { return embeddings.embed(text); } /** * Find top K most relevant chunks for a query vector. * @param {Float32Array} queryVec * @param {number[]} pdfIds * @param {number} k * @returns {Promise<{pdf_id, chunk_index, content, similarity}[]>} */ async function topK(queryVec, pdfIds, k = 3) { if (!pdfIds || pdfIds.length === 0) return []; const placeholders = pdfIds.map(() => '?').join(','); const rows = db.prepare( `SELECT pdf_id, chunk_index, vector, content FROM embeddings WHERE pdf_id IN (${placeholders})` ).all(...pdfIds); if (rows.length === 0) return []; const scored = rows.map((row) => { const buf = Buffer.from(row.vector); const chunkVec = new Float32Array(buf.buffer, buf.byteOffset, buf.byteLength / 4); const similarity = cosineSimilarity(queryVec, chunkVec); return { pdf_id: row.pdf_id, chunk_index: row.chunk_index, content: row.content, similarity, }; }); scored.sort((a, b) => b.similarity - a.similarity); return scored.slice(0, k); } module.exports = { chunkText, cosineSimilarity, embedQuery, topK, };