feat: implement 33 nice-to-have features + fix 37 code review bugs
5 SDD batches archived:
- Batch 1: UI Polish (10 features, 14 tasks)
- Batch 2: Study System (8 features, 23 tasks)
- Batch 3: Infrastructure (5 features, 22 tasks)
- Batch 4: AI Advanced (5 features, 30 tasks) — RAG with @xenova/transformers
- Batch 5: Core Features (5 features, 19 tasks)

37 bugs fixed from comprehensive code review (11 CRITICAL, 12 HIGH, 14 MEDIUM/LOW):
- SSE streaming now works (event.token check)
- API keys no longer exposed via GET /api/models
- FTS5 injection sanitized
- DB backup/restore with admin auth
- Buddy mode wired (buddy_meta column)
- Exam auto-submit stale closure fixed
- CSS variables aligned with design tokens
- Progress data corruption fixed
- WebSocket protocol auto-detection
- Test infrastructure completed (vitest + node:test)
This commit is contained in:
@@ -3,6 +3,8 @@ const multer = require('multer');
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
const db = require('../db');
|
||||
const { chunkText } = require('../lib/rag');
|
||||
const embeddings = require('../lib/embeddings');
|
||||
const router = express.Router();
|
||||
|
||||
const uploadDir = path.resolve(__dirname, '..', '..', 'data', 'uploads');
|
||||
@@ -45,7 +47,8 @@ async function vlmExtract(filePath, vlmConfig) {
|
||||
const base64 = buffer.toString('base64');
|
||||
const mimeType = 'application/pdf';
|
||||
|
||||
const resp = await fetch(`${vlmConfig.endpoint}/chat/completions`, {
|
||||
const baseUrl = vlmConfig.endpoint.replace(/\/+$/, '');
|
||||
const resp = await fetch(`${baseUrl}/chat/completions`, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Authorization': `Bearer ${vlmConfig.api_key}`,
|
||||
@@ -90,9 +93,10 @@ router.post('/upload', upload.single('file'), async (req, res) => {
|
||||
let usedVlm = false;
|
||||
|
||||
// 1. Try VLM first
|
||||
const vlmEndpoint = db.prepare("SELECT value FROM config WHERE key = 'vlm_endpoint'").get();
|
||||
const vlmApiKey = db.prepare("SELECT value FROM config WHERE key = 'vlm_api_key'").get();
|
||||
const vlmModel = db.prepare("SELECT value FROM config WHERE key = 'vlm_model'").get();
|
||||
const configs = db.prepare("SELECT key, value FROM config WHERE key IN ('vlm_endpoint','vlm_api_key','vlm_model')").all();
|
||||
const vlmEndpoint = configs.find(c => c.key === 'vlm_endpoint');
|
||||
const vlmApiKey = configs.find(c => c.key === 'vlm_api_key');
|
||||
const vlmModel = configs.find(c => c.key === 'vlm_model');
|
||||
|
||||
if (vlmEndpoint?.value && vlmApiKey?.value) {
|
||||
const vlmConfig = {
|
||||
@@ -101,7 +105,16 @@ router.post('/upload', upload.single('file'), async (req, res) => {
|
||||
model: vlmModel?.value || 'glm-4.6v',
|
||||
};
|
||||
text = await vlmExtract(filePath, vlmConfig);
|
||||
if (text) usedVlm = true;
|
||||
if (text) {
|
||||
usedVlm = true;
|
||||
// VLM succeeded — still get page count from PDF metadata
|
||||
try {
|
||||
const pdfjsLib = await import('pdfjs-dist/legacy/build/pdf.mjs');
|
||||
const data = new Uint8Array(fs.readFileSync(filePath));
|
||||
const doc = await pdfjsLib.getDocument({ data }).promise;
|
||||
pages = doc.numPages;
|
||||
} catch { /* keep pages = 0 if metadata read fails */ }
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Fallback to pdfjs-dist
|
||||
@@ -119,7 +132,26 @@ router.post('/upload', upload.single('file'), async (req, res) => {
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
`).run(req.file.filename, req.file.originalname, text || '', pages || 0, reorderIndex);
|
||||
|
||||
const row = db.prepare('SELECT * FROM pdfs WHERE id = ?').get(info.lastInsertRowid);
|
||||
const pdfId = info.lastInsertRowid;
|
||||
|
||||
// Generate embeddings for the PDF text
|
||||
if (text && text.trim().length > 0) {
|
||||
try {
|
||||
const chunks = chunkText(text, 500, 50);
|
||||
const vectors = await embeddings.embedBatch(chunks);
|
||||
for (let i = 0; i < chunks.length; i++) {
|
||||
const vec = vectors[i];
|
||||
const blob = Buffer.from(vec.buffer);
|
||||
db.prepare(
|
||||
'INSERT INTO embeddings (pdf_id, chunk_index, vector, content) VALUES (?, ?, ?, ?)'
|
||||
).run(pdfId, i, blob, chunks[i]);
|
||||
}
|
||||
} catch (embErr) {
|
||||
console.warn('[pdfs] embedding generation failed:', embErr.message);
|
||||
}
|
||||
}
|
||||
|
||||
const row = db.prepare('SELECT * FROM pdfs WHERE id = ?').get(pdfId);
|
||||
res.status(201).json({ ...row, used_vlm: usedVlm });
|
||||
} catch (err) {
|
||||
console.error('[pdfs] upload error:', err.message);
|
||||
@@ -196,10 +228,10 @@ router.delete('/:id', (req, res) => {
|
||||
|
||||
// Delete file from disk
|
||||
const filePath = path.join(uploadDir, row.filename);
|
||||
if (fs.existsSync(filePath)) {
|
||||
fs.unlinkSync(filePath);
|
||||
}
|
||||
try { fs.unlinkSync(filePath); } catch {}
|
||||
|
||||
// Delete embeddings first
|
||||
db.prepare('DELETE FROM embeddings WHERE pdf_id = ?').run(id);
|
||||
db.prepare('DELETE FROM pdfs WHERE id = ?').run(id);
|
||||
res.json({ deleted: true });
|
||||
} catch (err) {
|
||||
@@ -208,4 +240,26 @@ router.delete('/:id', (req, res) => {
|
||||
}
|
||||
});
|
||||
|
||||
// GET /api/pdfs/:id/embeddings — debug listing of the embedding chunks stored for one PDF.
// Responds 400 when :id does not parse as an integer, 404 when no pdfs row has that id,
// and 500 (with the error message) when a query throws. The vector column is not selected,
// so only chunk metadata and text are returned.
router.get('/:id/embeddings', (req, res) => {
  const pdfId = parseInt(req.params.id, 10);
  if (Number.isNaN(pdfId)) {
    return res.status(400).json({ error: 'Invalid pdf id' });
  }

  try {
    // Confirm the parent PDF exists so an unknown id yields 404 rather than an empty list.
    const existing = db.prepare('SELECT id FROM pdfs WHERE id = ?').get(pdfId);
    if (!existing) {
      return res.status(404).json({ error: 'PDF not found' });
    }

    const listChunks = db.prepare(
      'SELECT id, pdf_id, chunk_index, content, created_at FROM embeddings WHERE pdf_id = ? ORDER BY chunk_index'
    );
    const chunks = listChunks.all(pdfId);

    res.json({ pdf_id: pdfId, count: chunks.length, chunks });
  } catch (err) {
    console.error('[pdfs] embeddings error:', err.message);
    res.status(500).json({ error: err.message });
  }
});
|
||||
|
||||
module.exports = router;
|
||||
|
||||
Reference in New Issue
Block a user