5 SDD batches archived: - Batch 1: UI Polish (10 features, 14 tasks) - Batch 2: Study System (8 features, 23 tasks) - Batch 3: Infrastructure (5 features, 22 tasks) - Batch 4: AI Advanced (5 features, 30 tasks) — RAG with @xenova/transformers - Batch 5: Core Features (5 features, 19 tasks) 37 bugs fixed from comprehensive code review (11 CRITICAL, 12 HIGH, 14 MEDIUM/LOW): - SSE streaming now works (event.token check) - API keys no longer exposed via GET /api/models - FTS5 injection sanitized - DB backup/restore with admin auth - Buddy mode wired (buddy_meta column) - Exam auto-submit stale closure fixed - CSS variables aligned with design tokens - Progress data corruption fixed - WebSocket protocol auto-detection - Tests infrastructure completed (vitest + node:test)
266 lines
8.8 KiB
JavaScript
266 lines
8.8 KiB
JavaScript
const express = require('express');
|
|
const multer = require('multer');
|
|
const path = require('path');
|
|
const fs = require('fs');
|
|
const db = require('../db');
|
|
const { chunkText } = require('../lib/rag');
|
|
const embeddings = require('../lib/embeddings');
|
|
const router = express.Router();
|
|
|
|
// Directory that receives uploaded files: `data/uploads`, resolved two levels
// above the directory containing this module.
const uploadDir = path.resolve(__dirname, '..', '..', 'data', 'uploads');

// Create the directory (and any missing parents) on first load.
fs.existsSync(uploadDir) || fs.mkdirSync(uploadDir, { recursive: true });
|
|
|
|
// Disk storage for multer: every file goes into `uploadDir` under a generated
// name made of the current timestamp, a random integer and the extension of
// the client-supplied file name.
const storage = multer.diskStorage({
  destination(req, file, cb) {
    return cb(null, uploadDir);
  },
  filename(req, file, cb) {
    const stamp = Date.now();
    const salt = Math.round(Math.random() * 1e9);
    const extension = path.extname(file.originalname);
    return cb(null, `${stamp}-${salt}${extension}`);
  },
});

// Upload middleware factory; individual files are capped at 50MB.
const upload = multer({
  storage,
  limits: { fileSize: 50 * 1024 * 1024 },
});
|
|
|
|
/**
 * Extract plain text from a PDF on disk using pdfjs-dist.
 *
 * Each page's text items are joined with single spaces, and pages are joined
 * with a `---` separator surrounded by blank lines.
 *
 * @param {string} filePath - Path of the PDF file to read.
 * @returns {Promise<{text: string, pages: number}>} Extracted text and page
 *   count. On any failure (library import, file read, parsing) the error is
 *   logged and `{ text: '', pages: 0 }` is returned instead of throwing.
 */
async function extractPDFText(filePath) {
  // Kept outside the try block so the finally clause can release it.
  let loadingTask = null;
  try {
    const pdfjsLib = await import('pdfjs-dist/legacy/build/pdf.mjs');
    // Read asynchronously: uploads may be large and this runs inside a request.
    const data = new Uint8Array(await fs.promises.readFile(filePath));
    loadingTask = pdfjsLib.getDocument({ data });
    const doc = await loadingTask.promise;
    const texts = [];
    for (let i = 1; i <= doc.numPages; i++) {
      const page = await doc.getPage(i);
      const textContent = await page.getTextContent();
      const pageText = textContent.items.map((item) => item.str).join(' ');
      texts.push(pageText);
    }
    return { text: texts.join('\n\n---\n\n'), pages: doc.numPages };
  } catch (err) {
    console.error('[pdfs] pdfjs extract error:', err.message);
    return { text: '', pages: 0 };
  } finally {
    // Release the pdf.js document/worker resources whether or not parsing
    // succeeded; a failure here must not mask the result computed above.
    if (loadingTask) {
      try {
        await loadingTask.destroy();
      } catch (destroyErr) {
        console.warn('[pdfs] pdfjs cleanup failed:', destroyErr.message);
      }
    }
  }
}
|
|
|
|
/**
 * Send a file to a vision-language model and ask for a markdown transcription.
 *
 * The file is base64-encoded into a `data:application/pdf` URL and POSTed to
 * `<endpoint>/chat/completions` (trailing slashes on the endpoint are removed)
 * with a bearer token.
 *
 * @param {string} filePath - Path of the file to send.
 * @param {{endpoint: string, api_key: string, model?: string}} vlmConfig -
 *   Connection settings; `model` falls back to 'glm-4.6v'.
 * @returns {Promise<string|null>} The first choice's message content, else the
 *   response's `text`, else its `markdown`, else ''. Returns null (after
 *   logging) when reading the file, the request, or a non-2xx status fails.
 */
async function vlmExtract(filePath, vlmConfig) {
  const instruction = 'Extract all text from this document as markdown. Preserve structure: headers, lists, paragraphs, tables. Be thorough.';

  try {
    const encoded = fs.readFileSync(filePath).toString('base64');
    const documentUrl = `data:${'application/pdf'};base64,${encoded}`;
    const apiRoot = vlmConfig.endpoint.replace(/\/+$/, '');

    const payload = {
      model: vlmConfig.model || 'glm-4.6v',
      messages: [
        {
          role: 'user',
          content: [
            { type: 'image_url', image_url: { url: documentUrl } },
            { type: 'text', text: instruction },
          ],
        },
      ],
      max_tokens: 4096,
    };

    const response = await fetch(`${apiRoot}/chat/completions`, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${vlmConfig.api_key}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify(payload),
    });

    if (!response.ok) {
      // Include at most 200 characters of the error body in the message.
      const detail = await response.text().catch(() => '');
      throw new Error(`VLM HTTP ${response.status}: ${detail.substring(0, 200)}`);
    }

    const parsed = await response.json();
    const chatContent = parsed.choices?.[0]?.message?.content;
    return chatContent || parsed.text || parsed.markdown || '';
  } catch (err) {
    console.error('[pdfs] VLM error:', err.message);
    return null;
  }
}
|
|
|
|
// POST /api/pdfs/upload
/**
 * Accept a single multipart file (field name `file`), extract its text, store
 * it in the `pdfs` table and generate embeddings for the extracted text.
 *
 * Extraction order:
 *   1. If `vlm_endpoint` and `vlm_api_key` are present in the `config` table,
 *      try `vlmExtract`.
 *   2. If that yields no non-blank text, fall back to `extractPDFText`.
 *
 * Responses:
 *   - 400 `{ error }` when no file was uploaded.
 *   - 201 with the inserted `pdfs` row plus `used_vlm` (true only when the
 *     stored text came from the VLM).
 *   - 500 `{ error }` on unexpected failure. If the failure happens before the
 *     `pdfs` row is inserted, the uploaded file is removed from disk again.
 *
 * Embedding failures are logged and do not fail the upload.
 */
router.post('/upload', upload.single('file'), async (req, res) => {
  if (!req.file) {
    return res.status(400).json({ error: 'No file uploaded' });
  }

  const filePath = req.file.path;
  // Becomes true once the pdfs row exists. From then on the row references the
  // file, so the error path must no longer delete it.
  let rowInserted = false;

  try {
    let text = '';
    let pages = 0;
    let usedVlm = false;

    // 1. Try VLM first
    const configs = db.prepare("SELECT key, value FROM config WHERE key IN ('vlm_endpoint','vlm_api_key','vlm_model')").all();
    const configValue = (key) => configs.find((c) => c.key === key)?.value;
    const vlmEndpoint = configValue('vlm_endpoint');
    const vlmApiKey = configValue('vlm_api_key');
    const vlmModel = configValue('vlm_model');

    if (vlmEndpoint && vlmApiKey) {
      const vlmConfig = {
        endpoint: vlmEndpoint,
        api_key: vlmApiKey,
        model: vlmModel || 'glm-4.6v',
      };
      text = await vlmExtract(filePath, vlmConfig);
      if (text) {
        usedVlm = true;
        // VLM succeeded — still get page count from PDF metadata
        let loadingTask = null;
        try {
          const pdfjsLib = await import('pdfjs-dist/legacy/build/pdf.mjs');
          const data = new Uint8Array(fs.readFileSync(filePath));
          loadingTask = pdfjsLib.getDocument({ data });
          const doc = await loadingTask.promise;
          pages = doc.numPages;
        } catch {
          /* keep pages = 0 if metadata read fails */
        } finally {
          // Release pdf.js resources; the document is only needed for numPages.
          if (loadingTask) {
            try {
              await loadingTask.destroy();
            } catch (destroyErr) {
              console.warn('[pdfs] pdfjs cleanup failed:', destroyErr.message);
            }
          }
        }
      }
    }

    // 2. Fallback to pdfjs-dist
    if (!text || text.trim().length === 0) {
      const extracted = await extractPDFText(filePath);
      text = extracted.text;
      pages = extracted.pages;
      // The stored text no longer comes from the VLM (it may have returned
      // only whitespace), so do not report it as such.
      usedVlm = false;
    }

    const maxReorder = db.prepare('SELECT MAX(reorder_index) as maxIdx FROM pdfs').get();
    // MAX() is NULL on an empty table; start at 0 in that case.
    const reorderIndex = (maxReorder?.maxIdx ?? -1) + 1;

    const info = db.prepare(`
      INSERT INTO pdfs (filename, original_name, content_markdown, pages, reorder_index)
      VALUES (?, ?, ?, ?, ?)
    `).run(req.file.filename, req.file.originalname, text || '', pages || 0, reorderIndex);
    rowInserted = true;

    const pdfId = info.lastInsertRowid;

    // Generate embeddings for the PDF text
    if (text && text.trim().length > 0) {
      try {
        const chunks = chunkText(text, 500, 50);
        const vectors = await embeddings.embedBatch(chunks);
        // Prepare once instead of once per chunk.
        const insertEmbedding = db.prepare(
          'INSERT INTO embeddings (pdf_id, chunk_index, vector, content) VALUES (?, ?, ?, ?)'
        );
        for (let i = 0; i < chunks.length; i++) {
          const vec = vectors[i];
          // Copy only this vector's bytes. If `vec` is a typed-array view into
          // a larger shared buffer, `vec.buffer` alone would cover the whole
          // backing store rather than this one vector.
          // NOTE(review): assumes embedBatch returns typed arrays — confirm.
          const blob = Buffer.from(vec.buffer, vec.byteOffset, vec.byteLength);
          insertEmbedding.run(pdfId, i, blob, chunks[i]);
        }
      } catch (embErr) {
        console.warn('[pdfs] embedding generation failed:', embErr.message);
      }
    }

    const row = db.prepare('SELECT * FROM pdfs WHERE id = ?').get(pdfId);
    res.status(201).json({ ...row, used_vlm: usedVlm });
  } catch (err) {
    console.error('[pdfs] upload error:', err.message);
    if (!rowInserted) {
      // Nothing references the stored file yet; do not leave it orphaned.
      try {
        fs.unlinkSync(filePath);
      } catch (unlinkErr) {
        console.warn('[pdfs] could not remove failed upload:', unlinkErr.message);
      }
    }
    res.status(500).json({ error: err.message });
  }
});
|
|
|
|
// GET /api/pdfs — list all with metadata + reorder_index
// Responds with every pdfs row (selected columns only), ordered by
// reorder_index and then id; responds 500 with the error message on failure.
router.get('/', (req, res) => {
  const listQuery = 'SELECT id, filename, original_name, content_markdown, created_at, reorder_index, pages FROM pdfs ORDER BY reorder_index, id';

  try {
    const pdfs = db.prepare(listQuery).all();
    res.json(pdfs);
  } catch (err) {
    console.error('[pdfs] list error:', err.message);
    res.status(500).json({ error: err.message });
  }
});
|
|
|
|
// GET /api/pdfs/:id — full markdown
// 400 when :id does not parse as an integer, 404 when no row matches,
// otherwise the complete pdfs row; 500 with the error message on failure.
router.get('/:id', (req, res) => {
  const pdfId = parseInt(req.params.id, 10);
  if (Number.isNaN(pdfId)) {
    return res.status(400).json({ error: 'Invalid pdf id' });
  }

  try {
    const pdf = db.prepare('SELECT * FROM pdfs WHERE id = ?').get(pdfId);
    return pdf
      ? res.json(pdf)
      : res.status(404).json({ error: 'PDF not found' });
  } catch (err) {
    console.error('[pdfs] get error:', err.message);
    return res.status(500).json({ error: err.message });
  }
});
|
|
|
|
// PUT /api/pdfs/:id/reorder — update reorder_index
/**
 * Set the `reorder_index` of one PDF.
 *
 * Request body: `{ reorder_index: number }`. Numeric strings are accepted and
 * converted to numbers.
 *
 * Responses:
 *   - 400 when :id does not parse as an integer, when `reorder_index` is
 *     missing, or when it is not a finite number.
 *   - 404 when no row was updated.
 *   - 200 `{ id, reorder_index }` on success.
 *   - 500 `{ error }` on database failure.
 */
router.put('/:id/reorder', (req, res) => {
  const id = parseInt(req.params.id, 10);
  if (Number.isNaN(id)) {
    return res.status(400).json({ error: 'Invalid pdf id' });
  }

  // req.body may be undefined when no body was sent or parsed; treat that the
  // same as a missing field instead of throwing while reading the property.
  const rawIndex = req.body?.reorder_index;
  if (rawIndex === undefined) {
    return res.status(400).json({ error: 'reorder_index is required' });
  }

  // Convert non-blank strings so clients that send "3" keep working; reject
  // everything that is not a finite number (null, booleans, objects, NaN, ...)
  // before it reaches the database.
  const reorderIndex =
    typeof rawIndex === 'string' && rawIndex.trim() !== '' ? Number(rawIndex) : rawIndex;
  if (typeof reorderIndex !== 'number' || !Number.isFinite(reorderIndex)) {
    return res.status(400).json({ error: 'reorder_index must be a finite number' });
  }

  try {
    const info = db.prepare('UPDATE pdfs SET reorder_index = ? WHERE id = ?').run(reorderIndex, id);
    if (info.changes === 0) {
      return res.status(404).json({ error: 'PDF not found' });
    }
    res.json({ id, reorder_index: reorderIndex });
  } catch (err) {
    console.error('[pdfs] reorder error:', err.message);
    res.status(500).json({ error: err.message });
  }
});
|
|
|
|
// DELETE /api/pdfs/:id
/**
 * Delete one PDF: its embeddings, its `pdfs` row, and its file in `uploadDir`.
 *
 * Database rows are removed before the file so that a database failure cannot
 * leave a row pointing at a file that has already been deleted. Removing the
 * file is best-effort: a failure there is logged (unless the file was already
 * gone) and does not fail the request.
 *
 * Responses:
 *   - 400 when :id does not parse as an integer.
 *   - 404 when no such row exists.
 *   - 200 `{ deleted: true }` on success.
 *   - 500 `{ error }` on database failure.
 */
router.delete('/:id', (req, res) => {
  const id = parseInt(req.params.id, 10);
  if (Number.isNaN(id)) {
    return res.status(400).json({ error: 'Invalid pdf id' });
  }

  try {
    const row = db.prepare('SELECT filename FROM pdfs WHERE id = ?').get(id);
    if (!row) {
      return res.status(404).json({ error: 'PDF not found' });
    }

    // Delete embeddings first, then the pdfs row
    // (presumably embeddings.pdf_id references pdfs.id — verify against schema).
    db.prepare('DELETE FROM embeddings WHERE pdf_id = ?').run(id);
    db.prepare('DELETE FROM pdfs WHERE id = ?').run(id);

    // Delete file from disk only after the rows are gone.
    const filePath = path.join(uploadDir, row.filename);
    try {
      fs.unlinkSync(filePath);
    } catch (unlinkErr) {
      // A file that is already missing is fine; report anything else.
      if (unlinkErr.code !== 'ENOENT') {
        console.warn('[pdfs] could not remove file:', unlinkErr.message);
      }
    }

    res.json({ deleted: true });
  } catch (err) {
    console.error('[pdfs] delete error:', err.message);
    res.status(500).json({ error: err.message });
  }
});
|
|
|
|
// GET /api/pdfs/:id/embeddings — list embeddings for a PDF (debug)
// 400 when :id does not parse as an integer, 404 when the PDF does not exist.
// Otherwise returns the PDF id, the number of chunks and the chunk rows
// (without the vector column) ordered by chunk_index.
router.get('/:id/embeddings', (req, res) => {
  const pdfId = parseInt(req.params.id, 10);
  if (Number.isNaN(pdfId)) {
    return res.status(400).json({ error: 'Invalid pdf id' });
  }

  try {
    const exists = db.prepare('SELECT id FROM pdfs WHERE id = ?').get(pdfId);
    if (!exists) {
      return res.status(404).json({ error: 'PDF not found' });
    }

    const chunkQuery = db.prepare('SELECT id, pdf_id, chunk_index, content, created_at FROM embeddings WHERE pdf_id = ? ORDER BY chunk_index');
    const chunks = chunkQuery.all(pdfId);
    return res.json({ pdf_id: pdfId, count: chunks.length, chunks });
  } catch (err) {
    console.error('[pdfs] embeddings error:', err.message);
    return res.status(500).json({ error: err.message });
  }
});
|
|
|
|
// Export the router carrying the upload, list, get, reorder, delete and
// embeddings routes defined in this module.
module.exports = router;
|