/**
 * Express router for PDF documents: upload + text extraction (VLM first,
 * pdfjs-dist as fallback), embedding generation, listing, reordering and
 * deletion.
 */
const express = require('express');
const multer = require('multer');
const path = require('path');
const fs = require('fs');
const db = require('../db');
const { chunkText } = require('../lib/rag');
const embeddings = require('../lib/embeddings');

const router = express.Router();

/** Upload size limit handed to multer (50MB). */
const MAX_UPLOAD_BYTES = 50 * 1024 * 1024;

/** Model name used when the `vlm_model` config row is missing or empty. */
const DEFAULT_VLM_MODEL = 'glm-4.6v';

/** Directory where multer stores uploaded files. */
const uploadDir = path.resolve(__dirname, '..', '..', 'data', 'uploads');

// `recursive: true` makes this a no-op when the directory already exists,
// so no separate existence check is needed.
fs.mkdirSync(uploadDir, { recursive: true });

const storage = multer.diskStorage({
  destination: (req, file, cb) => cb(null, uploadDir),
  filename: (req, file, cb) => {
    // Timestamp + random suffix keeps stored names distinct; only the
    // extension of the client-supplied name is reused.
    const unique = `${Date.now()}-${Math.round(Math.random() * 1e9)}`;
    cb(null, unique + path.extname(file.originalname));
  },
});

const upload = multer({ storage, limits: { fileSize: MAX_UPLOAD_BYTES } });

/**
 * Returns true when `value` is a string containing non-whitespace characters.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function hasText(value) {
  return typeof value === 'string' && value.trim().length > 0;
}

/**
 * Logs a route failure and answers with HTTP 500 and the error message.
 *
 * @param {import('express').Response} res
 * @param {string} label - Short route label used in the log prefix.
 * @param {Error} err
 */
function respondWithError(res, label, err) {
  console.error(`[pdfs] ${label} error:`, err.message);
  res.status(500).json({ error: err.message });
}

/**
 * Opens a PDF with pdfjs-dist, runs `fn` on the loaded document and always
 * destroys the loading task afterwards so its resources are released.
 *
 * @template T
 * @param {string} filePath - Path of the PDF on disk.
 * @param {(doc: object) => Promise<T> | T} fn - Receives the loaded document.
 * @returns {Promise<T>} Whatever `fn` returns.
 * @throws If the module cannot be imported, the file cannot be read, the
 *   document cannot be parsed, or `fn` throws.
 */
async function withPdfDocument(filePath, fn) {
  const pdfjsLib = await import('pdfjs-dist/legacy/build/pdf.mjs');
  // Copy into a fresh Uint8Array, as the original code did, rather than
  // handing pdfjs the Buffer returned by the read.
  const data = new Uint8Array(await fs.promises.readFile(filePath));
  const loadingTask = pdfjsLib.getDocument({ data });
  try {
    const doc = await loadingTask.promise;
    return await fn(doc);
  } finally {
    try {
      await loadingTask.destroy();
    } catch (destroyErr) {
      // Cleanup is best-effort; never mask the primary result or error.
      console.warn('[pdfs] pdfjs cleanup failed:', destroyErr.message);
    }
  }
}

/**
 * Extracts the text of every page of a PDF using pdfjs-dist.
 *
 * Page texts are joined with a `---` separator line. Never throws: on any
 * failure the error is logged and an empty result is returned.
 *
 * @param {string} filePath - Path of the PDF on disk.
 * @returns {Promise<{ text: string, pages: number }>}
 */
async function extractPDFText(filePath) {
  try {
    return await withPdfDocument(filePath, async (doc) => {
      const texts = [];
      for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
        const page = await doc.getPage(pageNo);
        const textContent = await page.getTextContent();
        texts.push(textContent.items.map((item) => item.str).join(' '));
      }
      return { text: texts.join('\n\n---\n\n'), pages: doc.numPages };
    });
  } catch (err) {
    console.error('[pdfs] pdfjs extract error:', err.message);
    return { text: '', pages: 0 };
  }
}

/**
 * Reads only the page count of a PDF.
 *
 * @param {string} filePath - Path of the PDF on disk.
 * @returns {Promise<number>} Number of pages, or 0 when the PDF cannot be read.
 */
async function getPdfPageCount(filePath) {
  try {
    return await withPdfDocument(filePath, (doc) => doc.numPages);
  } catch {
    // Page count is optional metadata; keep 0 if it cannot be determined.
    return 0;
  }
}

/**
 * Sends the PDF (base64 data URL) to a chat-completions style VLM endpoint
 * and asks for a markdown transcription.
 *
 * @param {string} filePath - Path of the PDF on disk.
 * @param {{ endpoint: string, api_key: string, model?: string }} vlmConfig
 * @returns {Promise<string|null>} Extracted text (possibly empty), or `null`
 *   when the request fails; failures are logged, never thrown.
 */
async function vlmExtract(filePath, vlmConfig) {
  try {
    const buffer = await fs.promises.readFile(filePath);
    const base64 = buffer.toString('base64');
    const mimeType = 'application/pdf';
    // Strip trailing slashes so the joined URL has exactly one separator.
    const baseUrl = vlmConfig.endpoint.replace(/\/+$/, '');

    const resp = await fetch(`${baseUrl}/chat/completions`, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${vlmConfig.api_key}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        model: vlmConfig.model || DEFAULT_VLM_MODEL,
        messages: [{
          role: 'user',
          content: [
            { type: 'image_url', image_url: { url: `data:${mimeType};base64,${base64}` } },
            { type: 'text', text: 'Extract all text from this document as markdown. Preserve structure: headers, lists, paragraphs, tables. Be thorough.' },
          ],
        }],
        max_tokens: 4096,
      }),
    });

    if (!resp.ok) {
      const errText = await resp.text().catch(() => '');
      throw new Error(`VLM HTTP ${resp.status}: ${errText.substring(0, 200)}`);
    }

    const data = await resp.json();
    const content = data.choices?.[0]?.message?.content || data.text || data.markdown || '';
    // Callers treat the result as a string (they call `.trim()` on it), so a
    // structured/non-string payload is reported as "no text".
    return typeof content === 'string' ? content : '';
  } catch (err) {
    console.error('[pdfs] VLM error:', err.message);
    return null;
  }
}

/**
 * Loads the VLM settings from the `config` table.
 *
 * @returns {{ endpoint: string, api_key: string, model: string } | null}
 *   `null` when the endpoint or API key is not configured.
 */
function loadVlmConfig() {
  const configs = db.prepare("SELECT key, value FROM config WHERE key IN ('vlm_endpoint','vlm_api_key','vlm_model')").all();
  const valueOf = (key) => configs.find((c) => c.key === key)?.value;

  const endpoint = valueOf('vlm_endpoint');
  const apiKey = valueOf('vlm_api_key');
  if (!endpoint || !apiKey) {
    return null;
  }
  return { endpoint, api_key: apiKey, model: valueOf('vlm_model') || DEFAULT_VLM_MODEL };
}

/**
 * Converts an embedding vector into a Buffer for storage.
 *
 * Honors `byteOffset`/`byteLength` so a typed-array view over a larger
 * ArrayBuffer stores only its own bytes.
 * NOTE(review): assumes `embedBatch` yields typed arrays (e.g. Float32Array) —
 * confirm against `../lib/embeddings`.
 *
 * @param {ArrayBufferView} vec
 * @returns {Buffer}
 */
function vectorToBlob(vec) {
  return Buffer.from(vec.buffer, vec.byteOffset, vec.byteLength);
}

/**
 * Chunks `text`, embeds the chunks and stores one `embeddings` row per chunk.
 *
 * @param {number|bigint} pdfId - Id of the owning `pdfs` row.
 * @param {string} text - Extracted document text.
 * @returns {Promise<void>}
 * @throws If chunking, embedding or any insert fails.
 */
async function storeEmbeddings(pdfId, text) {
  // NOTE(review): 500 / 50 are presumably chunk size and overlap — confirm
  // against `chunkText` in `../lib/rag`.
  const chunks = chunkText(text, 500, 50);
  const vectors = await embeddings.embedBatch(chunks);

  // Prepare once and reuse for every chunk.
  const insert = db.prepare(
    'INSERT INTO embeddings (pdf_id, chunk_index, vector, content) VALUES (?, ?, ?, ?)'
  );
  for (let i = 0; i < chunks.length; i++) {
    insert.run(pdfId, i, vectorToBlob(vectors[i]), chunks[i]);
  }
}

/**
 * Best-effort removal of a file from disk. A missing file is not an error;
 * any other failure is logged and otherwise ignored.
 *
 * @param {string} filePath
 * @returns {Promise<void>}
 */
async function removeFileQuietly(filePath) {
  try {
    await fs.promises.unlink(filePath);
  } catch (err) {
    if (err.code !== 'ENOENT') {
      console.warn('[pdfs] file cleanup failed:', err.message);
    }
  }
}

// POST /api/pdfs/upload
router.post('/upload', upload.single('file'), async (req, res) => {
  if (!req.file) {
    return res.status(400).json({ error: 'No file uploaded' });
  }

  const filePath = req.file.path;
  let rowInserted = false;

  try {
    let text = '';
    let pages = 0;
    let usedVlm = false;

    // 1. Try VLM first
    const vlmConfig = loadVlmConfig();
    if (vlmConfig) {
      const vlmText = await vlmExtract(filePath, vlmConfig);
      // Only count the VLM as used when it produced real text; blank output
      // falls through to pdfjs and must not be reported as `used_vlm`.
      if (hasText(vlmText)) {
        text = vlmText;
        usedVlm = true;
        // VLM succeeded — still get page count from PDF metadata
        pages = await getPdfPageCount(filePath);
      }
    }

    // 2. Fallback to pdfjs-dist
    if (!usedVlm) {
      const extracted = await extractPDFText(filePath);
      text = extracted.text;
      pages = extracted.pages;
    }

    // New documents are appended after the current highest reorder_index.
    const maxReorder = db.prepare('SELECT MAX(reorder_index) as maxIdx FROM pdfs').get();
    const reorderIndex = (maxReorder?.maxIdx ?? -1) + 1;

    const info = db.prepare(`
      INSERT INTO pdfs (filename, original_name, content_markdown, pages, reorder_index)
      VALUES (?, ?, ?, ?, ?)
    `).run(req.file.filename, req.file.originalname, text || '', pages || 0, reorderIndex);
    rowInserted = true;

    const pdfId = info.lastInsertRowid;

    // Generate embeddings for the PDF text. Failure here is non-fatal: the
    // document is still stored and returned.
    if (hasText(text)) {
      try {
        await storeEmbeddings(pdfId, text);
      } catch (embErr) {
        console.warn('[pdfs] embedding generation failed:', embErr.message);
      }
    }

    const row = db.prepare('SELECT * FROM pdfs WHERE id = ?').get(pdfId);
    res.status(201).json({ ...row, used_vlm: usedVlm });
  } catch (err) {
    // Without a DB row nothing references the uploaded file, so remove it
    // instead of leaving an orphan in the upload directory.
    if (!rowInserted) {
      await removeFileQuietly(filePath);
    }
    respondWithError(res, 'upload', err);
  }
});

// GET /api/pdfs — list all with metadata + reorder_index
router.get('/', (req, res) => {
  try {
    const rows = db.prepare('SELECT id, filename, original_name, content_markdown, created_at, reorder_index, pages FROM pdfs ORDER BY reorder_index, id').all();
    res.json(rows);
  } catch (err) {
    respondWithError(res, 'list', err);
  }
});

// GET /api/pdfs/:id — full markdown
router.get('/:id', (req, res) => {
  const id = parseInt(req.params.id, 10);
  if (Number.isNaN(id)) {
    return res.status(400).json({ error: 'Invalid pdf id' });
  }

  try {
    const row = db.prepare('SELECT * FROM pdfs WHERE id = ?').get(id);
    if (!row) {
      return res.status(404).json({ error: 'PDF not found' });
    }
    res.json(row);
  } catch (err) {
    respondWithError(res, 'get', err);
  }
});

// PUT /api/pdfs/:id/reorder — update reorder_index
router.put('/:id/reorder', (req, res) => {
  const id = parseInt(req.params.id, 10);
  if (Number.isNaN(id)) {
    return res.status(400).json({ error: 'Invalid pdf id' });
  }

  // `req.body` is undefined when no body parser ran; treat that as missing.
  const { reorder_index } = req.body ?? {};
  if (reorder_index === undefined) {
    return res.status(400).json({ error: 'reorder_index is required' });
  }

  // Accept integers and numeric strings; reject null, objects, fractions and
  // anything else that would otherwise reach the database unchecked.
  const reorderIndex =
    typeof reorder_index === 'string' && reorder_index.trim() !== ''
      ? Number(reorder_index)
      : reorder_index;
  if (!Number.isInteger(reorderIndex)) {
    return res.status(400).json({ error: 'reorder_index must be an integer' });
  }

  try {
    const info = db.prepare('UPDATE pdfs SET reorder_index = ? WHERE id = ?').run(reorderIndex, id);
    if (info.changes === 0) {
      return res.status(404).json({ error: 'PDF not found' });
    }
    res.json({ id, reorder_index: reorderIndex });
  } catch (err) {
    respondWithError(res, 'reorder', err);
  }
});

// DELETE /api/pdfs/:id
router.delete('/:id', (req, res) => {
  const id = parseInt(req.params.id, 10);
  if (Number.isNaN(id)) {
    return res.status(400).json({ error: 'Invalid pdf id' });
  }

  try {
    const row = db.prepare('SELECT filename FROM pdfs WHERE id = ?').get(id);
    if (!row) {
      return res.status(404).json({ error: 'PDF not found' });
    }

    // Remove the database rows first (embeddings before the pdf they belong
    // to). If this throws, the file is still on disk and the row stays valid.
    db.prepare('DELETE FROM embeddings WHERE pdf_id = ?').run(id);
    db.prepare('DELETE FROM pdfs WHERE id = ?').run(id);

    // Then delete the file from disk. Best-effort: a missing file is fine,
    // other failures are logged but do not fail the request.
    try {
      fs.unlinkSync(path.join(uploadDir, row.filename));
    } catch (unlinkErr) {
      if (unlinkErr.code !== 'ENOENT') {
        console.warn('[pdfs] file delete failed:', unlinkErr.message);
      }
    }

    res.json({ deleted: true });
  } catch (err) {
    respondWithError(res, 'delete', err);
  }
});

// GET /api/pdfs/:id/embeddings — list embeddings for a PDF (debug)
router.get('/:id/embeddings', (req, res) => {
  const id = parseInt(req.params.id, 10);
  if (Number.isNaN(id)) {
    return res.status(400).json({ error: 'Invalid pdf id' });
  }

  try {
    const pdf = db.prepare('SELECT id FROM pdfs WHERE id = ?').get(id);
    if (!pdf) {
      return res.status(404).json({ error: 'PDF not found' });
    }

    const rows = db
      .prepare('SELECT id, pdf_id, chunk_index, content, created_at FROM embeddings WHERE pdf_id = ? ORDER BY chunk_index')
      .all(id);
    res.json({ pdf_id: id, count: rows.length, chunks: rows });
  } catch (err) {
    respondWithError(res, 'embeddings', err);
  }
});

module.exports = router;