Files
studyos/server/routes/pdfs.js
2026-06-08 16:53:18 -03:00

212 lines
6.7 KiB
JavaScript

const express = require('express');
const multer = require('multer');
const path = require('path');
const fs = require('fs');
const db = require('../db');
const router = express.Router();
// Absolute path of the upload directory: two levels above this file, then data/uploads.
const uploadDir = path.resolve(__dirname, '../../data/uploads');
// Create the directory (and any missing parents) at module load if it is not there yet.
if (fs.existsSync(uploadDir) === false) {
  fs.mkdirSync(uploadDir, { recursive: true });
}
// Multer disk storage: every upload is written into uploadDir under a generated name.
const storage = multer.diskStorage({
  destination(req, file, cb) {
    cb(null, uploadDir);
  },
  filename(req, file, cb) {
    // "<epoch millis>-<random integer in 0..1e9>" followed by the original file extension.
    const stamp = Date.now();
    const salt = Math.round(Math.random() * 1e9);
    const extension = path.extname(file.originalname);
    cb(null, `${stamp}-${salt}${extension}`);
  },
});
// Upload middleware factory; files above 50MB are rejected by multer's size limit.
const upload = multer({
  storage,
  limits: { fileSize: 50 * 1024 * 1024 },
});
/**
 * Extract the plain text of a PDF with pdfjs-dist.
 *
 * The text items of each page are joined with single spaces, and pages are
 * separated by a "---" rule surrounded by blank lines.
 *
 * @param {string} filePath - Path of the PDF file on disk.
 * @returns {Promise<{text: string, pages: number}>} Extracted text and page
 *   count. Never rejects: on any failure (module load, file read, parse) the
 *   error is logged and `{ text: '', pages: 0 }` is returned.
 */
async function extractPDFText(filePath) {
  // Kept outside the try block so the finally clause can release it.
  let loadingTask = null;
  try {
    const pdfjsLib = await import('pdfjs-dist/legacy/build/pdf.mjs');
    // Read asynchronously so a large upload does not block the event loop.
    const data = new Uint8Array(await fs.promises.readFile(filePath));
    loadingTask = pdfjsLib.getDocument({ data });
    const doc = await loadingTask.promise;
    const texts = [];
    // pdf.js page numbers are 1-based.
    for (let i = 1; i <= doc.numPages; i++) {
      const page = await doc.getPage(i);
      const textContent = await page.getTextContent();
      const pageText = textContent.items.map((item) => item.str).join(' ');
      texts.push(pageText);
    }
    return { text: texts.join('\n\n---\n\n'), pages: doc.numPages };
  } catch (err) {
    console.error('[pdfs] pdfjs extract error:', err.message);
    return { text: '', pages: 0 };
  } finally {
    // Release the document and its worker; without this every call leaks them.
    if (loadingTask) {
      try {
        await loadingTask.destroy();
      } catch (destroyErr) {
        // Cleanup is best-effort: the extraction result is already decided.
        console.error('[pdfs] pdfjs cleanup error:', destroyErr.message);
      }
    }
  }
}
/**
 * Ask a vision-language model to transcribe a PDF into markdown.
 *
 * The file is sent base64-encoded as a data URL to `<endpoint>/chat/completions`
 * together with a fixed extraction prompt.
 *
 * @param {string} filePath - Path of the PDF file on disk.
 * @param {{endpoint: string, api_key: string, model?: string}} vlmConfig -
 *   Base URL of the service (a trailing slash is tolerated), bearer token, and
 *   optional model name (defaults to 'glm-4.6v').
 * @returns {Promise<string|null>} The extracted markdown ('' when the response
 *   carries no recognised text field), or null when the request fails. Never
 *   rejects; failures are logged.
 */
async function vlmExtract(filePath, vlmConfig) {
  try {
    // Read asynchronously so a large upload does not block the event loop.
    const buffer = await fs.promises.readFile(filePath);
    const base64 = buffer.toString('base64');
    const mimeType = 'application/pdf';
    // Strip trailing slashes so an endpoint configured as ".../v1/" does not
    // produce ".../v1//chat/completions".
    const baseUrl = String(vlmConfig.endpoint).replace(/\/+$/, '');
    const resp = await fetch(`${baseUrl}/chat/completions`, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${vlmConfig.api_key}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        model: vlmConfig.model || 'glm-4.6v',
        messages: [{
          role: 'user',
          content: [
            { type: 'image_url', image_url: { url: `data:${mimeType};base64,${base64}` } },
            { type: 'text', text: 'Extract all text from this document as markdown. Preserve structure: headers, lists, paragraphs, tables. Be thorough.' },
          ],
        }],
        max_tokens: 4096,
      }),
    });
    if (!resp.ok) {
      // Include a short excerpt of the error body; ignore failures reading it.
      const errText = await resp.text().catch(() => '');
      throw new Error(`VLM HTTP ${resp.status}: ${errText.substring(0, 200)}`);
    }
    const data = await resp.json();
    // Prefer the chat-completions shape, then fall back to flat text/markdown fields.
    return data.choices?.[0]?.message?.content || data.text || data.markdown || '';
  } catch (err) {
    console.error('[pdfs] VLM error:', err.message);
    return null;
  }
}
// POST /api/pdfs/upload
// Accepts one multipart file in the "file" field, extracts its text (VLM first
// when configured, pdfjs-dist otherwise), stores a row in `pdfs`, and responds
// 201 with that row plus a `used_vlm` flag. Responds 400 when no file was sent
// and 500 on any processing/database error.
router.post('/upload', upload.single('file'), async (req, res) => {
  if (!req.file) {
    return res.status(400).json({ error: 'No file uploaded' });
  }
  // Tells the error path whether a row already references the stored file.
  let rowInserted = false;
  try {
    const filePath = req.file.path;
    let text = '';
    let pages = 0;
    let usedVlm = false;
    // 1. Try VLM first (only when both endpoint and API key are configured)
    const vlmEndpoint = db.prepare("SELECT value FROM config WHERE key = 'vlm_endpoint'").get();
    const vlmApiKey = db.prepare("SELECT value FROM config WHERE key = 'vlm_api_key'").get();
    const vlmModel = db.prepare("SELECT value FROM config WHERE key = 'vlm_model'").get();
    if (vlmEndpoint?.value && vlmApiKey?.value) {
      const vlmConfig = {
        endpoint: vlmEndpoint.value,
        api_key: vlmApiKey.value,
        model: vlmModel?.value || 'glm-4.6v',
      };
      const vlmText = await vlmExtract(filePath, vlmConfig);
      // Only report the VLM as used when it produced non-blank text. Blank or
      // non-string output falls through to pdfjs, so used_vlm must stay false.
      if (typeof vlmText === 'string' && vlmText.trim().length > 0) {
        text = vlmText;
        usedVlm = true;
      }
    }
    // 2. Fallback to pdfjs-dist
    // Note: `pages` is only known on this path; it stays 0 when the VLM text is used.
    if (!usedVlm) {
      const extracted = await extractPDFText(filePath);
      text = extracted.text;
      pages = extracted.pages;
    }
    // New uploads go to the end of the list: one past the current maximum index
    // (0 when the table is empty).
    const maxReorder = db.prepare('SELECT MAX(reorder_index) as maxIdx FROM pdfs').get();
    const reorderIndex = (maxReorder?.maxIdx ?? -1) + 1;
    const info = db.prepare(`
      INSERT INTO pdfs (filename, original_name, content_markdown, pages, reorder_index)
      VALUES (?, ?, ?, ?, ?)
    `).run(req.file.filename, req.file.originalname, text || '', pages || 0, reorderIndex);
    rowInserted = true;
    const row = db.prepare('SELECT * FROM pdfs WHERE id = ?').get(info.lastInsertRowid);
    res.status(201).json({ ...row, used_vlm: usedVlm });
  } catch (err) {
    console.error('[pdfs] upload error:', err.message);
    if (!rowInserted) {
      // No row references the stored file, so remove it instead of leaving an
      // orphan in the upload directory. Best-effort: a cleanup failure is only logged.
      try {
        await fs.promises.rm(req.file.path, { force: true });
      } catch (cleanupErr) {
        console.error('[pdfs] upload cleanup error:', cleanupErr.message);
      }
    }
    res.status(500).json({ error: err.message });
  }
});
// GET /api/pdfs — list every PDF row (metadata, markdown content, reorder_index),
// ordered by reorder_index and then id. Responds 500 with the error message on failure.
router.get('/', (req, res) => {
  const listSql = 'SELECT id, filename, original_name, content_markdown, created_at, reorder_index, pages FROM pdfs ORDER BY reorder_index, id';
  try {
    const allPdfs = db.prepare(listSql).all();
    res.json(allPdfs);
  } catch (err) {
    console.error('[pdfs] list error:', err.message);
    res.status(500).json({ error: err.message });
  }
});
// GET /api/pdfs/:id — return the complete row (including full markdown) for one PDF.
// 400 when the id is not numeric, 404 when no such row exists, 500 on database errors.
router.get('/:id', (req, res) => {
  const pdfId = Number.parseInt(req.params.id, 10);
  if (Number.isNaN(pdfId)) {
    return res.status(400).json({ error: 'Invalid pdf id' });
  }
  try {
    const found = db.prepare('SELECT * FROM pdfs WHERE id = ?').get(pdfId);
    if (found) {
      res.json(found);
    } else {
      res.status(404).json({ error: 'PDF not found' });
    }
  } catch (err) {
    console.error('[pdfs] get error:', err.message);
    res.status(500).json({ error: err.message });
  }
});
// PUT /api/pdfs/:id/reorder — update reorder_index
// Body: { reorder_index: <integer> } (an integer-valued numeric string is also
// accepted and stored as a number). Responds with { id, reorder_index }.
// 400 for a non-numeric id or a missing/non-integer reorder_index,
// 404 when no row has that id, 500 on database errors.
router.put('/:id/reorder', (req, res) => {
  const id = parseInt(req.params.id, 10);
  if (Number.isNaN(id)) {
    return res.status(400).json({ error: 'Invalid pdf id' });
  }
  // req.body is undefined when no body was parsed; optional chaining turns that
  // into the "required" 400 below instead of a thrown TypeError.
  const rawIndex = req.body?.reorder_index;
  if (rawIndex === undefined) {
    return res.status(400).json({ error: 'reorder_index is required' });
  }
  // Accept numeric strings for leniency, but never let null, booleans, objects
  // or fractional values reach the database.
  const reorderIndex = typeof rawIndex === 'string' && rawIndex.trim() !== ''
    ? Number(rawIndex)
    : rawIndex;
  if (!Number.isInteger(reorderIndex)) {
    return res.status(400).json({ error: 'reorder_index must be an integer' });
  }
  try {
    const info = db.prepare('UPDATE pdfs SET reorder_index = ? WHERE id = ?').run(reorderIndex, id);
    // Zero changed rows means no PDF has this id.
    if (info.changes === 0) {
      return res.status(404).json({ error: 'PDF not found' });
    }
    res.json({ id, reorder_index: reorderIndex });
  } catch (err) {
    console.error('[pdfs] reorder error:', err.message);
    res.status(500).json({ error: err.message });
  }
});
// DELETE /api/pdfs/:id
// Removes the database row and then the stored file. Responds { deleted: true }.
// 400 when the id is not numeric, 404 when no such row exists, 500 on database errors.
router.delete('/:id', (req, res) => {
  const id = parseInt(req.params.id, 10);
  if (Number.isNaN(id)) {
    return res.status(400).json({ error: 'Invalid pdf id' });
  }
  try {
    const row = db.prepare('SELECT filename FROM pdfs WHERE id = ?').get(id);
    if (!row) {
      return res.status(404).json({ error: 'PDF not found' });
    }
    // Delete the row first: if this throws, the file is still on disk and the
    // record is not left pointing at a file that no longer exists.
    db.prepare('DELETE FROM pdfs WHERE id = ?').run(id);
    // Delete file from disk. `force: true` makes an already-missing file a no-op,
    // which avoids the check-then-unlink race of existsSync + unlinkSync.
    const filePath = path.join(uploadDir, row.filename);
    try {
      fs.rmSync(filePath, { force: true });
    } catch (fileErr) {
      // The record is already gone; a leftover file is logged, not reported as failure.
      console.error('[pdfs] delete file error:', fileErr.message);
    }
    res.json({ deleted: true });
  } catch (err) {
    console.error('[pdfs] delete error:', err.message);
    res.status(500).json({ error: err.message });
  }
});
// Export the router carrying the upload, list, get, reorder and delete routes.
// NOTE(review): the route comments use the /api/pdfs prefix — presumably the mount
// point chosen by the server entry file; confirm there.
module.exports = router;