| import express from 'express'; |
| import axios from 'axios'; |
| import { Readability } from '@mozilla/readability'; |
| import { JSDOM } from 'jsdom'; |
| import TurndownService from 'turndown'; |
|
|
|
|
// HTML-to-Markdown converter shared by the whole module, configured for
// ATX-style headings and fenced code blocks.
const turndownOptions = {
  codeBlockStyle: 'fenced',
  headingStyle: 'atx',
};
const turndownService = new TurndownService(turndownOptions);
|
|
/**
 * Downloads a web page and extracts its main article as Markdown.
 */
class WebsiteParser {
  /**
   * @param {object} [options]
   * @param {number} [options.timeoutMs=15000] - Milliseconds after which the
   *   HTTP request is aborted, so a stalled upstream cannot hang the caller.
   */
  constructor({ timeoutMs = 15000 } = {}) {
    this.timeoutMs = timeoutMs;
  }

  /**
   * Fetches `url`, runs Readability over the returned HTML and converts the
   * extracted article body to Markdown with the module-level Turndown service.
   *
   * @param {string} url - Absolute http(s) URL of the page to parse.
   * @returns {Promise<{title: *, content: string, excerpt: *, byline: *, siteName: *}>}
   *   The article fields reported by Readability, with `content` as Markdown.
   * @throws {Error} On an invalid or non-http(s) URL, a failed request, or a
   *   page Readability cannot parse. The underlying error is kept in `cause`.
   */
  async fetchAndParse(url) {
    try {
      // `new URL` rejects malformed input before any network traffic happens.
      // NOTE(review): this only restricts the scheme; it does not stop requests
      // to internal/private hosts. Add a host allow/deny list if this service
      // accepts URLs from untrusted callers.
      const { protocol } = new URL(url);
      if (protocol !== 'http:' && protocol !== 'https:') {
        throw new Error('поддерживаются только http(s) URL');
      }

      const response = await axios.get(url, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; MCPBot/1.0)',
        },
        timeout: this.timeoutMs,
        // Keep the body as the raw text that was received: JSDOM needs markup,
        // and axios's default response transform can JSON-parse the body.
        responseType: 'text',
        transformResponse: [(body) => body],
      });

      const dom = new JSDOM(response.data, { url });
      let article;
      try {
        article = new Readability(dom.window.document).parse();
      } finally {
        // Release the jsdom window whether or not parsing succeeded.
        dom.window.close();
      }

      if (!article) {
        throw new Error('не удалось спарсить страницу');
      }

      return {
        title: article.title,
        content: turndownService.turndown(article.content),
        excerpt: article.excerpt,
        byline: article.byline,
        siteName: article.siteName,
      };
    } catch (error) {
      // Same message format as before; the original error travels in `cause`.
      throw new Error(
        `ошибка парсинга или получения страницы: ${error.message}`,
        { cause: error },
      );
    }
  }
}
|
|
// Listening port: the PORT environment variable when set, otherwise 7860.
const PORT = process.env.PORT || 7860;

// Express application with JSON request-body parsing enabled.
const app = express();
app.use(express.json());

// Parser instance created once at module load.
const parser = new WebsiteParser();
|
|
/**
 * POST /parse
 *
 * Expects a JSON body of the form `{ "url": "<page address>" }`.
 * Responds with:
 *   200 `{ title, content, metadata: { excerpt, byline, siteName } }` on success,
 *   400 `{ error }` when `url` is missing, blank or not a string,
 *   500 `{ error }` when fetching or parsing the page fails.
 */
app.post('/parse', async (req, res) => {
  try {
    // Depending on the Express version, req.body can be undefined when no JSON
    // payload was parsed; fall back to an empty object so this yields a 400.
    const { url } = req.body ?? {};

    // Reject non-string and blank values here rather than letting them reach
    // the fetcher and come back as a misleading 500.
    if (typeof url !== 'string' || url.trim() === '') {
      return res.status(400).json({ error: 'необходимо указать URL!' });
    }

    const targetUrl = url.trim();
    const { title, content, excerpt, byline, siteName } =
      await parser.fetchAndParse(targetUrl);

    res.json({
      title,
      content,
      metadata: { excerpt, byline, siteName },
    });
  } catch (error) {
    res.status(500).json({
      error: error.message,
    });
  }
});
|
|
// GET / replies with a ready-to-run curl command showing how to call POST /parse.
app.get('/', (_req, res) => {
  const usageExample = `curl -X POST https://prolapse-read.hf.space/parse -H "Content-Type: application/json" -d '{"url": "https://habr.com/ru/companies/serverspace/articles/869252/"}'`;
  res.send(usageExample);
});
|
|
// Bind the server to PORT; the callback passed to listen() logs the port.
function announceListening() {
  console.log(`Server running on port ${PORT}`);
}

app.listen(PORT, announceListening);
|
|