"use strict"; var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) { var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d; if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc); else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r; return c > 3 && r && Object.defineProperty(target, key, r), r; }; var __metadata = (this && this.__metadata) || function (k, v) { if (typeof Reflect === "object" && typeof Reflect.metadata === "function") return Reflect.metadata(k, v); }; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; var _a; Object.defineProperty(exports, "__esModule", { value: true }); exports.SnapshotFormatter = exports.md5Hasher = void 0; exports.highlightedCodeBlock = highlightedCodeBlock; const crypto_1 = require("crypto"); const tsyringe_1 = require("tsyringe"); const civkit_1 = require("civkit"); const turndown_1 = __importDefault(require("turndown")); const logger_1 = require("./logger"); const firebase_storage_bucket_1 = require("../shared/services/firebase-storage-bucket"); const async_context_1 = require("../shared/services/async-context"); const threaded_1 = require("../services/threaded"); const jsdom_1 = require("./jsdom"); const alt_text_1 = require("./alt-text"); const pdf_extract_1 = require("./pdf-extract"); const misc_1 = require("../utils/misc"); const lodash_1 = __importDefault(require("lodash")); const http_1 = require("http"); const encoding_1 = require("../utils/encoding"); const url_1 = require("url"); const openai_1 = require("../shared/utils/openai"); exports.md5Hasher = new civkit_1.HashManager('md5', 'hex'); const gfmPlugin = require('turndown-plugin-gfm'); const highlightRegExp = /highlight-(?:text|source)-([a-z0-9]+)/; function highlightedCodeBlock(turndownService) { turndownService.addRule('highlightedCodeBlock', { filter: (node) => { return (node.nodeName === 'DIV' && node.firstChild?.nodeName === 'PRE' && highlightRegExp.test(node.className)); }, replacement: (_content, node, options) => { const className = node.className || ''; const language = (className.match(highlightRegExp) || [null, ''])[1]; return ('\n\n' + options.fence + language + '\n' + node.firstChild.textContent + '\n' + options.fence + '\n\n'); } }); } let SnapshotFormatter = class SnapshotFormatter extends civkit_1.AsyncService { constructor(globalLogger, jsdomControl, altTextService, pdfExtractor, threadLocal, firebaseObjectStorage) { super(...arguments); this.globalLogger = globalLogger; this.jsdomControl = jsdomControl; this.altTextService = altTextService; this.pdfExtractor = pdfExtractor; this.threadLocal = threadLocal; this.firebaseObjectStorage = firebaseObjectStorage; this.logger = this.globalLogger.child({ service: this.constructor.name }); this.gfmPlugin = [gfmPlugin.tables, highlightedCodeBlock, gfmPlugin.strikethrough, gfmPlugin.taskListItems]; this.gfmNoTable = [highlightedCodeBlock, gfmPlugin.strikethrough, gfmPlugin.taskListItems]; } async init() { await this.dependencyReady(); this.emit('ready'); } async formatSnapshot(mode, snapshot, nominalUrl, urlValidMs = 3600 * 1000 * 4) { const t0 = Date.now(); const f = { ...(await this.getGeneralSnapshotMixins(snapshot)), }; let modeOK = false; if (mode.includes('screenshot')) { modeOK = true; if (snapshot.screenshot && !snapshot.screenshotUrl) { const fid = `instant-screenshots/${(0, crypto_1.randomUUID)()}`; await this.firebaseObjectStorage.saveFile(fid, snapshot.screenshot, { metadata: { contentType: 'image/png', } }); snapshot.screenshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + urlValidMs); } Object.assign(f, { screenshotUrl: snapshot.screenshotUrl, }); Object.defineProperty(f, 'textRepresentation', { value: `${f.screenshotUrl}\n`, enumerable: false, configurable: true }); } if (mode.includes('pageshot')) { modeOK = true; if (snapshot.pageshot && !snapshot.pageshotUrl) { const fid = `instant-screenshots/${(0, crypto_1.randomUUID)()}`; await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, { metadata: { contentType: 'image/png', } }); snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + urlValidMs); } Object.assign(f, { html: snapshot.html, pageshotUrl: snapshot.pageshotUrl, }); Object.defineProperty(f, 'textRepresentation', { value: `${f.pageshotUrl}\n`, enumerable: false, configurable: true }); } if (mode.includes('html')) { modeOK = true; Object.assign(f, { html: snapshot.html, }); Object.defineProperty(f, 'textRepresentation', { value: snapshot.html, enumerable: false, configurable: true }); } let pdfMode = false; // in case of Google Web Cache content if (snapshot.pdfs?.length && (!snapshot.title || snapshot.title.startsWith('cache:'))) { const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0], this.threadLocal.get('cacheTolerance'), snapshot.pdfs[0].startsWith('http') ? undefined : snapshot.href); if (pdf) { pdfMode = true; snapshot.title = pdf.meta?.Title; snapshot.text = pdf.text || snapshot.text; snapshot.parsed = { content: pdf.content, textContent: pdf.content, length: pdf.content?.length, byline: pdf.meta?.Author, lang: pdf.meta?.Language || undefined, title: pdf.meta?.Title, publishedTime: this.pdfExtractor.parsePdfDate(pdf.meta?.ModDate || pdf.meta?.CreationDate)?.toISOString(), }; } } if (mode.includes('text')) { modeOK = true; Object.assign(f, { text: snapshot.text, }); Object.defineProperty(f, 'textRepresentation', { value: snapshot.text, enumerable: false, configurable: true }); } if (mode.includes('lm')) { modeOK = true; f.content = snapshot.parsed?.textContent; } if (modeOK && (mode.includes('lm') || (!mode.includes('markdown') && !mode.includes('content')))) { const dt = Date.now() - t0; this.logger.debug(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt }); const formatted = { title: (snapshot.parsed?.title || snapshot.title || '').trim(), description: (snapshot.description || '').trim(), url: nominalUrl?.toString() || snapshot.href?.trim(), publishedTime: snapshot.parsed?.publishedTime || undefined, }; Object.assign(f, formatted); return f; } const imgDataUrlToObjectUrl = !Boolean(this.threadLocal.get('keepImgDataUrl')); let contentText = ''; const imageSummary = {}; const imageIdxTrack = new Map(); const uid = this.threadLocal.get('uid'); do { if (pdfMode) { contentText = (snapshot.parsed?.content || snapshot.text || '').trim(); break; } if (snapshot.maxElemDepth > 256 || (!uid && snapshot.elemCount > 10_000) || snapshot.elemCount > 80_000) { this.logger.warn('Degrading to text to protect the server', { url: snapshot.href, elemDepth: snapshot.maxElemDepth, elemCount: snapshot.elemCount }); contentText = (snapshot.text || '').trimEnd(); break; } const noGFMOpts = this.threadLocal.get('noGfm'); const imageRetention = this.threadLocal.get('retainImages'); let imgIdx = 0; const urlToAltMap = {}; const customRules = { 'img-retention': { filter: 'img', replacement: (_content, node) => { if (imageRetention === 'none') { return ''; } const alt = (0, misc_1.cleanAttribute)(node.getAttribute('alt')); if (imageRetention === 'alt') { return alt ? `(Image ${++imgIdx}: ${alt})` : ''; } const originalSrc = (node.getAttribute('src') || '').trim(); let linkPreferredSrc = originalSrc; const maybeSrcSet = (node.getAttribute('srcset') || '').trim(); if (!linkPreferredSrc && maybeSrcSet) { linkPreferredSrc = maybeSrcSet.split(',').map((x) => x.trim()).filter(Boolean)[0]; } if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) { const dataSrc = (node.getAttribute('data-src') || '').trim(); if (dataSrc && !dataSrc.startsWith('data:')) { linkPreferredSrc = dataSrc; } } let src; try { src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString(); } catch (_err) { void 0; } if (!src) { return ''; } const keySrc = (originalSrc.startsWith('data:') ? this.dataUrlToBlobUrl(originalSrc, snapshot.rebase) : src).trim(); const mapped = urlToAltMap[keySrc]; const imgSerial = ++imgIdx; const idxArr = imageIdxTrack.has(keySrc) ? imageIdxTrack.get(keySrc) : []; idxArr.push(imgSerial); imageIdxTrack.set(keySrc, idxArr); if (mapped) { imageSummary[keySrc] = mapped || alt; if (imageRetention === 'alt_p') { return `(Image ${imgSerial}: ${mapped || alt})`; } if (imgDataUrlToObjectUrl) { return `![Image ${imgSerial}: ${mapped || alt}](${keySrc})`; } return `![Image ${imgSerial}: ${mapped || alt}](${src})`; } else if (imageRetention === 'alt_p') { return alt ? `(Image ${imgSerial}: ${alt})` : ''; } imageSummary[keySrc] = alt || ''; if (imgDataUrlToObjectUrl) { return alt ? `![Image ${imgSerial}: ${alt}](${keySrc})` : `![Image ${imgSerial}](${keySrc})`; } return alt ? `![Image ${imgSerial}: ${alt}](${src})` : `![Image ${imgSerial}](${src})`; } } }; const optsMixin = { url: snapshot.rebase || nominalUrl, customRules, customKeep: noGFMOpts === 'table' ? 'table' : undefined, imgDataUrlToObjectUrl, }; const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href); let toBeTurnedToMd = jsDomElementOfHTML; let turnDownService = this.getTurndown({ ...optsMixin }); if (!mode.includes('markdown') && snapshot.parsed?.content) { const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href); const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML); imgIdx = 0; const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : ''; // If Readability did its job if (par2.length >= 0.3 * par1.length) { turnDownService = this.getTurndown({ noRules: true, ...optsMixin }); imgIdx = 0; if (snapshot.parsed.content) { toBeTurnedToMd = jsDomElementOfParsed; } } } if (!noGFMOpts) { turnDownService = turnDownService.use(noGFMOpts === 'table' ? this.gfmNoTable : this.gfmPlugin); } // _p is the special suffix for withGeneratedAlt if (snapshot.imgs?.length && imageRetention?.endsWith('_p')) { const tasks = lodash_1.default.uniqBy((snapshot.imgs || []), 'src').map(async (x) => { const r = await this.altTextService.getAltText(x).catch((err) => { this.logger.warn(`Failed to get alt text for ${x.src}`, { err: (0, civkit_1.marshalErrorLike)(err) }); return undefined; }); if (r && x.src) { // note x.src here is already rebased to absolute url by browser/upstream. const keySrc = (x.src.startsWith('data:') ? this.dataUrlToBlobUrl(x.src, snapshot.rebase) : x.src).trim(); urlToAltMap[keySrc] = r; } }); await Promise.all(tasks); } if (toBeTurnedToMd) { try { contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim(); imgIdx = 0; } catch (err) { this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); const vanillaTurnDownService = this.getTurndown({ ...optsMixin }); try { contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim(); imgIdx = 0; } catch (err2) { this.logger.warn(`Turndown failed to run, giving up`, { err: err2 }); } } } if (this.isPoorlyTransformed(contentText, toBeTurnedToMd) && toBeTurnedToMd !== jsDomElementOfHTML) { toBeTurnedToMd = jsDomElementOfHTML; try { contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML).trim(); imgIdx = 0; } catch (err) { this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); const vanillaTurnDownService = this.getTurndown({ ...optsMixin }); try { contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML).trim(); imgIdx = 0; } catch (err2) { this.logger.warn(`Turndown failed to run, giving up`, { err: err2 }); } } } if (mode === 'content' && this.isPoorlyTransformed(contentText, toBeTurnedToMd)) { contentText = (snapshot.text || '').trimEnd(); } } while (false); const formatted = { title: (snapshot.parsed?.title || snapshot.title || '').trim(), description: (snapshot.description || '').trim(), url: nominalUrl?.toString() || snapshot.href?.trim(), content: contentText, publishedTime: snapshot.parsed?.publishedTime || undefined, }; if (snapshot.status) { const code = snapshot.status; const n = code - 200; if (n < 0 || n >= 200) { const text = snapshot.statusText || http_1.STATUS_CODES[code]; formatted.warning ??= ''; const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`; formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`; } } if (this.threadLocal.get('withImagesSummary')) { formatted.images = (0, lodash_1.default)(imageSummary) .toPairs() .map(([url, alt], i) => { const idxTrack = imageIdxTrack.get(url); const tag = idxTrack?.length ? `Image ${lodash_1.default.uniq(idxTrack).join(',')}` : `Hidden Image ${i + 1}`; return [`${tag}${alt ? `: ${alt}` : ''}`, url]; }).fromPairs() .value(); } if (this.threadLocal.get('withLinksSummary')) { const links = (await this.jsdomControl.inferSnapshot(snapshot)).links; if (this.threadLocal.get('withLinksSummary') === 'all') { formatted.links = links; } else { formatted.links = (0, lodash_1.default)(links).filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:')).uniqBy(1).fromPairs().value(); } } if ((0, openai_1.countGPTToken)(formatted.content) < 200) { formatted.warning ??= ''; if (snapshot.isIntermediate) { const msg = 'This page maybe not yet fully loaded, consider explicitly specify a timeout.'; formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`; } if (snapshot.childFrames?.length && !this.threadLocal.get('withIframe')) { const msg = 'This page contains iframe that are currently hidden, consider enabling iframe processing.'; formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`; } if (snapshot.shadowExpanded && !this.threadLocal.get('withShadowDom')) { const msg = 'This page contains shadow DOM that are currently hidden, consider enabling shadow DOM processing.'; formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`; } if (snapshot.html.includes('captcha') || snapshot.html.includes('cf-turnstile-response')) { const msg = 'This page maybe requiring CAPTCHA, please make sure you are authorized to access this page.'; formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`; } if (snapshot.isFromCache) { const msg = 'This is a cached snapshot of the original page, consider retry with caching opt-out.'; formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`; } } Object.assign(f, formatted); const textRepresentation = (function () { const mixins = []; if (this.publishedTime) { mixins.push(`Published Time: ${this.publishedTime}`); } const suffixMixins = []; if (this.images) { const imageSummaryChunks = ['Images:']; for (const [k, v] of Object.entries(this.images)) { imageSummaryChunks.push(`- ![${k}](${v})`); } if (imageSummaryChunks.length === 1) { imageSummaryChunks.push('This page does not seem to contain any images.'); } suffixMixins.push(imageSummaryChunks.join('\n')); } if (this.links) { const linkSummaryChunks = ['Links/Buttons:']; if (Array.isArray(this.links)) { for (const [k, v] of this.links) { linkSummaryChunks.push(`- [${k}](${v})`); } } else { for (const [k, v] of Object.entries(this.links)) { linkSummaryChunks.push(`- [${k}](${v})`); } } if (linkSummaryChunks.length === 1) { linkSummaryChunks.push('This page does not seem to contain any buttons/links.'); } suffixMixins.push(linkSummaryChunks.join('\n')); } if (this.warning) { mixins.push(this.warning.split('\n').map((v) => `Warning: ${v}`).join('\n')); } if (mode.includes('markdown')) { return `${mixins.length ? `${mixins.join('\n\n')}\n\n` : ''}${this.content} ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; } return `Title: ${this.title} URL Source: ${this.url} ${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''} Markdown Content: ${this.content} ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; }).call(formatted); Object.defineProperty(f, 'textRepresentation', { value: textRepresentation, enumerable: false }); const dt = Date.now() - t0; this.logger.debug(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt }); return f; } dataUrlToBlobUrl(dataUrl, baseUrl = 'http://localhost/') { const refUrl = new URL(baseUrl); const mappedUrl = new URL(`blob:${refUrl.origin || 'localhost'}/${exports.md5Hasher.hash(dataUrl)}`); return mappedUrl.href; } async getGeneralSnapshotMixins(snapshot) { let inferred; const mixin = {}; if (this.threadLocal.get('withImagesSummary')) { inferred ??= await this.jsdomControl.inferSnapshot(snapshot); const imageSummary = {}; const imageIdxTrack = new Map(); let imgIdx = 0; for (const img of inferred.imgs) { const imgSerial = ++imgIdx; const keySrc = (img.src.startsWith('data:') ? this.dataUrlToBlobUrl(img.src, snapshot.rebase) : img.src).trim(); const idxArr = imageIdxTrack.has(keySrc) ? imageIdxTrack.get(keySrc) : []; idxArr.push(imgSerial); imageIdxTrack.set(keySrc, idxArr); imageSummary[keySrc] = img.alt || ''; } mixin.images = (0, lodash_1.default)(imageSummary) .toPairs() .map(([url, alt], i) => { const idxTrack = imageIdxTrack.get(url); const tag = idxTrack?.length ? `Image ${lodash_1.default.uniq(idxTrack).join(',')}` : `Hidden Image ${i + 1}`; return [`${tag}${alt ? `: ${alt}` : ''}`, url]; }).fromPairs() .value(); } if (this.threadLocal.get('withLinksSummary')) { inferred ??= await this.jsdomControl.inferSnapshot(snapshot); if (this.threadLocal.get('withLinksSummary') === 'all') { mixin.links = inferred.links; } else { mixin.links = (0, lodash_1.default)(inferred.links).filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:')).uniqBy(1).fromPairs().value(); } } if (snapshot.status) { const code = snapshot.status; const n = code - 200; if (n < 0 || n >= 200) { const text = snapshot.statusText || http_1.STATUS_CODES[code]; mixin.warning ??= ''; const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`; mixin.warning = `${mixin.warning}${mixin.warning ? '\n' : ''}${msg}`; } } return mixin; } getTurndown(options) { const turndownOpts = this.threadLocal.get('turndownOpts'); const turnDownService = new turndown_1.default({ ...turndownOpts, codeBlockStyle: 'fenced', preformattedCode: true, }); if (options?.customKeep) { turnDownService.keep(options.customKeep); } if (!options?.noRules) { turnDownService.addRule('remove-irrelevant', { filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'], replacement: () => '' }); turnDownService.addRule('truncate-svg', { filter: 'svg', replacement: () => '' }); turnDownService.addRule('title-as-h1', { filter: ['title'], replacement: (innerText) => `${innerText}\n===============\n` }); } if (options?.imgDataUrlToObjectUrl) { turnDownService.addRule('data-url-to-pseudo-object-url', { filter: (node) => Boolean(node.tagName === 'IMG' && node.getAttribute('src')?.startsWith('data:')), replacement: (_content, node) => { const src = (node.getAttribute('src') || '').trim(); const alt = (0, misc_1.cleanAttribute)(node.getAttribute('alt')) || ''; const blobUrl = this.dataUrlToBlobUrl(src, options.url?.toString()); return `![${alt}](${blobUrl})`; } }); } if (options?.customRules) { for (const [k, v] of Object.entries(options.customRules)) { turnDownService.addRule(k, v); } } turnDownService.addRule('improved-heading', { filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'], replacement: (content, node, options) => { const hLevel = Number(node.nodeName.charAt(1)); if (options.headingStyle === 'setext' && hLevel < 3) { const underline = lodash_1.default.repeat((hLevel === 1 ? '=' : '-'), Math.min(128, content.length)); return ('\n\n' + content + '\n' + underline + '\n\n'); } else { return '\n\n' + lodash_1.default.repeat('#', hLevel) + ' ' + content + '\n\n'; } } }); turnDownService.addRule('improved-paragraph', { filter: 'p', replacement: (innerText) => { const trimmed = innerText.trim(); if (!trimmed) { return ''; } return `${trimmed.replace(/\n{3,}/g, '\n\n')}\n\n`; } }); let realLinkStyle = 'inlined'; if (turndownOpts?.linkStyle === 'referenced' || turndownOpts?.linkReferenceStyle) { realLinkStyle = 'referenced'; if (turndownOpts?.linkReferenceStyle === 'collapsed') { realLinkStyle = 'collapsed'; } else if (turndownOpts?.linkReferenceStyle === 'shortcut') { realLinkStyle = 'shortcut'; } else if (turndownOpts?.linkReferenceStyle === 'discarded') { realLinkStyle = 'discarded'; } } else if (turndownOpts?.linkStyle === 'discarded') { realLinkStyle = 'discarded'; } turnDownService.addRule('improved-link', { filter: function (node, _options) { return Boolean(node.nodeName === 'A' && node.getAttribute('href')); }, replacement: function (content, node) { var href = node.getAttribute('href'); let title = (0, misc_1.cleanAttribute)(node.getAttribute('title')); if (title) title = ` "${title.replace(/"/g, '\\"')}"`; let replacement; let reference; const fixedContent = content.replace(/\s+/g, ' ').trim(); let fixedHref = href; if (options?.url) { try { fixedHref = new URL(fixedHref, options.url).toString(); } catch (_err) { void 0; } } switch (realLinkStyle) { case 'inlined': replacement = `[${fixedContent}](${fixedHref}${title || ''})`; reference = undefined; break; case 'collapsed': replacement = `[${fixedContent}][]`; reference = `[${fixedContent}]: ${fixedHref}${title}`; break; case 'shortcut': replacement = `[${fixedContent}]`; reference = `[${fixedContent}]: ${fixedHref}${title}`; break; case 'discarded': replacement = content; reference = undefined; break; default: const id = this.references.length + 1; replacement = `[${fixedContent}][${id}]`; reference = `[${id}]${fixedHref}${title}`; } if (reference) { this.references.push(reference); } return replacement; }, // @ts-ignore references: [], append: function () { let references = ''; if (this.references.length) { references = `\n\n${this.references.join('\n')}\n\n`; this.references = []; // Reset references } return references; } }); turnDownService.addRule('improved-code', { filter: function (node) { let hasSiblings = node.previousSibling || node.nextSibling; let isCodeBlock = node.parentNode.nodeName === 'PRE' && !hasSiblings; return node.nodeName === 'CODE' && !isCodeBlock; }, replacement: function (inputContent) { if (!inputContent) return ''; let content = inputContent; let delimiter = '`'; let matches = content.match(/`+/gm) || []; while (matches.indexOf(delimiter) !== -1) delimiter = delimiter + '`'; if (content.includes('\n')) { delimiter = '```'; } let extraSpace = delimiter === '```' ? '\n' : /^`|^ .*?[^ ].* $|`$/.test(content) ? ' ' : ''; return delimiter + extraSpace + content + (delimiter === '```' && !content.endsWith(extraSpace) ? extraSpace : '') + delimiter; } }); return turnDownService; } isPoorlyTransformed(content, node) { if (!content) { return true; } if (content.startsWith('<') && content.endsWith('>')) { return true; } if (!this.threadLocal.get('noGfm') && content.includes('')) { if (node?.textContent && content.length > node.textContent.length * 0.8) { return true; } const tableElms = node?.querySelectorAll('table') || []; const deepTableElms = node?.querySelectorAll('table table'); if (node && tableElms.length) { const wrappingTables = lodash_1.default.without(tableElms, ...Array.from(deepTableElms || [])); const tableTextsLength = lodash_1.default.sum(wrappingTables.map((x) => (x.innerHTML?.length || 0))); if (tableTextsLength / (content.length) > 0.6) { return true; } } const tbodyElms = node?.querySelectorAll('tbody') || []; const deepTbodyElms = node?.querySelectorAll('tbody tbody'); if ((deepTbodyElms?.length || 0) / tbodyElms.length > 0.6) { return true; } } return false; } async createSnapshotFromFile(url, file, overrideContentType, overrideFileName) { if (overrideContentType === 'application/octet-stream') { overrideContentType = undefined; } const contentType = (overrideContentType || await file.mimeType).toLowerCase(); const fileName = overrideFileName || `${url.origin}${url.pathname}`; const snapshot = { title: '', href: url.href, html: '', text: '' }; if (contentType.startsWith('image/')) { snapshot.html = `${fileName}`; snapshot.title = fileName; snapshot.imgs = [{ src: url.href }]; return snapshot; } try { const encoding = contentType.includes('charset=') ? contentType.split('charset=')[1]?.trim().toLowerCase() : 'utf-8'; if (contentType.startsWith('text/html')) { if ((await file.size) > 1024 * 1024 * 32) { throw new civkit_1.AssertionFailureError(`Failed to access ${url}: file too large`); } snapshot.html = await (0, encoding_1.readFile)(await file.filePath, encoding); let innerCharset; const peek = snapshot.html.slice(0, 1024); innerCharset ??= peek.match(/]+text\/html;\s*?charset=([^>"]+)/i)?.[1]?.toLowerCase(); innerCharset ??= peek.match(/]+charset="([^>"]+)\"/i)?.[1]?.toLowerCase(); if (innerCharset && innerCharset !== encoding) { snapshot.html = await (0, encoding_1.readFile)(await file.filePath, innerCharset); } return snapshot; } if (contentType.startsWith('text/') || contentType.startsWith('application/json')) { if ((await file.size) > 1024 * 1024 * 32) { throw new civkit_1.AssertionFailureError(`Failed to access ${url}: file too large`); } snapshot.text = await (0, encoding_1.readFile)(await file.filePath, encoding); snapshot.html = `
${snapshot.text}
`; return snapshot; } if (contentType.startsWith('application/pdf')) { snapshot.pdfs = [(0, url_1.pathToFileURL)(await file.filePath).href]; return snapshot; } } catch (err) { this.logger.warn(`Failed to read from file: ${url}`, { err, url }); throw new civkit_1.DataStreamBrokenError(`Failed to access ${url}: ${err?.message}`); } throw new civkit_1.AssertionFailureError(`Failed to access ${url}: unexpected type ${contentType}`); } }; exports.SnapshotFormatter = SnapshotFormatter; __decorate([ (0, threaded_1.Threaded)(), __metadata("design:type", Function), __metadata("design:paramtypes", [String, Object, typeof (_a = typeof URL !== "undefined" && URL) === "function" ? _a : Object, Object]), __metadata("design:returntype", Promise) ], SnapshotFormatter.prototype, "formatSnapshot", null); exports.SnapshotFormatter = SnapshotFormatter = __decorate([ (0, tsyringe_1.singleton)(), __metadata("design:paramtypes", [logger_1.GlobalLogger, jsdom_1.JSDomControl, alt_text_1.AltTextService, pdf_extract_1.PDFExtractor, async_context_1.AsyncContext, firebase_storage_bucket_1.FirebaseStorageBucketControl]) ], SnapshotFormatter); const snapshotFormatter = tsyringe_1.container.resolve(SnapshotFormatter); exports.default = snapshotFormatter; //# sourceMappingURL=snapshot-formatter.js.map