// Spaces:
// Build error
// Build error
;
/**
 * TypeScript decorator helper: applies `decorators` to a class (2 arguments),
 * to a property (3 arguments), or to a member with a descriptor (4 arguments).
 *
 * An already-installed `this.__decorate` is reused when present. When
 * `Reflect.decorate` is a function it is delegated to; otherwise the decorators
 * are applied from last to first.
 *
 * @param {Array<Function>} decorators - Decorators to apply; falsy entries are skipped.
 * @param {object} target - Class constructor, or the object that owns the member.
 * @param {string|symbol} [key] - Member name (member decoration only).
 * @param {PropertyDescriptor|null} [desc] - Member descriptor; `null` means
 *   "look it up on `target`".
 * @returns {*} The decorated class, or the resulting descriptor (possibly undefined).
 */
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
    const argCount = arguments.length;
    let result;
    if (argCount < 3) {
        // Class decoration: decorators receive and may replace the constructor.
        result = target;
    }
    else {
        if (desc === null) {
            desc = Object.getOwnPropertyDescriptor(target, key);
        }
        result = desc;
    }
    if (typeof Reflect === "object" && typeof Reflect.decorate === "function") {
        result = Reflect.decorate(decorators, target, key, desc);
    }
    else {
        for (let idx = decorators.length - 1; idx >= 0; idx--) {
            const decorator = decorators[idx];
            if (!decorator) {
                continue;
            }
            let produced;
            if (argCount < 3) {
                produced = decorator(result);
            }
            else if (argCount > 3) {
                produced = decorator(target, key, result);
            }
            else {
                produced = decorator(target, key);
            }
            // A decorator that returns nothing leaves the current result in place.
            result = produced || result;
        }
    }
    if (argCount > 3 && result) {
        Object.defineProperty(target, key, result);
    }
    return result;
};
/**
 * TypeScript metadata helper: returns `Reflect.metadata(k, v)` when
 * `Reflect.metadata` is a function, and `undefined` otherwise.
 * An already-installed `this.__metadata` is reused when present.
 *
 * @param {*} k - Metadata key (e.g. "design:type").
 * @param {*} v - Metadata value.
 * @returns {*} Whatever `Reflect.metadata` returns, or undefined when it is unavailable.
 */
var __metadata = (this && this.__metadata) || function (k, v) {
    const canRecord = typeof Reflect === "object" && typeof Reflect.metadata === "function";
    return canRecord ? Reflect.metadata(k, v) : undefined;
};
// CommonJS module preamble: mark the module as transpiled from ES module syntax
// and declare the named export before the class is defined.
Object.defineProperty(exports, "__esModule", { value: true });
exports.JSDomControl = void 0;
const tsyringe_1 = require("tsyringe");
const logger_1 = require("./logger");
const readability_1 = require("@mozilla/readability");
const threaded_1 = require("../services/threaded");
const tailwind_classes_1 = require("../utils/tailwind-classes");
const openai_1 = require("../shared/utils/openai");
const async_service_1 = require("civkit/async-service");
const civ_rpc_1 = require("civkit/civ-rpc");
// linkedom is loaded through a dynamic import(); the pending promise is awaited
// in JSDomControl#init before the service reports itself ready.
const pLinkedom = import('linkedom');
/**
 * HTML post-processing service backed by linkedom.
 *
 * Works on page "snapshots" (objects carrying at least `html`/`href`, optionally
 * `rebase`, `title`, `description`, `imgs`, `childFrames`, `shadowExpanded`, ...)
 * and offers helpers to narrow a snapshot to selected elements, to infer its links
 * and images, and to clean raw HTML before handing it to a language model.
 *
 * Several methods are wrapped with the `Threaded` decorator after the class body;
 * NOTE(review): presumably that runs them off the main thread — confirm against
 * `../services/threaded`.
 */
let JSDomControl = class JSDomControl extends async_service_1.AsyncService {
    /**
     * @param {object} globalLogger - Logger exposing `child()`; a child logger tagged
     *   with this service's class name is stored on `this.logger`.
     */
    constructor(globalLogger) {
        // Forward every constructor argument to AsyncService unchanged.
        super(...arguments);
        this.globalLogger = globalLogger;
        this.logger = this.globalLogger.child({ service: this.constructor.name });
    }

    /**
     * Waits for dependencies, resolves the lazily imported linkedom module and
     * then emits `ready`.
     */
    async init() {
        await this.dependencyReady();
        this.linkedom = await pLinkedom;
        this.emit('ready');
    }

    /**
     * Parses HTML with linkedom; when the result has no `documentElement`
     * (e.g. a bare fragment) the source is re-parsed wrapped in `<html><body>`.
     *
     * @param {string} sourceHTML - Markup to parse.
     * @returns {object} The linkedom parse result (exposes `window.document`).
     */
    parseWithFallback(sourceHTML) {
        const parsed = this.linkedom.parseHTML(sourceHTML);
        if (parsed.window.document.documentElement) {
            return parsed;
        }
        return this.linkedom.parseHTML(`<html><body>${sourceHTML}</body></html>`);
    }

    /**
     * Entry point for narrowing a snapshot.
     *
     * Returns the snapshot untouched when it is already parsed and no narrowing
     * option is set, or when it carries no HTML. Otherwise delegates to
     * `actualNarrowSnapshot`.
     *
     * @param {object} snapshot - Page snapshot.
     * @param {object} [options] - `targetSelector`, `removeSelector`, `withIframe`,
     *   `withShadowDom`, `sideLoad`, ...
     * @returns {Promise<object|undefined>} Narrowed snapshot; undefined when a target
     *   selector matched nothing.
     * @throws ApplicationError instances are rethrown as-is; any other failure is
     *   wrapped in an AssertionFailureError.
     */
    async narrowSnapshot(snapshot, options) {
        const wantsNarrowing = Boolean(options?.targetSelector ||
            options?.removeSelector ||
            options?.withIframe ||
            options?.withShadowDom);
        if (snapshot?.parsed && !wantsNarrowing) {
            return snapshot;
        }
        if (!snapshot?.html) {
            return snapshot;
        }
        try {
            // SideLoad contains native objects that cannot go through thread boundaries.
            return await this.actualNarrowSnapshot(snapshot, { ...options, sideLoad: undefined });
        }
        catch (err) {
            this.logger.warn(`Error narrowing snapshot`, { err });
            if (err instanceof civ_rpc_1.ApplicationError) {
                throw err;
            }
            throw new civ_rpc_1.AssertionFailureError(`Failed to process the page: ${err?.message}`);
        }
    }

    /**
     * Re-parses the snapshot HTML and rebuilds the snapshot around the selected content.
     *
     * Steps: empty every `<svg>`, optionally inline or quote child frames, drop
     * `removeSelector` matches, collect `targetSelector` matches (or the whole
     * document), run Readability on the result and recompute `html`, `text` and `imgs`.
     *
     * @param {object} snapshot - Page snapshot with `html` (and optionally
     *   `shadowExpanded`, `childFrames`, `imgs`, `rebase`, `href`, ...).
     * @param {object} [options] - Narrowing options (see `narrowSnapshot`).
     * @returns {Promise<object|undefined>} The rebuilt snapshot, or undefined when a
     *   target selector was given but matched nothing.
     */
    async actualNarrowSnapshot(snapshot, options) {
        const t0 = Date.now();
        const sourceHTML = (options?.withShadowDom && snapshot.shadowExpanded)
            ? snapshot.shadowExpanded
            : snapshot.html;
        const jsdom = this.parseWithFallback(sourceHTML);
        const doc = jsdom.window.document;

        doc.querySelectorAll('svg').forEach((svg) => {
            svg.innerHTML = '';
        });

        if (options?.withIframe) {
            doc.querySelectorAll('iframe[src],frame[src]').forEach((frameElem) => {
                const src = frameElem.getAttribute('src');
                const frameSnapshot = snapshot.childFrames?.find((f) => f.href === src);
                if (options.withIframe === 'quoted') {
                    const blockquoteElem = doc.createElement('blockquote');
                    const preElem = doc.createElement('pre');
                    // The frame text is plain text: assign it as text so characters
                    // such as "<" or "&" are not interpreted as markup.
                    preElem.textContent = frameSnapshot?.text || '';
                    blockquoteElem.appendChild(preElem);
                    frameElem.replaceWith(blockquoteElem);
                    return;
                }
                if (!frameSnapshot?.html) {
                    return;
                }
                frameElem.innerHTML = frameSnapshot.html;
                frameElem.querySelectorAll('script, style').forEach((s) => s.remove());
                if (!src) {
                    return;
                }
                // Re-anchor relative URLs inside the inlined frame on the frame's own URL.
                for (const attrName of ['src', 'href']) {
                    frameElem.querySelectorAll(`[${attrName}]`).forEach((el) => {
                        const rawValue = el.getAttribute(attrName);
                        if (URL.canParse(rawValue, src)) {
                            el.setAttribute(attrName, new URL(rawValue, src).toString());
                        }
                    });
                }
            });
        }

        const removeSelector = options?.removeSelector;
        let removeSelectors = [];
        if (Array.isArray(removeSelector)) {
            removeSelectors = removeSelector;
        }
        else if (removeSelector) {
            removeSelectors = [removeSelector];
        }
        for (const selector of removeSelectors) {
            doc.querySelectorAll(selector).forEach((el) => el.remove());
        }

        const targetSelector = options?.targetSelector;
        const hasTargetSelector = Array.isArray(targetSelector) || Boolean(targetSelector);
        const allNodes = [];
        if (hasTargetSelector) {
            const targetSelectors = Array.isArray(targetSelector) ? targetSelector : [targetSelector];
            // Set-based de-duplication keeps first-seen order without rescanning the array.
            const seenNodes = new Set();
            for (const selector of targetSelectors) {
                doc.querySelectorAll(selector).forEach((el) => {
                    if (!seenNodes.has(el)) {
                        seenNodes.add(el);
                        allNodes.push(el);
                    }
                });
            }
        }
        else {
            allNodes.push(doc);
        }
        if (!allNodes.length) {
            // A target selector was requested but nothing matched it.
            return hasTargetSelector ? undefined : snapshot;
        }

        const textNodes = [];
        let rootDoc;
        const firstNode = allNodes[0];
        if (allNodes.length === 1 && firstNode.nodeName === '#document' && firstNode.documentElement) {
            rootDoc = firstNode;
            if (rootDoc.body?.innerText) {
                textNodes.push(rootDoc.body);
            }
        }
        else {
            // Move the selected elements into a fresh document, separated by blank lines.
            rootDoc = this.linkedom.parseHTML('<html><body></body></html>').window.document;
            for (const node of allNodes) {
                rootDoc.body.appendChild(node);
                rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
                if (node.innerText) {
                    textNodes.push(node);
                }
            }
        }

        const textChunks = textNodes.map((node) => {
            const clone = node.cloneNode(true);
            clone.querySelectorAll('script,style,link,svg').forEach((s) => s.remove());
            return clone.innerText;
        });

        let parsed;
        try {
            // Readability gets its own clone of the document to work on.
            parsed = new readability_1.Readability(rootDoc.cloneNode(true)).parse();
        }
        catch (err) {
            this.logger.warn(`Failed to parse selected element`, { err });
        }

        const baseUrl = snapshot.rebase || snapshot.href;
        const toAbsolute = (raw) => {
            if (!raw) {
                return undefined;
            }
            try {
                return new URL(raw, baseUrl).toString();
            }
            catch {
                // Unparsable image URLs are skipped.
                return undefined;
            }
        };
        const imgSet = new Set();
        const rebuiltImgs = [];
        rootDoc.querySelectorAll('img[src],img[data-src]').forEach((img) => {
            const fromSrc = toAbsolute(img.getAttribute('src'));
            const fromDataSrc = toAbsolute(img.getAttribute('data-src'));
            if (fromSrc) {
                imgSet.add(fromSrc);
            }
            if (fromDataSrc) {
                imgSet.add(fromDataSrc);
            }
            // When both resolve, the data-src URL is the one reported.
            const absUrl = fromDataSrc || fromSrc;
            if (absUrl) {
                rebuiltImgs.push({
                    src: absUrl,
                    alt: img.getAttribute('alt'),
                });
            }
        });

        const r = {
            ...snapshot,
            title: snapshot.title || doc.title,
            description: snapshot.description ||
                (doc.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? ''),
            parsed,
            html: rootDoc.documentElement.outerHTML,
            text: textChunks.join('\n'),
            // Keep only images that are still present in the narrowed document.
            imgs: (snapshot.imgs || rebuiltImgs).filter((x) => imgSet.has(x.src)),
        };
        const dt = Date.now() - t0;
        if (dt > 1000) {
            this.logger.warn(`Performance issue: Narrowing snapshot took ${dt}ms`, { url: snapshot.href, dt });
        }
        return r;
    }

    /**
     * Returns a shallow copy of the snapshot extended with `links`
     * (`[text, absoluteHref]` pairs) and `imgs` (`{ src, width, height, alt }`)
     * extracted from `snapshot.html`.
     *
     * Extraction is best effort: links or images whose URL cannot be resolved are
     * skipped individually, and a failure of the extraction as a whole is logged
     * and leaves the copy without the missing field(s).
     *
     * @param {object} snapshot - Page snapshot with `html` and `href`/`rebase`.
     * @returns {Promise<object>} The extended snapshot copy.
     */
    async inferSnapshot(snapshot) {
        const t0 = Date.now();
        const extendedSnapshot = { ...snapshot };
        const baseUrl = snapshot.rebase || snapshot.href;
        try {
            const doc = this.linkedom.parseHTML(snapshot.html).window.document;
            doc.querySelectorAll('svg').forEach((svg) => {
                svg.innerHTML = '';
            });

            extendedSnapshot.links = Array.from(doc.querySelectorAll('a[href]'))
                .map((anchor) => {
                    const text = anchor.textContent.replace(/\s+/g, ' ').trim();
                    const href = anchor.getAttribute('href');
                    if (!href) {
                        return undefined;
                    }
                    try {
                        return [text, new URL(href, baseUrl).toString()];
                    }
                    catch {
                        return undefined;
                    }
                })
                .filter(Boolean);

            extendedSnapshot.imgs = Array.from(doc.querySelectorAll('img[src],img[data-src]'))
                .map((img) => {
                    const src = img.getAttribute('src') || '';
                    const dataSrc = img.getAttribute('data-src') || '';
                    let preferredSrc = src;
                    if (!src) {
                        // Only data-src is available.
                        preferredSrc = dataSrc;
                    }
                    else if (src.startsWith('data:') && dataSrc && !dataSrc.startsWith('data:')) {
                        // Prefer a real URL in data-src over an inline data: URI in src.
                        preferredSrc = dataSrc;
                    }
                    if (!preferredSrc) {
                        // An empty URL would resolve to the page itself, which is not an image.
                        return undefined;
                    }
                    let absoluteSrc;
                    try {
                        absoluteSrc = new URL(preferredSrc, baseUrl).toString();
                    }
                    catch {
                        // One malformed image URL must not discard every other image.
                        return undefined;
                    }
                    return {
                        src: absoluteSrc,
                        width: Number.parseInt(img.getAttribute('width') || '0', 10),
                        height: Number.parseInt(img.getAttribute('height') || '0', 10),
                        alt: img.getAttribute('alt') || img.getAttribute('title'),
                    };
                })
                .filter(Boolean);
        }
        catch (err) {
            // Best effort: report the failure and return whatever was extracted so far.
            this.logger.warn(`Failed to infer links and images from snapshot`, { err, url: snapshot.href });
        }
        const dt = Date.now() - t0;
        if (dt > 1000) {
            this.logger.warn(`Performance issue: Inferring snapshot took ${dt}ms`, { url: snapshot.href, dt });
        }
        return extendedSnapshot;
    }

    /**
     * Collapses runs of blank lines: a blank line is kept only when the line
     * right before it has content. Line endings are normalised to `\n`.
     *
     * @param {string} text - Input text.
     * @returns {string} Text without redundant blank lines.
     */
    cleanRedundantEmptyLines(text) {
        const lines = text.split(/\r?\n/g);
        const hasContent = lines.map((line) => Boolean(line.trim()));
        return lines.filter((_line, i) => hasContent[i] || hasContent[i - 1]).join('\n');
    }

    /**
     * Strips HTML down for language-model consumption: removes the given selectors,
     * replaces inline `data:` image sources with `blob:opaque`, drops `data-src`/`srcset`,
     * Tailwind utility classes, inline styles (except ones hiding the element),
     * comments and every `data-*`/`aria-*` attribute.
     *
     * @param {string} sourceHTML - Markup to clean.
     * @param {...string} discardSelectors - CSS selectors whose matches are removed.
     * @returns {Promise<string>} The cleaned document's outer HTML.
     */
    async cleanHTMLforLMs(sourceHTML, ...discardSelectors) {
        const t0 = Date.now();
        const doc = this.parseWithFallback(sourceHTML).window.document;

        for (const selector of discardSelectors) {
            doc.querySelectorAll(selector).forEach((el) => el.remove());
        }

        doc.querySelectorAll('img[src],img[data-src]').forEach((img) => {
            const src = img.getAttribute('src') || img.getAttribute('data-src');
            if (src?.startsWith('data:')) {
                img.setAttribute('src', 'blob:opaque');
            }
            img.removeAttribute('data-src');
            img.removeAttribute('srcset');
        });

        doc.querySelectorAll('[class]').forEach((el) => {
            const classes = el.getAttribute('class')?.split(/\s+/g) || [];
            const keptClasses = classes.filter((c) => !tailwind_classes_1.tailwindClasses.has(c));
            el.setAttribute('class', keptClasses.join(' '));
        });

        // Matches a `display: none` declaration anywhere in an (already lower-cased)
        // style attribute, with or without whitespace around the colon.
        const hiddenDeclaration = /(?:^|;)\s*display\s*:\s*none/;
        doc.querySelectorAll('[style]').forEach((el) => {
            const style = el.getAttribute('style')?.toLocaleLowerCase() || '';
            if (hiddenDeclaration.test(style)) {
                // Keep the style so hidden elements stay recognisably hidden.
                return;
            }
            el.removeAttribute('style');
        });

        const treeWalker = doc.createTreeWalker(doc, // Start from the root document
        0x80 // Only show comment nodes
        );
        // Collect first, remove afterwards: detaching nodes while the walker is
        // still advancing would make the traversal depend on the walker implementation.
        const commentNodes = [];
        let currentNode;
        while ((currentNode = treeWalker.nextNode())) {
            commentNodes.push(currentNode);
        }
        for (const comment of commentNodes) {
            comment.parentNode?.removeChild(comment);
        }

        doc.querySelectorAll('*').forEach((el) => {
            for (const attr of el.getAttributeNames()) {
                if (attr.startsWith('data-') || attr.startsWith('aria-')) {
                    el.removeAttribute(attr);
                }
            }
        });

        const final = this.cleanRedundantEmptyLines(doc.documentElement.outerHTML);
        const dt = Date.now() - t0;
        if (dt > 1000) {
            this.logger.warn(`Performance issue: Cleaning HTML for LMs took ${dt}ms`, { dt });
        }
        return final;
    }

    /**
     * Parses an HTML snippet and returns its root element, patched for turndown:
     * every `<table>` gets a `rows` property listing its `<tr>` descendants, and the
     * root's `cloneNode` is overridden to return the element itself.
     *
     * @param {string} [snippet] - HTML snippet; an empty document is used when falsy.
     * @param {string} [url] - Not used by this method.
     * @returns {object} The parsed document's root element.
     */
    snippetToElement(snippet, url) {
        const parsed = this.linkedom.parseHTML(snippet || '<html><body></body></html>');
        const doc = parsed.window.document;
        // Hack for turndown gfm table plugin.
        doc.querySelectorAll('table').forEach((table) => {
            Object.defineProperty(table, 'rows', { value: Array.from(table.querySelectorAll('tr')), enumerable: true });
        });
        Object.defineProperty(doc.documentElement, 'cloneNode', {
            value: function () { return this; },
        });
        return doc.documentElement;
    }

    /**
     * Runs `turndownService.turndown(html)` and logs a warning when the call
     * takes longer than one second, whether it returns or throws.
     *
     * @param {object} turndownService - Object exposing `turndown(html)`.
     * @param {*} html - Input forwarded to `turndown`.
     * @returns {*} Whatever `turndown` returns.
     */
    runTurndown(turndownService, html) {
        const t0 = Date.now();
        try {
            return turndownService.turndown(html);
        }
        finally {
            const dt = Date.now() - t0;
            if (dt > 1000) {
                this.logger.warn(`Performance issue: Turndown took ${dt}ms`, { dt });
            }
        }
    }

    /**
     * Lightweight text analysis: removes script/style/link/svg elements and reports
     * the document title, the body text, and the token count of that text with
     * whitespace runs collapsed to single spaces.
     *
     * @param {string} sourceHTML - Markup to analyse.
     * @returns {Promise<{title: string, text: string, tokens: *}>} Analysis result;
     *   `tokens` is whatever `countGPTToken` returns.
     */
    async analyzeHTMLTextLite(sourceHTML) {
        const doc = this.parseWithFallback(sourceHTML).window.document;
        doc.querySelectorAll('script,style,link,svg').forEach((s) => s.remove());
        const text = doc.body.innerText || '';
        return {
            title: doc.title,
            text,
            tokens: (0, openai_1.countGPTToken)(text.replaceAll(/[\s\r\n\t]+/g, ' ')),
        };
    }
};
| exports.JSDomControl = JSDomControl; | |
| __decorate([ | |
| (0, threaded_1.Threaded)(), | |
| __metadata("design:type", Function), | |
| __metadata("design:paramtypes", [Object, Object]), | |
| __metadata("design:returntype", Promise) | |
| ], JSDomControl.prototype, "actualNarrowSnapshot", null); | |
| __decorate([ | |
| (0, threaded_1.Threaded)(), | |
| __metadata("design:type", Function), | |
| __metadata("design:paramtypes", [Object]), | |
| __metadata("design:returntype", Promise) | |
| ], JSDomControl.prototype, "inferSnapshot", null); | |
| __decorate([ | |
| (0, threaded_1.Threaded)(), | |
| __metadata("design:type", Function), | |
| __metadata("design:paramtypes", [String, String]), | |
| __metadata("design:returntype", Promise) | |
| ], JSDomControl.prototype, "cleanHTMLforLMs", null); | |
| __decorate([ | |
| (0, threaded_1.Threaded)(), | |
| __metadata("design:type", Function), | |
| __metadata("design:paramtypes", [String]), | |
| __metadata("design:returntype", Promise) | |
| ], JSDomControl.prototype, "analyzeHTMLTextLite", null); | |
| exports.JSDomControl = JSDomControl = __decorate([ | |
| (0, tsyringe_1.singleton)(), | |
| __metadata("design:paramtypes", [logger_1.GlobalLogger]) | |
| ], JSDomControl); | |
| const jsdomControl = tsyringe_1.container.resolve(JSDomControl); | |
| exports.default = jsdomControl; | |
| //# sourceMappingURL=jsdom.js.map |