"use strict"; var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) { var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d; if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc); else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r; return c > 3 && r && Object.defineProperty(target, key, r), r; }; var __metadata = (this && this.__metadata) || function (k, v) { if (typeof Reflect === "object" && typeof Reflect.metadata === "function") return Reflect.metadata(k, v); }; Object.defineProperty(exports, "__esModule", { value: true }); exports.JSDomControl = void 0; const tsyringe_1 = require("tsyringe"); const logger_1 = require("./logger"); const readability_1 = require("@mozilla/readability"); const threaded_1 = require("../services/threaded"); const tailwind_classes_1 = require("../utils/tailwind-classes"); const openai_1 = require("../shared/utils/openai"); const async_service_1 = require("civkit/async-service"); const civ_rpc_1 = require("civkit/civ-rpc"); const pLinkedom = import('linkedom'); let JSDomControl = class JSDomControl extends async_service_1.AsyncService { constructor(globalLogger) { super(...arguments); this.globalLogger = globalLogger; this.logger = this.globalLogger.child({ service: this.constructor.name }); } async init() { await this.dependencyReady(); this.linkedom = await pLinkedom; this.emit('ready'); } async narrowSnapshot(snapshot, options) { if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe && !options?.withShadowDom) { return snapshot; } if (!snapshot?.html) { return snapshot; } try { // SideLoad contains native objects that cannot go through thread boundaries. return await this.actualNarrowSnapshot(snapshot, { ...options, sideLoad: undefined }); } catch (err) { this.logger.warn(`Error narrowing snapshot`, { err }); if (err instanceof civ_rpc_1.ApplicationError) { throw err; } throw new civ_rpc_1.AssertionFailureError(`Failed to process the page: ${err?.message}`); } } async actualNarrowSnapshot(snapshot, options) { const t0 = Date.now(); let sourceHTML = snapshot.html; if (options?.withShadowDom && snapshot.shadowExpanded) { sourceHTML = snapshot.shadowExpanded; } let jsdom = this.linkedom.parseHTML(sourceHTML); if (!jsdom.window.document.documentElement) { jsdom = this.linkedom.parseHTML(`${sourceHTML}`); } const allNodes = []; jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = ''); if (options?.withIframe) { jsdom.window.document.querySelectorAll('iframe[src],frame[src]').forEach((x) => { const src = x.getAttribute('src'); const thisSnapshot = snapshot.childFrames?.find((f) => f.href === src); if (options?.withIframe === 'quoted') { const blockquoteElem = jsdom.window.document.createElement('blockquote'); const preElem = jsdom.window.document.createElement('pre'); preElem.innerHTML = thisSnapshot?.text || ''; blockquoteElem.appendChild(preElem); x.replaceWith(blockquoteElem); } else if (thisSnapshot?.html) { x.innerHTML = thisSnapshot.html; x.querySelectorAll('script, style').forEach((s) => s.remove()); if (src) { x.querySelectorAll('[src]').forEach((el) => { const imgSrc = el.getAttribute('src'); if (URL.canParse(imgSrc, src)) { el.setAttribute('src', new URL(imgSrc, src).toString()); } }); x.querySelectorAll('[href]').forEach((el) => { const linkHref = el.getAttribute('href'); if (URL.canParse(linkHref, src)) { el.setAttribute('href', new URL(linkHref, src).toString()); } }); } } }); } if (Array.isArray(options?.removeSelector)) { for (const rl of options.removeSelector) { jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove()); } } else if (options?.removeSelector) { jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove()); } let bewareTargetContentDoesNotExist = false; if (Array.isArray(options?.targetSelector)) { bewareTargetContentDoesNotExist = true; for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) { x.forEach((el) => { if (!allNodes.includes(el)) { allNodes.push(el); } }); } } else if (options?.targetSelector) { bewareTargetContentDoesNotExist = true; jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => { if (!allNodes.includes(el)) { allNodes.push(el); } }); } else { allNodes.push(jsdom.window.document); } if (!allNodes.length) { if (bewareTargetContentDoesNotExist) { return undefined; } return snapshot; } const textNodes = []; let rootDoc; if (allNodes.length === 1 && allNodes[0].nodeName === '#document' && allNodes[0].documentElement) { rootDoc = allNodes[0]; if (rootDoc.body?.innerText) { textNodes.push(rootDoc.body); } } else { rootDoc = this.linkedom.parseHTML('').window.document; for (const n of allNodes) { rootDoc.body.appendChild(n); rootDoc.body.appendChild(rootDoc.createTextNode('\n\n')); if (n.innerText) { textNodes.push(n); } } } const textChunks = textNodes.map((x) => { const clone = x.cloneNode(true); clone.querySelectorAll('script,style,link,svg').forEach((s) => s.remove()); return clone.innerText; }); let parsed; try { parsed = new readability_1.Readability(rootDoc.cloneNode(true)).parse(); } catch (err) { this.logger.warn(`Failed to parse selected element`, { err }); } const imgSet = new Set(); const rebuiltImgs = []; Array.from(rootDoc.querySelectorAll('img[src],img[data-src]')) .map((x) => [x.getAttribute('src'), x.getAttribute('data-src'), x.getAttribute('alt')]) .forEach(([u1, u2, alt]) => { let absUrl; if (u1) { try { const u1Txt = new URL(u1, snapshot.rebase || snapshot.href).toString(); imgSet.add(u1Txt); absUrl = u1Txt; } catch (err) { // void 0; } } if (u2) { try { const u2Txt = new URL(u2, snapshot.rebase || snapshot.href).toString(); imgSet.add(u2Txt); absUrl = u2Txt; } catch (err) { // void 0; } } if (absUrl) { rebuiltImgs.push({ src: absUrl, alt }); } }); const r = { ...snapshot, title: snapshot.title || jsdom.window.document.title, description: snapshot.description || (jsdom.window.document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? ''), parsed, html: rootDoc.documentElement.outerHTML, text: textChunks.join('\n'), imgs: (snapshot.imgs || rebuiltImgs)?.filter((x) => imgSet.has(x.src)) || [], }; const dt = Date.now() - t0; if (dt > 1000) { this.logger.warn(`Performance issue: Narrowing snapshot took ${dt}ms`, { url: snapshot.href, dt }); } return r; } async inferSnapshot(snapshot) { const t0 = Date.now(); const extendedSnapshot = { ...snapshot }; try { const jsdom = this.linkedom.parseHTML(snapshot.html); jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = ''); const links = Array.from(jsdom.window.document.querySelectorAll('a[href]')) .map((x) => [x.textContent.replace(/\s+/g, ' ').trim(), x.getAttribute('href'),]) .map(([text, href]) => { if (!href) { return undefined; } try { const parsed = new URL(href, snapshot.rebase || snapshot.href); return [text, parsed.toString()]; } catch (err) { return undefined; } }) .filter(Boolean); extendedSnapshot.links = links; const imgs = Array.from(jsdom.window.document.querySelectorAll('img[src],img[data-src]')) .map((x) => { let linkPreferredSrc = x.getAttribute('src') || ''; if (linkPreferredSrc.startsWith('data:')) { const dataSrc = x.getAttribute('data-src') || ''; if (dataSrc && !dataSrc.startsWith('data:')) { linkPreferredSrc = dataSrc; } } return { src: new URL(linkPreferredSrc, snapshot.rebase || snapshot.href).toString(), width: parseInt(x.getAttribute('width') || '0'), height: parseInt(x.getAttribute('height') || '0'), alt: x.getAttribute('alt') || x.getAttribute('title'), }; }); extendedSnapshot.imgs = imgs; } catch (_err) { void 0; } const dt = Date.now() - t0; if (dt > 1000) { this.logger.warn(`Performance issue: Inferring snapshot took ${dt}ms`, { url: snapshot.href, dt }); } return extendedSnapshot; } cleanRedundantEmptyLines(text) { const lines = text.split(/\r?\n/g); const mappedFlag = lines.map((line) => Boolean(line.trim())); return lines.filter((_line, i) => mappedFlag[i] || mappedFlag[i - 1]).join('\n'); } async cleanHTMLforLMs(sourceHTML, ...discardSelectors) { const t0 = Date.now(); let jsdom = this.linkedom.parseHTML(sourceHTML); if (!jsdom.window.document.documentElement) { jsdom = this.linkedom.parseHTML(`${sourceHTML}`); } for (const rl of discardSelectors) { jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove()); } jsdom.window.document.querySelectorAll('img[src],img[data-src]').forEach((x) => { const src = x.getAttribute('src') || x.getAttribute('data-src'); if (src?.startsWith('data:')) { x.setAttribute('src', 'blob:opaque'); } x.removeAttribute('data-src'); x.removeAttribute('srcset'); }); jsdom.window.document.querySelectorAll('[class]').forEach((x) => { const classes = x.getAttribute('class')?.split(/\s+/g) || []; const newClasses = classes.filter((c) => !tailwind_classes_1.tailwindClasses.has(c)); x.setAttribute('class', newClasses.join(' ')); }); jsdom.window.document.querySelectorAll('[style]').forEach((x) => { const style = x.getAttribute('style')?.toLocaleLowerCase() || ''; if (style.startsWith('display: none')) { return; } x.removeAttribute('style'); }); const treeWalker = jsdom.window.document.createTreeWalker(jsdom.window.document, // Start from the root document 0x80 // Only show comment nodes ); let currentNode; while ((currentNode = treeWalker.nextNode())) { currentNode.parentNode?.removeChild(currentNode); // Remove each comment node } jsdom.window.document.querySelectorAll('*').forEach((x) => { const attrs = x.getAttributeNames(); for (const attr of attrs) { if (attr.startsWith('data-') || attr.startsWith('aria-')) { x.removeAttribute(attr); } } }); const final = this.cleanRedundantEmptyLines(jsdom.window.document.documentElement.outerHTML); const dt = Date.now() - t0; if (dt > 1000) { this.logger.warn(`Performance issue: Cleaning HTML for LMs took ${dt}ms`, { dt }); } return final; } snippetToElement(snippet, url) { const parsed = this.linkedom.parseHTML(snippet || ''); // Hack for turndown gfm table plugin. parsed.window.document.querySelectorAll('table').forEach((x) => { Object.defineProperty(x, 'rows', { value: Array.from(x.querySelectorAll('tr')), enumerable: true }); }); Object.defineProperty(parsed.window.document.documentElement, 'cloneNode', { value: function () { return this; }, }); return parsed.window.document.documentElement; } runTurndown(turndownService, html) { const t0 = Date.now(); try { return turndownService.turndown(html); } finally { const dt = Date.now() - t0; if (dt > 1000) { this.logger.warn(`Performance issue: Turndown took ${dt}ms`, { dt }); } } } async analyzeHTMLTextLite(sourceHTML) { let jsdom = this.linkedom.parseHTML(sourceHTML); if (!jsdom.window.document.documentElement) { jsdom = this.linkedom.parseHTML(`${sourceHTML}`); } jsdom.window.document.querySelectorAll('script,style,link,svg').forEach((s) => s.remove()); const text = jsdom.window.document.body.innerText || ''; return { title: jsdom.window.document.title, text, tokens: (0, openai_1.countGPTToken)(text.replaceAll(/[\s\r\n\t]+/g, ' ')), }; } }; exports.JSDomControl = JSDomControl; __decorate([ (0, threaded_1.Threaded)(), __metadata("design:type", Function), __metadata("design:paramtypes", [Object, Object]), __metadata("design:returntype", Promise) ], JSDomControl.prototype, "actualNarrowSnapshot", null); __decorate([ (0, threaded_1.Threaded)(), __metadata("design:type", Function), __metadata("design:paramtypes", [Object]), __metadata("design:returntype", Promise) ], JSDomControl.prototype, "inferSnapshot", null); __decorate([ (0, threaded_1.Threaded)(), __metadata("design:type", Function), __metadata("design:paramtypes", [String, String]), __metadata("design:returntype", Promise) ], JSDomControl.prototype, "cleanHTMLforLMs", null); __decorate([ (0, threaded_1.Threaded)(), __metadata("design:type", Function), __metadata("design:paramtypes", [String]), __metadata("design:returntype", Promise) ], JSDomControl.prototype, "analyzeHTMLTextLite", null); exports.JSDomControl = JSDomControl = __decorate([ (0, tsyringe_1.singleton)(), __metadata("design:paramtypes", [logger_1.GlobalLogger]) ], JSDomControl); const jsdomControl = tsyringe_1.container.resolve(JSDomControl); exports.default = jsdomControl; //# sourceMappingURL=jsdom.js.map