Spaces:
Build error
Build error
fix: jsdom, cache tolerance, screenshot pricing
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -24,6 +24,7 @@ import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-a
|
|
| 24 |
import { PDFExtractor } from '../services/pdf-extract';
|
| 25 |
import { DomainBlockade } from '../db/domain-blockade';
|
| 26 |
import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
|
|
|
|
| 27 |
|
| 28 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 29 |
|
|
@@ -74,6 +75,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 74 |
constructor(
|
| 75 |
protected globalLogger: Logger,
|
| 76 |
protected puppeteerControl: PuppeteerControl,
|
|
|
|
| 77 |
protected altTextService: AltTextService,
|
| 78 |
protected pdfExtractor: PDFExtractor,
|
| 79 |
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
|
@@ -247,7 +249,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 247 |
}
|
| 248 |
|
| 249 |
getGeneralSnapshotMixins(snapshot: PageSnapshot) {
|
| 250 |
-
const inferred = this.
|
| 251 |
const mixin: any = {};
|
| 252 |
if (this.threadLocal.get('withImagesSummary')) {
|
| 253 |
const imageSummary = {} as { [k: string]: string; };
|
|
@@ -296,6 +298,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 296 |
|
| 297 |
return {
|
| 298 |
...this.getGeneralSnapshotMixins(snapshot),
|
|
|
|
| 299 |
screenshotUrl: snapshot.screenshotUrl,
|
| 300 |
toString() {
|
| 301 |
return this.screenshotUrl;
|
|
@@ -353,16 +356,20 @@ export class CrawlerHost extends RPCHost {
|
|
| 353 |
break;
|
| 354 |
}
|
| 355 |
|
| 356 |
-
|
|
|
|
| 357 |
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
|
| 358 |
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
| 359 |
-
const
|
| 360 |
-
const
|
|
|
|
| 361 |
|
| 362 |
// If Readability did its job
|
| 363 |
if (par2.length >= 0.3 * par1.length) {
|
| 364 |
turnDownService = this.getTurndown({ noRules: true, url: snapshot.href, imgDataUrlToObjectUrl });
|
| 365 |
-
|
|
|
|
|
|
|
| 366 |
}
|
| 367 |
}
|
| 368 |
|
|
@@ -453,7 +460,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 453 |
|
| 454 |
if (
|
| 455 |
!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
|
| 456 |
-
&& toBeTurnedToMd !==
|
| 457 |
) {
|
| 458 |
try {
|
| 459 |
contentText = turnDownService.turndown(snapshot.html);
|
|
@@ -533,7 +540,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 533 |
.value();
|
| 534 |
}
|
| 535 |
if (this.threadLocal.get('withLinksSummary')) {
|
| 536 |
-
formatted.links = _.invert(this.
|
| 537 |
}
|
| 538 |
|
| 539 |
return formatted as FormattedPage;
|
|
@@ -890,19 +897,19 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 890 |
text: '',
|
| 891 |
} as PageSnapshot;
|
| 892 |
|
| 893 |
-
yield this.
|
| 894 |
|
| 895 |
return;
|
| 896 |
}
|
| 897 |
let cache;
|
| 898 |
|
| 899 |
-
const cacheTolerance = crawlerOpts?.cacheTolerance
|
| 900 |
if (cacheTolerance && !crawlOpts?.cookies?.length) {
|
| 901 |
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
| 902 |
}
|
| 903 |
|
| 904 |
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
|
| 905 |
-
yield this.
|
| 906 |
|
| 907 |
return;
|
| 908 |
}
|
|
@@ -910,7 +917,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 910 |
try {
|
| 911 |
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe) {
|
| 912 |
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
|
| 913 |
-
yield this.
|
| 914 |
}
|
| 915 |
|
| 916 |
return;
|
|
@@ -920,7 +927,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 920 |
} catch (err: any) {
|
| 921 |
if (cache && !(err instanceof SecurityCompromiseError)) {
|
| 922 |
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
|
| 923 |
-
yield this.
|
| 924 |
return;
|
| 925 |
}
|
| 926 |
throw err;
|
|
@@ -1051,5 +1058,4 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 1051 |
|
| 1052 |
return this.formatSnapshot(mode, lastSnapshot, url);
|
| 1053 |
}
|
| 1054 |
-
|
| 1055 |
}
|
|
|
|
| 24 |
import { PDFExtractor } from '../services/pdf-extract';
|
| 25 |
import { DomainBlockade } from '../db/domain-blockade';
|
| 26 |
import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
|
| 27 |
+
import { JSDomControl } from '../services/jsdom';
|
| 28 |
|
| 29 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 30 |
|
|
|
|
| 75 |
constructor(
|
| 76 |
protected globalLogger: Logger,
|
| 77 |
protected puppeteerControl: PuppeteerControl,
|
| 78 |
+
protected jsdomControl: JSDomControl,
|
| 79 |
protected altTextService: AltTextService,
|
| 80 |
protected pdfExtractor: PDFExtractor,
|
| 81 |
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
|
|
|
| 249 |
}
|
| 250 |
|
| 251 |
getGeneralSnapshotMixins(snapshot: PageSnapshot) {
|
| 252 |
+
const inferred = this.jsdomControl.inferSnapshot(snapshot);
|
| 253 |
const mixin: any = {};
|
| 254 |
if (this.threadLocal.get('withImagesSummary')) {
|
| 255 |
const imageSummary = {} as { [k: string]: string; };
|
|
|
|
| 298 |
|
| 299 |
return {
|
| 300 |
...this.getGeneralSnapshotMixins(snapshot),
|
| 301 |
+
html: snapshot.html,
|
| 302 |
screenshotUrl: snapshot.screenshotUrl,
|
| 303 |
toString() {
|
| 304 |
return this.screenshotUrl;
|
|
|
|
| 356 |
break;
|
| 357 |
}
|
| 358 |
|
| 359 |
+
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
|
| 360 |
+
let toBeTurnedToMd = jsDomElementOfHTML;
|
| 361 |
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
|
| 362 |
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
| 363 |
+
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
| 364 |
+
const par1 = turnDownService.turndown(jsDomElementOfHTML);
|
| 365 |
+
const par2 = snapshot.parsed.content ? turnDownService.turndown(jsDomElementOfParsed) : '';
|
| 366 |
|
| 367 |
// If Readability did its job
|
| 368 |
if (par2.length >= 0.3 * par1.length) {
|
| 369 |
turnDownService = this.getTurndown({ noRules: true, url: snapshot.href, imgDataUrlToObjectUrl });
|
| 370 |
+
if (snapshot.parsed.content) {
|
| 371 |
+
toBeTurnedToMd = jsDomElementOfParsed;
|
| 372 |
+
}
|
| 373 |
}
|
| 374 |
}
|
| 375 |
|
|
|
|
| 460 |
|
| 461 |
if (
|
| 462 |
!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
|
| 463 |
+
&& toBeTurnedToMd !== jsDomElementOfHTML
|
| 464 |
) {
|
| 465 |
try {
|
| 466 |
contentText = turnDownService.turndown(snapshot.html);
|
|
|
|
| 540 |
.value();
|
| 541 |
}
|
| 542 |
if (this.threadLocal.get('withLinksSummary')) {
|
| 543 |
+
formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {});
|
| 544 |
}
|
| 545 |
|
| 546 |
return formatted as FormattedPage;
|
|
|
|
| 897 |
text: '',
|
| 898 |
} as PageSnapshot;
|
| 899 |
|
| 900 |
+
yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
|
| 901 |
|
| 902 |
return;
|
| 903 |
}
|
| 904 |
let cache;
|
| 905 |
|
| 906 |
+
const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
|
| 907 |
if (cacheTolerance && !crawlOpts?.cookies?.length) {
|
| 908 |
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
| 909 |
}
|
| 910 |
|
| 911 |
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
|
| 912 |
+
yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
|
| 913 |
|
| 914 |
return;
|
| 915 |
}
|
|
|
|
| 917 |
try {
|
| 918 |
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe) {
|
| 919 |
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
|
| 920 |
+
yield this.jsdomControl.narrowSnapshot(x, crawlOpts);
|
| 921 |
}
|
| 922 |
|
| 923 |
return;
|
|
|
|
| 927 |
} catch (err: any) {
|
| 928 |
if (cache && !(err instanceof SecurityCompromiseError)) {
|
| 929 |
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
|
| 930 |
+
yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
|
| 931 |
return;
|
| 932 |
}
|
| 933 |
throw err;
|
|
|
|
| 1058 |
|
| 1059 |
return this.formatSnapshot(mode, lastSnapshot, url);
|
| 1060 |
}
|
|
|
|
| 1061 |
}
|
backend/functions/src/cloud-functions/searcher.ts
CHANGED
|
@@ -173,7 +173,7 @@ export class SearcherHost extends RPCHost {
|
|
| 173 |
}
|
| 174 |
|
| 175 |
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
|
| 176 |
-
{ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance
|
| 177 |
count,
|
| 178 |
);
|
| 179 |
|
|
|
|
| 173 |
}
|
| 174 |
|
| 175 |
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
|
| 176 |
+
{ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs },
|
| 177 |
count,
|
| 178 |
);
|
| 179 |
|
backend/functions/src/services/jsdom.ts
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { container, singleton } from 'tsyringe';
|
| 2 |
+
import { AsyncService, marshalErrorLike } from 'civkit';
|
| 3 |
+
import { Logger } from '../shared/services/logger';
|
| 4 |
+
import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
|
| 5 |
+
import { JSDOM, VirtualConsole } from 'jsdom';
|
| 6 |
+
import { Readability } from '@mozilla/readability';
|
| 7 |
+
|
| 8 |
+
// Module-wide jsdom VirtualConsole handed to every JSDOM instance created in this file.
// Its 'error' events are received by a listener that does nothing, so they are dropped.
const virtualConsole = new VirtualConsole();
virtualConsole.on('error', () => undefined);
|
| 10 |
+
|
| 11 |
+
/**
 * jsdom-backed helpers that post-process page snapshots without a live browser page:
 * narrowing a snapshot to selected elements, inferring links/images from its HTML,
 * and turning an HTML snippet into a DOM element.
 */
@singleton()
export class JSDomControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

    /**
     * Normalizes a `string | string[]` selector option into a list.
     * An empty/absent string yields an empty list; arrays are returned as given.
     */
    protected toSelectorList(selector?: string | string[]): string[] {
        if (Array.isArray(selector)) {
            return selector;
        }

        return selector ? [selector] : [];
    }

    /**
     * Replaces the content of each `iframe[src]` / `frame[src]` in `doc` with the HTML of the
     * matching entry in `snapshot.childFrames`, strips `script`/`style` elements from it and
     * rewrites nested `src`/`href` attributes to absolute URLs based on the frame URL.
     */
    protected inlineChildFrames(doc: Document, snapshot: PageSnapshot): void {
        doc.querySelectorAll('iframe[src],frame[src]').forEach((frame) => {
            const src = frame.getAttribute('src');
            if (!src) {
                return;
            }

            // The attribute may be relative, in which case it is not a valid URL base on its own.
            // Resolve it against the page URL, falling back to the raw value.
            let frameUrl = src;
            try {
                frameUrl = new URL(src, snapshot.href).toString();
            } catch (_err) {
                // keep the raw attribute value
            }

            const thisSnapshot = snapshot.childFrames?.find((f) => f.href === src || f.href === frameUrl);
            if (!thisSnapshot?.html) {
                return;
            }

            frame.innerHTML = thisSnapshot.html;
            frame.querySelectorAll('script, style').forEach((s) => s.remove());
            for (const attr of ['src', 'href']) {
                frame.querySelectorAll(`[${attr}]`).forEach((el) => {
                    try {
                        el.setAttribute(attr, new URL(el.getAttribute(attr)!, frameUrl).toString());
                    } catch (_err) {
                        // A malformed URL must not abort the whole narrowing; leave the attribute untouched.
                    }
                });
            }
        });
    }

    /**
     * Produces a snapshot restricted to the requested parts of the page.
     *
     * Returns the input unchanged when it already has `parsed` and no option is set,
     * when it has no `html`, or when `targetSelector` matches nothing.
     * Otherwise returns a copy whose `html`, `text`, `parsed`, `title` and `imgs`
     * are recomputed from the selected nodes.
     *
     * @param snapshot - Snapshot to narrow; `undefined` is passed through.
     * @param options.targetSelector - CSS selector(s) of elements to keep.
     * @param options.removeSelector - CSS selector(s) of elements to drop first.
     * @param options.withIframe - Inline child frame HTML into frame elements before selecting.
     * @throws Whatever `querySelectorAll` throws for an invalid selector.
     */
    narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
        targetSelector?: string | string[];
        removeSelector?: string | string[];
        withIframe?: boolean;
    }): PageSnapshot | undefined {
        if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe) {
            return snapshot;
        }
        if (!snapshot?.html) {
            return snapshot;
        }

        const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
        const doc = jsdom.window.document;

        if (options?.withIframe) {
            this.inlineChildFrames(doc, snapshot);
        }

        for (const selector of this.toSelectorList(options?.removeSelector)) {
            doc.querySelectorAll(selector).forEach((el) => el.remove());
        }

        // A Set keeps first-seen order while de-duplicating elements matched by several selectors.
        const selected = new Set<Node>();
        if (options?.targetSelector) {
            for (const selector of this.toSelectorList(options.targetSelector)) {
                doc.querySelectorAll(selector).forEach((el) => selected.add(el));
            }
        } else {
            selected.add(doc);
        }
        const allNodes = Array.from(selected);

        if (!allNodes.length) {
            return snapshot;
        }

        const textChunks: string[] = [];
        let rootDoc: Document;
        if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
            rootDoc = allNodes[0] as Document;
            if (rootDoc.body.textContent) {
                textChunks.push(rootDoc.body.textContent);
            }
        } else {
            // Move the selected elements into a fresh, empty document, separated by blank lines.
            rootDoc = new JSDOM('', { url: snapshot.href, virtualConsole }).window.document;
            for (const n of allNodes) {
                rootDoc.body.appendChild(n);
                rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
                if (n.textContent) {
                    textChunks.push(n.textContent);
                }
            }
        }

        let parsed;
        try {
            // Readability runs on a clone so rootDoc itself is left untouched for the outputs below.
            parsed = new Readability(rootDoc.cloneNode(true) as any).parse();
        } catch (err: any) {
            this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
        }

        // No innerText in jsdom
        // https://github.com/jsdom/jsdom/issues/1245
        const cleanedText = textChunks.join('\n\n')
            .split('\n')
            .map((line) => line.trimEnd())
            .join('\n')
            .replace(/\n{3,}/g, '\n\n');

        // Absolute URLs of every image still present, used to filter snapshot.imgs.
        const imageSet = new Set<string>();
        rootDoc.querySelectorAll('img[src],img[data-src]').forEach((img) => {
            for (const raw of [img.getAttribute('src'), img.getAttribute('data-src')]) {
                if (raw === null) {
                    // Absent attribute: passing null to URL would stringify it into "<base>/null".
                    continue;
                }
                try {
                    imageSet.add(new URL(raw, snapshot.href).toString());
                } catch (_err) {
                    // unresolvable URL, ignore
                }
            }
        });

        const r = {
            ...snapshot,
            title: snapshot.title || doc.title,
            parsed,
            html: rootDoc.documentElement.outerHTML,
            text: cleanedText,
            imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
        } as PageSnapshot;

        return r;
    }

    /**
     * Returns a copy of `snapshot` extended with `links` (absolute URL -> anchor text) and
     * `imgs` derived from `snapshot.html`.
     *
     * Anchors without text and `file:` / `javascript:` links are skipped; links and images
     * whose URL cannot be resolved are skipped individually. If the HTML cannot be processed
     * at all, a warning is logged and the plain copy is returned.
     */
    inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
        const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
        try {
            const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
            const doc = jsdom.window.document;

            const links: { [k: string]: string; } = {};
            doc.querySelectorAll('a[href]').forEach((anchor) => {
                const text = (anchor.textContent || '').replace(/\s+/g, ' ').trim();
                if (!text) {
                    return;
                }
                try {
                    const parsed = new URL(anchor.getAttribute('href')!, snapshot.href);
                    if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') {
                        return;
                    }
                    // Later anchors with the same URL overwrite earlier ones.
                    links[parsed.toString()] = text;
                } catch (_err) {
                    // unresolvable href, ignore this link
                }
            });

            extendedSnapshot.links = links;

            const imgs: { src: string; width: number; height: number; alt: string | null; }[] = [];
            doc.querySelectorAll('img[src],img[data-src]').forEach((img) => {
                const dataSrc = img.getAttribute('data-src') || '';
                // Use data-src when src is missing/empty, or when src is an inline data: URL
                // and data-src points at a real resource.
                let linkPreferredSrc = img.getAttribute('src') || dataSrc;
                if (linkPreferredSrc.startsWith('data:') && dataSrc && !dataSrc.startsWith('data:')) {
                    linkPreferredSrc = dataSrc;
                }

                let resolvedSrc: string;
                try {
                    resolvedSrc = new URL(linkPreferredSrc, snapshot.href).toString();
                } catch (_err) {
                    // One bad image URL must not discard every other image.
                    return;
                }

                imgs.push({
                    src: resolvedSrc,
                    width: parseInt(img.getAttribute('width') || '0', 10),
                    height: parseInt(img.getAttribute('height') || '0', 10),
                    alt: img.getAttribute('alt') || img.getAttribute('title'),
                });
            });

            extendedSnapshot.imgs = imgs as any;
        } catch (err: any) {
            // Best effort: keep whatever was inferred so far, but leave a trace.
            this.logger.warn(`Failed to infer links and images from snapshot`, { err: marshalErrorLike(err) });
        }

        return extendedSnapshot;
    }

    /**
     * Parses an HTML snippet with jsdom and returns the root (`<html>`) element of the
     * resulting document. A missing snippet is parsed as an empty string.
     *
     * @param snippet - HTML source to parse.
     * @param url - Passed to jsdom as the document URL.
     */
    snippetToElement(snippet?: string, url?: string) {
        const parsed = new JSDOM(snippet || '', { url, virtualConsole });

        return parsed.window.document.documentElement;
    }
}
|
| 203 |
+
|
| 204 |
+
// Resolve the JSDomControl instance from the tsyringe container and expose it as the module default.
const jsdomControl: JSDomControl = container.resolve(JSDomControl);

export default jsdomControl;
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -3,7 +3,6 @@ import fs from 'fs';
|
|
| 3 |
import { container, singleton } from 'tsyringe';
|
| 4 |
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
|
| 5 |
import { Logger } from '../shared/services/logger';
|
| 6 |
-
import { JSDOM, VirtualConsole } from 'jsdom';
|
| 7 |
|
| 8 |
import type { Browser, CookieParam, Page } from 'puppeteer';
|
| 9 |
import puppeteer from 'puppeteer-extra';
|
|
@@ -11,16 +10,12 @@ import puppeteer from 'puppeteer-extra';
|
|
| 11 |
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
| 12 |
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
| 13 |
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
|
| 14 |
-
import { Readability } from '@mozilla/readability';
|
| 15 |
import { TimeoutError } from 'puppeteer';
|
| 16 |
const tldExtract = require('tld-extract');
|
| 17 |
|
| 18 |
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
| 19 |
|
| 20 |
|
| 21 |
-
const virtualConsole = new VirtualConsole();
|
| 22 |
-
virtualConsole.on('error', () => void 0);
|
| 23 |
-
|
| 24 |
export interface ImgBrief {
|
| 25 |
src: string;
|
| 26 |
loaded?: boolean;
|
|
@@ -685,175 +680,6 @@ document.addEventListener('load', handlePageLoad);
|
|
| 685 |
return r.filter(Boolean);
|
| 686 |
}
|
| 687 |
|
| 688 |
-
narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
|
| 689 |
-
targetSelector?: string | string[];
|
| 690 |
-
removeSelector?: string | string[];
|
| 691 |
-
withIframe?: boolean;
|
| 692 |
-
}): PageSnapshot | undefined {
|
| 693 |
-
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector) {
|
| 694 |
-
return snapshot;
|
| 695 |
-
}
|
| 696 |
-
if (!snapshot?.html) {
|
| 697 |
-
return snapshot;
|
| 698 |
-
}
|
| 699 |
-
|
| 700 |
-
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
| 701 |
-
const allNodes: Node[] = [];
|
| 702 |
-
if (options?.withIframe) {
|
| 703 |
-
jsdom.window.document.querySelectorAll('iframe[src]').forEach((x) => {
|
| 704 |
-
const src = x.getAttribute('src');
|
| 705 |
-
const thisSnapshot = snapshot.childFrames?.find((f) => f.href === src);
|
| 706 |
-
if (thisSnapshot?.html) {
|
| 707 |
-
x.innerHTML = thisSnapshot.html;
|
| 708 |
-
x.querySelectorAll('script, style').forEach((s) => s.remove());
|
| 709 |
-
x.querySelectorAll('[src]').forEach((el) => {
|
| 710 |
-
el.setAttribute('src', new URL(el.getAttribute('src')!, src!).toString());
|
| 711 |
-
});
|
| 712 |
-
x.querySelectorAll('[href]').forEach((el) => {
|
| 713 |
-
el.setAttribute('href', new URL(el.getAttribute('href')!, src!).toString());
|
| 714 |
-
});
|
| 715 |
-
}
|
| 716 |
-
});
|
| 717 |
-
}
|
| 718 |
-
|
| 719 |
-
if (Array.isArray(options?.removeSelector)) {
|
| 720 |
-
for (const rl of options!.removeSelector) {
|
| 721 |
-
jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
|
| 722 |
-
}
|
| 723 |
-
} else if (options?.removeSelector) {
|
| 724 |
-
jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
|
| 725 |
-
}
|
| 726 |
-
|
| 727 |
-
if (Array.isArray(options?.targetSelector)) {
|
| 728 |
-
for (const x of options!.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
|
| 729 |
-
x.forEach((el) => {
|
| 730 |
-
if (!allNodes.includes(el)) {
|
| 731 |
-
allNodes.push(el);
|
| 732 |
-
}
|
| 733 |
-
});
|
| 734 |
-
}
|
| 735 |
-
} else if (options?.targetSelector) {
|
| 736 |
-
jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
|
| 737 |
-
if (!allNodes.includes(el)) {
|
| 738 |
-
allNodes.push(el);
|
| 739 |
-
}
|
| 740 |
-
});
|
| 741 |
-
} else {
|
| 742 |
-
allNodes.push(jsdom.window.document);
|
| 743 |
-
}
|
| 744 |
-
|
| 745 |
-
if (!allNodes.length) {
|
| 746 |
-
return snapshot;
|
| 747 |
-
}
|
| 748 |
-
const textChunks: string[] = [];
|
| 749 |
-
let rootDoc: Document;
|
| 750 |
-
if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
|
| 751 |
-
rootDoc = allNodes[0] as any;
|
| 752 |
-
if (rootDoc.body.textContent) {
|
| 753 |
-
textChunks.push(rootDoc.body.textContent);
|
| 754 |
-
}
|
| 755 |
-
} else {
|
| 756 |
-
rootDoc = new JSDOM('', { url: snapshot.href, virtualConsole }).window.document;
|
| 757 |
-
for (const n of allNodes) {
|
| 758 |
-
rootDoc.body.appendChild(n);
|
| 759 |
-
rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
|
| 760 |
-
if (n.textContent) {
|
| 761 |
-
textChunks.push(n.textContent);
|
| 762 |
-
}
|
| 763 |
-
}
|
| 764 |
-
}
|
| 765 |
-
|
| 766 |
-
let parsed;
|
| 767 |
-
try {
|
| 768 |
-
parsed = new Readability(rootDoc.cloneNode(true) as any).parse();
|
| 769 |
-
} catch (err: any) {
|
| 770 |
-
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
|
| 771 |
-
}
|
| 772 |
-
|
| 773 |
-
// No innerText in jsdom
|
| 774 |
-
// https://github.com/jsdom/jsdom/issues/1245
|
| 775 |
-
const textContent = textChunks.join('\n\n');
|
| 776 |
-
const cleanedText = textContent?.split('\n').map((x: any) => x.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n');
|
| 777 |
-
|
| 778 |
-
const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
|
| 779 |
-
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
|
| 780 |
-
.flat()
|
| 781 |
-
.map((x) => {
|
| 782 |
-
try {
|
| 783 |
-
return new URL(x, snapshot.href).toString();
|
| 784 |
-
} catch (err) {
|
| 785 |
-
return null;
|
| 786 |
-
}
|
| 787 |
-
})
|
| 788 |
-
.filter(Boolean);
|
| 789 |
-
|
| 790 |
-
const imageSet = new Set(imageTags);
|
| 791 |
-
|
| 792 |
-
const r = {
|
| 793 |
-
...snapshot,
|
| 794 |
-
title: snapshot.title || jsdom.window.document.title,
|
| 795 |
-
parsed,
|
| 796 |
-
html: rootDoc.documentElement.outerHTML,
|
| 797 |
-
text: cleanedText,
|
| 798 |
-
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
|
| 799 |
-
} as PageSnapshot;
|
| 800 |
-
|
| 801 |
-
return r;
|
| 802 |
-
}
|
| 803 |
-
|
| 804 |
-
inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
|
| 805 |
-
const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
|
| 806 |
-
try {
|
| 807 |
-
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
| 808 |
-
const links = Array.from(jsdom.window.document.querySelectorAll('a[href]'))
|
| 809 |
-
.map((x: any) => [x.getAttribute('href'), x.textContent.replace(/\s+/g, ' ').trim()])
|
| 810 |
-
.map(([href, text]) => {
|
| 811 |
-
if (!text) {
|
| 812 |
-
return undefined;
|
| 813 |
-
}
|
| 814 |
-
try {
|
| 815 |
-
const parsed = new URL(href, snapshot.href);
|
| 816 |
-
if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') {
|
| 817 |
-
return undefined;
|
| 818 |
-
}
|
| 819 |
-
return [parsed.toString(), text] as const;
|
| 820 |
-
} catch (err) {
|
| 821 |
-
return undefined;
|
| 822 |
-
}
|
| 823 |
-
})
|
| 824 |
-
.filter(Boolean)
|
| 825 |
-
.reduce((acc, pair) => {
|
| 826 |
-
acc[pair![0]] = pair![1];
|
| 827 |
-
return acc;
|
| 828 |
-
}, {} as { [k: string]: string; });
|
| 829 |
-
|
| 830 |
-
extendedSnapshot.links = links;
|
| 831 |
-
|
| 832 |
-
const imgs = Array.from(jsdom.window.document.querySelectorAll('img[src],img[data-src]'))
|
| 833 |
-
.map((x: any) => {
|
| 834 |
-
let linkPreferredSrc = x.getAttribute('src') || '';
|
| 835 |
-
if (linkPreferredSrc.startsWith('data:')) {
|
| 836 |
-
const dataSrc = x.getAttribute('data-src') || '';
|
| 837 |
-
if (dataSrc && !dataSrc.startsWith('data:')) {
|
| 838 |
-
linkPreferredSrc = dataSrc;
|
| 839 |
-
}
|
| 840 |
-
}
|
| 841 |
-
|
| 842 |
-
return {
|
| 843 |
-
src: new URL(linkPreferredSrc, snapshot.href).toString(),
|
| 844 |
-
width: parseInt(x.getAttribute('width') || '0'),
|
| 845 |
-
height: parseInt(x.getAttribute('height') || '0'),
|
| 846 |
-
alt: x.getAttribute('alt') || x.getAttribute('title'),
|
| 847 |
-
};
|
| 848 |
-
});
|
| 849 |
-
|
| 850 |
-
extendedSnapshot.imgs = imgs as any;
|
| 851 |
-
} catch (_err) {
|
| 852 |
-
void 0;
|
| 853 |
-
}
|
| 854 |
-
|
| 855 |
-
return extendedSnapshot;
|
| 856 |
-
}
|
| 857 |
}
|
| 858 |
|
| 859 |
const puppeteerControl = container.resolve(PuppeteerControl);
|
|
|
|
| 3 |
import { container, singleton } from 'tsyringe';
|
| 4 |
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
|
| 5 |
import { Logger } from '../shared/services/logger';
|
|
|
|
| 6 |
|
| 7 |
import type { Browser, CookieParam, Page } from 'puppeteer';
|
| 8 |
import puppeteer from 'puppeteer-extra';
|
|
|
|
| 10 |
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
| 11 |
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
| 12 |
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
|
|
|
|
| 13 |
import { TimeoutError } from 'puppeteer';
|
| 14 |
const tldExtract = require('tld-extract');
|
| 15 |
|
| 16 |
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
| 17 |
|
| 18 |
|
|
|
|
|
|
|
|
|
|
| 19 |
export interface ImgBrief {
|
| 20 |
src: string;
|
| 21 |
loaded?: boolean;
|
|
|
|
| 680 |
return r.filter(Boolean);
|
| 681 |
}
|
| 682 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 683 |
}
|
| 684 |
|
| 685 |
const puppeteerControl = container.resolve(PuppeteerControl);
|