Spaces:
Build error
Build error
feat: script injecting and tools
Browse files- backend/functions/src/cloud-functions/crawler.ts +35 -7
- backend/functions/src/cloud-functions/searcher.ts +2 -2
- backend/functions/src/dto/scrapping-options.ts +46 -4
- backend/functions/src/services/jsdom.ts +7 -1
- backend/functions/src/services/puppeteer.ts +209 -23
- backend/functions/src/services/snapshot-formatter.ts +1 -1
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -23,7 +23,7 @@ import { JSDomControl } from '../services/jsdom';
|
|
| 23 |
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
|
| 24 |
|
| 25 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 26 |
-
withIframe?: boolean;
|
| 27 |
withShadowDom?: boolean;
|
| 28 |
targetSelector?: string | string[];
|
| 29 |
removeSelector?: string | string[];
|
|
@@ -69,6 +69,10 @@ export class CrawlerHost extends RPCHost {
|
|
| 69 |
// Potential privacy issue, dont cache if cookies are used
|
| 70 |
return;
|
| 71 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
if (options.locale) {
|
| 73 |
Reflect.set(snapshot, 'locale', options.locale);
|
| 74 |
}
|
|
@@ -237,7 +241,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 237 |
throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
|
| 238 |
}
|
| 239 |
}
|
| 240 |
-
const crawlOpts = this.configure(crawlerOptions);
|
| 241 |
|
| 242 |
|
| 243 |
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
|
@@ -284,7 +288,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 284 |
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
| 285 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 286 |
|
| 287 |
-
if (crawlerOptions.
|
| 288 |
return formatted;
|
| 289 |
}
|
| 290 |
|
|
@@ -315,7 +319,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 315 |
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
| 316 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 317 |
|
| 318 |
-
if (crawlerOptions.
|
| 319 |
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 320 |
|
| 321 |
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
|
@@ -557,8 +561,8 @@ export class CrawlerHost extends RPCHost {
|
|
| 557 |
|
| 558 |
let cache;
|
| 559 |
|
| 560 |
-
|
| 561 |
-
|
| 562 |
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
| 563 |
}
|
| 564 |
|
|
@@ -665,7 +669,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 665 |
}
|
| 666 |
}
|
| 667 |
|
| 668 |
-
configure(opts: CrawlerOptions) {
|
| 669 |
|
| 670 |
this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt);
|
| 671 |
this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
|
|
@@ -697,6 +701,30 @@ export class CrawlerHost extends RPCHost {
|
|
| 697 |
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
|
| 698 |
}
|
| 699 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 700 |
return crawlOpts;
|
| 701 |
}
|
| 702 |
|
|
|
|
| 23 |
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
|
| 24 |
|
| 25 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 26 |
+
withIframe?: boolean | 'quoted';
|
| 27 |
withShadowDom?: boolean;
|
| 28 |
targetSelector?: string | string[];
|
| 29 |
removeSelector?: string | string[];
|
|
|
|
| 69 |
// Potential privacy issue, dont cache if cookies are used
|
| 70 |
return;
|
| 71 |
}
|
| 72 |
+
if (options.injectFrameScripts?.length || options.injectPageScripts?.length) {
|
| 73 |
+
// Potentially mangled content, don't cache if scripts are injected
|
| 74 |
+
return;
|
| 75 |
+
}
|
| 76 |
if (options.locale) {
|
| 77 |
Reflect.set(snapshot, 'locale', options.locale);
|
| 78 |
}
|
|
|
|
| 241 |
throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
|
| 242 |
}
|
| 243 |
}
|
| 244 |
+
const crawlOpts = await this.configure(crawlerOptions);
|
| 245 |
|
| 246 |
|
| 247 |
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
|
|
|
| 288 |
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
| 289 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 290 |
|
| 291 |
+
if (crawlerOptions.isEarlyReturnApplicable()) {
|
| 292 |
return formatted;
|
| 293 |
}
|
| 294 |
|
|
|
|
| 319 |
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
| 320 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 321 |
|
| 322 |
+
if (crawlerOptions.isEarlyReturnApplicable()) {
|
| 323 |
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 324 |
|
| 325 |
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
|
|
|
| 561 |
|
| 562 |
let cache;
|
| 563 |
|
| 564 |
+
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
|
| 565 |
+
const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
|
| 566 |
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
| 567 |
}
|
| 568 |
|
|
|
|
| 669 |
}
|
| 670 |
}
|
| 671 |
|
| 672 |
+
async configure(opts: CrawlerOptions) {
|
| 673 |
|
| 674 |
this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt);
|
| 675 |
this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
|
|
|
|
| 701 |
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
|
| 702 |
}
|
| 703 |
|
| 704 |
+
if (opts.injectFrameScript?.length) {
|
| 705 |
+
crawlOpts.injectFrameScripts = (await Promise.all(
|
| 706 |
+
opts.injectFrameScript.map((x) => {
|
| 707 |
+
if (URL.canParse(x)) {
|
| 708 |
+
return fetch(x).then((r) => r.text());
|
| 709 |
+
}
|
| 710 |
+
|
| 711 |
+
return x;
|
| 712 |
+
})
|
| 713 |
+
)).filter(Boolean);
|
| 714 |
+
}
|
| 715 |
+
|
| 716 |
+
if (opts.injectPageScript?.length) {
|
| 717 |
+
crawlOpts.injectPageScripts = (await Promise.all(
|
| 718 |
+
opts.injectPageScript.map((x) => {
|
| 719 |
+
if (URL.canParse(x)) {
|
| 720 |
+
return fetch(x).then((r) => r.text());
|
| 721 |
+
}
|
| 722 |
+
|
| 723 |
+
return x;
|
| 724 |
+
})
|
| 725 |
+
)).filter(Boolean);
|
| 726 |
+
}
|
| 727 |
+
|
| 728 |
return crawlOpts;
|
| 729 |
}
|
| 730 |
|
backend/functions/src/cloud-functions/searcher.ts
CHANGED
|
@@ -140,7 +140,7 @@ export class SearcherHost extends RPCHost {
|
|
| 140 |
|
| 141 |
delete crawlerOptions.html;
|
| 142 |
|
| 143 |
-
const crawlOpts = this.crawler.configure(crawlerOptions);
|
| 144 |
const searchQuery = braveSearchExplicitOperators.addTo(q || noSlashPath);
|
| 145 |
const r = await this.cachedWebSearch({
|
| 146 |
q: searchQuery,
|
|
@@ -156,7 +156,7 @@ export class SearcherHost extends RPCHost {
|
|
| 156 |
}
|
| 157 |
|
| 158 |
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
|
| 159 |
-
{ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs },
|
| 160 |
count,
|
| 161 |
);
|
| 162 |
|
|
|
|
| 140 |
|
| 141 |
delete crawlerOptions.html;
|
| 142 |
|
| 143 |
+
const crawlOpts = await this.crawler.configure(crawlerOptions);
|
| 144 |
const searchQuery = braveSearchExplicitOperators.addTo(q || noSlashPath);
|
| 145 |
const r = await this.cachedWebSearch({
|
| 146 |
q: searchQuery,
|
|
|
|
| 156 |
}
|
| 157 |
|
| 158 |
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
|
| 159 |
+
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
|
| 160 |
count,
|
| 161 |
);
|
| 162 |
|
backend/functions/src/dto/scrapping-options.ts
CHANGED
|
@@ -192,8 +192,9 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 192 |
|
| 193 |
@Prop({
|
| 194 |
default: false,
|
|
|
|
| 195 |
})
|
| 196 |
-
withIframe!: boolean;
|
| 197 |
|
| 198 |
@Prop({
|
| 199 |
default: false,
|
|
@@ -211,6 +212,16 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 211 |
@Prop()
|
| 212 |
userAgent?: string;
|
| 213 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
@Prop({
|
| 215 |
validate: (v: number) => v > 0 && v <= 180,
|
| 216 |
type: Number,
|
|
@@ -293,7 +304,7 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 293 |
}
|
| 294 |
const withIframe = ctx?.req.get('x-with-iframe');
|
| 295 |
if (withIframe !== undefined) {
|
| 296 |
-
instance.withIframe = Boolean(withIframe);
|
| 297 |
}
|
| 298 |
if (instance.withIframe) {
|
| 299 |
instance.timeout ??= null;
|
|
@@ -330,6 +341,37 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 330 |
|
| 331 |
return instance;
|
| 332 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
}
|
| 334 |
|
| 335 |
export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
|
|
@@ -347,14 +389,14 @@ function filterSelector(s?: string | string[]) {
|
|
| 347 |
return s;
|
| 348 |
}
|
| 349 |
const sr = Array.isArray(s) ? s : [s];
|
| 350 |
-
const selectors = sr.filter((i)=> {
|
| 351 |
const innerSelectors = i.split(',').map((s) => s.trim());
|
| 352 |
const someViolation = innerSelectors.find((x) => x.startsWith('*') || x.startsWith(':') || x.includes('*:'));
|
| 353 |
if (someViolation) {
|
| 354 |
return false;
|
| 355 |
}
|
| 356 |
return true;
|
| 357 |
-
})
|
| 358 |
|
| 359 |
return selectors;
|
| 360 |
};
|
|
|
|
| 192 |
|
| 193 |
@Prop({
|
| 194 |
default: false,
|
| 195 |
+
type: [String, Boolean]
|
| 196 |
})
|
| 197 |
+
withIframe!: boolean | 'quoted';
|
| 198 |
|
| 199 |
@Prop({
|
| 200 |
default: false,
|
|
|
|
| 212 |
@Prop()
|
| 213 |
userAgent?: string;
|
| 214 |
|
| 215 |
+
@Prop({
|
| 216 |
+
arrayOf: String,
|
| 217 |
+
})
|
| 218 |
+
injectPageScript?: string[];
|
| 219 |
+
|
| 220 |
+
@Prop({
|
| 221 |
+
arrayOf: String,
|
| 222 |
+
})
|
| 223 |
+
injectFrameScript?: string[];
|
| 224 |
+
|
| 225 |
@Prop({
|
| 226 |
validate: (v: number) => v > 0 && v <= 180,
|
| 227 |
type: Number,
|
|
|
|
| 304 |
}
|
| 305 |
const withIframe = ctx?.req.get('x-with-iframe');
|
| 306 |
if (withIframe !== undefined) {
|
| 307 |
+
instance.withIframe = withIframe.toLowerCase() === 'quoted' ? 'quoted' : Boolean(withIframe);
|
| 308 |
}
|
| 309 |
if (instance.withIframe) {
|
| 310 |
instance.timeout ??= null;
|
|
|
|
| 341 |
|
| 342 |
return instance;
|
| 343 |
}
|
| 344 |
+
|
| 345 |
+
/**
 * Reports whether these options allow the early-return path.
 *
 * @returns `false` when `timeout` is anything other than `undefined`
 *   (so `null` also counts), when `waitForSelector` is non-empty, or when
 *   any frame/page script is to be injected; `true` otherwise.
 */
isEarlyReturnApplicable() {
    const hasExplicitTimeout = this.timeout !== undefined;
    const waitsForSelector = Boolean(this.waitForSelector?.length);
    const injectsScripts = Boolean(this.injectFrameScript?.length || this.injectPageScript?.length);

    return !(hasExplicitTimeout || waitsForSelector || injectsScripts);
}
|
| 358 |
+
|
| 359 |
+
/**
 * Reports whether these options allow a cache lookup.
 *
 * @returns `false` when `noCache` is truthy, when `cacheTolerance` is exactly
 *   `0`, when `setCookies` is non-empty, or when any frame/page script is to
 *   be injected; `true` otherwise.
 */
isCacheQueryApplicable() {
    const cacheDisabled = Boolean(this.noCache) || this.cacheTolerance === 0;
    const hasCookies = Boolean(this.setCookies?.length);
    const injectsScripts = Boolean(this.injectFrameScript?.length || this.injectPageScript?.length);

    return !(cacheDisabled || hasCookies || injectsScripts);
}
|
| 375 |
}
|
| 376 |
|
| 377 |
export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
|
|
|
|
| 389 |
return s;
|
| 390 |
}
|
| 391 |
const sr = Array.isArray(s) ? s : [s];
|
| 392 |
+
const selectors = sr.filter((i) => {
|
| 393 |
const innerSelectors = i.split(',').map((s) => s.trim());
|
| 394 |
const someViolation = innerSelectors.find((x) => x.startsWith('*') || x.startsWith(':') || x.includes('*:'));
|
| 395 |
if (someViolation) {
|
| 396 |
return false;
|
| 397 |
}
|
| 398 |
return true;
|
| 399 |
+
});
|
| 400 |
|
| 401 |
return selectors;
|
| 402 |
};
|
backend/functions/src/services/jsdom.ts
CHANGED
|
@@ -53,7 +53,13 @@ export class JSDomControl extends AsyncService {
|
|
| 53 |
jsdom.window.document.querySelectorAll('iframe[src],frame[src]').forEach((x) => {
|
| 54 |
const src = x.getAttribute('src');
|
| 55 |
const thisSnapshot = snapshot.childFrames?.find((f) => f.href === src);
|
| 56 |
-
if (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
x.innerHTML = thisSnapshot.html;
|
| 58 |
x.querySelectorAll('script, style').forEach((s) => s.remove());
|
| 59 |
x.querySelectorAll('[src]').forEach((el) => {
|
|
|
|
| 53 |
jsdom.window.document.querySelectorAll('iframe[src],frame[src]').forEach((x) => {
|
| 54 |
const src = x.getAttribute('src');
|
| 55 |
const thisSnapshot = snapshot.childFrames?.find((f) => f.href === src);
|
| 56 |
+
if (options?.withIframe === 'quoted') {
|
| 57 |
+
const blockquoteElem = jsdom.window.document.createElement('blockquote');
|
| 58 |
+
const preElem = jsdom.window.document.createElement('pre');
|
| 59 |
+
preElem.innerHTML = thisSnapshot?.text || '';
|
| 60 |
+
blockquoteElem.appendChild(preElem);
|
| 61 |
+
x.replaceWith(blockquoteElem);
|
| 62 |
+
} else if (thisSnapshot?.html) {
|
| 63 |
x.innerHTML = thisSnapshot.html;
|
| 64 |
x.querySelectorAll('script, style').forEach((s) => s.remove());
|
| 65 |
x.querySelectorAll('[src]').forEach((el) => {
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -76,6 +76,8 @@ export interface ScrappingOptions {
|
|
| 76 |
locale?: string;
|
| 77 |
referer?: string;
|
| 78 |
extraHeaders?: Record<string, string>;
|
|
|
|
|
|
|
| 79 |
}
|
| 80 |
|
| 81 |
|
|
@@ -95,9 +97,135 @@ puppeteer.use(puppeteerPageProxy({
|
|
| 95 |
interceptResolutionPriority: 1,
|
| 96 |
}));
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
const SCRIPT_TO_INJECT_INTO_FRAME = `
|
| 99 |
${READABILITY_JS}
|
|
|
|
|
|
|
| 100 |
|
|
|
|
| 101 |
function briefImgs(elem) {
|
| 102 |
const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]'));
|
| 103 |
|
|
@@ -271,6 +399,30 @@ function giveSnapshot(stopActiveSnapshot) {
|
|
| 271 |
|
| 272 |
return r;
|
| 273 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
`;
|
| 275 |
|
| 276 |
@singleton()
|
|
@@ -479,26 +631,29 @@ export class PuppeteerControl extends AsyncService {
|
|
| 479 |
});
|
| 480 |
|
| 481 |
await page.evaluateOnNewDocument(`
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
|
|
|
|
|
|
|
|
|
| 502 |
`);
|
| 503 |
|
| 504 |
this.snMap.set(page, sn);
|
|
@@ -550,9 +705,13 @@ if (window.self === window.top) {
|
|
| 550 |
await Promise.race([
|
| 551 |
(async () => {
|
| 552 |
const ctx = page.browserContext();
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 556 |
]).catch((err) => {
|
| 557 |
this.logger.error(`Failed to destroy page ${sn}`, { err: marshalErrorLike(err) });
|
| 558 |
});
|
|
@@ -601,6 +760,30 @@ if (window.self === window.top) {
|
|
| 601 |
return req.continue(continueArgs[0], continueArgs[1]);
|
| 602 |
});
|
| 603 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 604 |
const sn = this.snMap.get(page);
|
| 605 |
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
| 606 |
|
|
@@ -689,6 +872,7 @@ if (window.self === window.top) {
|
|
| 689 |
goToOptions.referer = options.referer;
|
| 690 |
}
|
| 691 |
|
|
|
|
| 692 |
const gotoPromise = page.goto(url, goToOptions)
|
| 693 |
.catch((err) => {
|
| 694 |
if (err instanceof TimeoutError) {
|
|
@@ -719,6 +903,8 @@ if (window.self === window.top) {
|
|
| 719 |
throw stuff;
|
| 720 |
}
|
| 721 |
}
|
|
|
|
|
|
|
| 722 |
try {
|
| 723 |
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 724 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
|
|
|
| 76 |
locale?: string;
|
| 77 |
referer?: string;
|
| 78 |
extraHeaders?: Record<string, string>;
|
| 79 |
+
injectFrameScripts?: string[];
|
| 80 |
+
injectPageScripts?: string[];
|
| 81 |
}
|
| 82 |
|
| 83 |
|
|
|
|
| 97 |
interceptResolutionPriority: 1,
|
| 98 |
}));
|
| 99 |
|
| 100 |
+
/**
 * Browser-side script, kept as source text so it can be interpolated into the
 * script injected into frames.
 *
 * When evaluated it:
 *  - replaces `window.IntersectionObserver` with a subclass (still named
 *    `IntersectionObserver`) that remembers each observer's user callback,
 *    the targets it currently observes, and the last real entry delivered
 *    for every target;
 *  - exposes `window.simulateScroll()`, which calls every live observer's
 *    callback once per observed target with a synthetic, fully intersecting
 *    entry, then on a later timer tick replays the last real entry recorded
 *    for that target with a refreshed `time`.
 *
 * Fixes over the previous revision:
 *  - every entry of a native batch is recorded, not only the last one, so
 *    targets reported together each get their replay (and an empty batch no
 *    longer throws);
 *  - the replayed entry keeps the `IntersectionObserverEntry` prototype
 *    instead of being flattened into a plain object by object spread.
 */
const SIMULATE_SCROLL = `
(function () {
    function createIntersectionObserverEntry(target, isIntersecting, timestamp) {
        const targetRect = target.getBoundingClientRect();
        const record = {
            target,
            isIntersecting,
            time: timestamp,
            // If intersecting, intersectionRect matches boundingClientRect
            // If not intersecting, intersectionRect is empty (0x0)
            intersectionRect: isIntersecting
                ? targetRect
                : new DOMRectReadOnly(0, 0, 0, 0),
            // Current bounding client rect of the target
            boundingClientRect: targetRect,
            // Intersection ratio is either 0 (not intersecting) or 1 (fully intersecting)
            intersectionRatio: isIntersecting ? 1 : 0,
            // Root bounds (viewport in our case)
            rootBounds: new DOMRectReadOnly(
                0,
                0,
                window.innerWidth,
                window.innerHeight
            )
        };
        Object.setPrototypeOf(record, window.IntersectionObserverEntry.prototype);
        return record;
    }
    function cloneIntersectionObserverEntry(entry) {
        const record = {
            target: entry.target,
            isIntersecting: entry.isIntersecting,
            time: entry.time,
            intersectionRect: entry.intersectionRect,
            boundingClientRect: entry.boundingClientRect,
            intersectionRatio: entry.intersectionRatio,
            rootBounds: entry.rootBounds
        };
        Object.setPrototypeOf(record, window.IntersectionObserverEntry.prototype);
        return record;
    }
    const orig = window.IntersectionObserver;
    const kCallback = Symbol('callback');
    const kLastEntryMap = Symbol('lastEntryMap');
    const liveObservers = new Map();
    class MangledIntersectionObserver extends orig {
        constructor(callback, options) {
            super((entries, observer) => {
                const lastEntryMap = observer[kLastEntryMap];
                // A batch may carry entries for several targets; remember the
                // most recent one for each of them.
                for (const entry of entries) {
                    lastEntryMap.set(entry.target, entry);
                }
                return callback(entries, observer);
            }, options);
            this[kCallback] = callback;
            this[kLastEntryMap] = new WeakMap();
            liveObservers.set(this, new Set());
        }
        disconnect() {
            liveObservers.get(this)?.clear();
            liveObservers.delete(this);
            return super.disconnect();
        }
        observe(target) {
            const observer = liveObservers.get(this);
            observer?.add(target);
            return super.observe(target);
        }
        unobserve(target) {
            const observer = liveObservers.get(this);
            observer?.delete(target);
            return super.unobserve(target);
        }
    }
    Object.defineProperty(MangledIntersectionObserver, 'name', { value: 'IntersectionObserver', writable: false });
    window.IntersectionObserver = MangledIntersectionObserver;
    function simulateScroll() {
        for (const [observer, targets] of liveObservers.entries()) {
            const t0 = performance.now();
            for (const target of targets) {
                const entry = createIntersectionObserverEntry(target, true, t0);
                observer[kCallback]([entry], observer);
                setTimeout(() => {
                    const t1 = performance.now();
                    const lastEntry = observer[kLastEntryMap].get(target);
                    if (!lastEntry) {
                        return;
                    }
                    // Clone first, then stamp the time, so the replayed entry
                    // keeps the IntersectionObserverEntry prototype.
                    const entry2 = cloneIntersectionObserverEntry(lastEntry);
                    entry2.time = t1;
                    observer[kCallback]([entry2], observer);
                });
            }
        }
    }
    window.simulateScroll = simulateScroll;
})();
`;
|
| 196 |
+
|
| 197 |
+
/**
 * Browser-side script, kept as source text so it can be interpolated into the
 * script injected into frames.
 *
 * Once `DOMContentLoaded` fires it starts a MutationObserver on
 * `document.documentElement` (child-list changes, whole subtree) and arms a
 * 200 ms timer. Every observed mutation re-arms the timer; when it elapses
 * without further mutations a `mutationIdle` CustomEvent is dispatched on
 * `document`. The observer is never disconnected, so the event can fire again
 * after each later burst of mutations.
 *
 * Only formatting differs from the previous revision (arrow-function spacing
 * and an explicit statement terminator); behaviour is unchanged.
 */
const MUTATION_IDLE_WATCH = `
(function () {
    let timeout;
    const sendMsg = () => {
        document.dispatchEvent(new CustomEvent('mutationIdle'));
    };

    const cb = () => {
        // Only re-arm once the initial timer has been started.
        if (timeout) {
            clearTimeout(timeout);
            timeout = setTimeout(sendMsg, 200);
        }
    };
    const mutationObserver = new MutationObserver(cb);

    document.addEventListener('DOMContentLoaded', () => {
        mutationObserver.observe(document.documentElement, {
            childList: true,
            subtree: true,
        });
        timeout = setTimeout(sendMsg, 200);
    }, { once: true });
})();
`;
|
| 221 |
+
|
| 222 |
+
|
| 223 |
const SCRIPT_TO_INJECT_INTO_FRAME = `
|
| 224 |
${READABILITY_JS}
|
| 225 |
+
${SIMULATE_SCROLL}
|
| 226 |
+
${MUTATION_IDLE_WATCH}
|
| 227 |
|
| 228 |
+
(function(){
|
| 229 |
function briefImgs(elem) {
|
| 230 |
const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]'));
|
| 231 |
|
|
|
|
| 399 |
|
| 400 |
return r;
|
| 401 |
}
|
| 402 |
+
function waitForSelector(selectorText) {
|
| 403 |
+
return new Promise((resolve) => {
|
| 404 |
+
const existing = document.querySelector(selectorText);
|
| 405 |
+
if (existing) {
|
| 406 |
+
resolve(existing);
|
| 407 |
+
return;
|
| 408 |
+
}
|
| 409 |
+
const observer = new MutationObserver(() => {
|
| 410 |
+
const elem = document.querySelector(selectorText);
|
| 411 |
+
if (elem) {
|
| 412 |
+
resolve(document.querySelector(selectorText));
|
| 413 |
+
observer.disconnect();
|
| 414 |
+
}
|
| 415 |
+
});
|
| 416 |
+
observer.observe(document.documentElement, {
|
| 417 |
+
childList: true,
|
| 418 |
+
subtree: true
|
| 419 |
+
});
|
| 420 |
+
});
|
| 421 |
+
}
|
| 422 |
+
window.waitForSelector = waitForSelector;
|
| 423 |
+
window.giveSnapshot = giveSnapshot;
|
| 424 |
+
window.briefImgs = briefImgs;
|
| 425 |
+
})();
|
| 426 |
`;
|
| 427 |
|
| 428 |
@singleton()
|
|
|
|
| 631 |
});
|
| 632 |
|
| 633 |
await page.evaluateOnNewDocument(`
|
| 634 |
+
(function () {
|
| 635 |
+
if (window.self === window.top) {
|
| 636 |
+
let lastTextLength = 0;
|
| 637 |
+
const handlePageLoad = () => {
|
| 638 |
+
const thisTextLength = (document.body.innerText || '').length;
|
| 639 |
+
const deltaLength = Math.abs(thisTextLength - lastTextLength);
|
| 640 |
+
if (10 * deltaLength < lastTextLength) {
|
| 641 |
+
// Change is not significant
|
| 642 |
+
return;
|
| 643 |
+
}
|
| 644 |
+
lastTextLength = thisTextLength;
|
| 645 |
+
if (window.haltSnapshot) {
|
| 646 |
+
return;
|
| 647 |
+
}
|
| 648 |
+
const r = giveSnapshot();
|
| 649 |
+
window.reportSnapshot(r);
|
| 650 |
+
};
|
| 651 |
+
document.addEventListener('readystatechange', handlePageLoad);
|
| 652 |
+
document.addEventListener('load', handlePageLoad);
|
| 653 |
+
document.addEventListener('mutationIdle', handlePageLoad);
|
| 654 |
+
}
|
| 655 |
+
document.addEventListener('DOMContentLoaded', ()=> window.simulateScroll(), { once: true });
|
| 656 |
+
})();
|
| 657 |
`);
|
| 658 |
|
| 659 |
this.snMap.set(page, sn);
|
|
|
|
| 705 |
await Promise.race([
|
| 706 |
(async () => {
|
| 707 |
const ctx = page.browserContext();
|
| 708 |
+
try {
|
| 709 |
+
await page.close();
|
| 710 |
+
} finally {
|
| 711 |
+
await ctx.close();
|
| 712 |
+
}
|
| 713 |
+
})(),
|
| 714 |
+
delay(5000)
|
| 715 |
]).catch((err) => {
|
| 716 |
this.logger.error(`Failed to destroy page ${sn}`, { err: marshalErrorLike(err) });
|
| 717 |
});
|
|
|
|
| 760 |
return req.continue(continueArgs[0], continueArgs[1]);
|
| 761 |
});
|
| 762 |
}
|
| 763 |
+
let pageScriptEvaluations: Promise<unknown>[] = [];
|
| 764 |
+
let frameScriptEvaluations: Promise<unknown>[] = [];
|
| 765 |
+
if (options?.injectPageScripts?.length) {
|
| 766 |
+
page.on('framenavigated', (frame) => {
|
| 767 |
+
if (frame !== page.mainFrame()) {
|
| 768 |
+
return;
|
| 769 |
+
}
|
| 770 |
+
|
| 771 |
+
pageScriptEvaluations.push(
|
| 772 |
+
Promise.allSettled(options.injectPageScripts!.map((x) => frame.evaluate(x).catch((err) => {
|
| 773 |
+
this.logger.warn(`Error in evaluation of page scripts`, { err });
|
| 774 |
+
})))
|
| 775 |
+
);
|
| 776 |
+
});
|
| 777 |
+
}
|
| 778 |
+
if (options?.injectFrameScripts?.length) {
|
| 779 |
+
page.on('framenavigated', (frame) => {
|
| 780 |
+
frameScriptEvaluations.push(
|
| 781 |
+
Promise.allSettled(options.injectFrameScripts!.map((x) => frame.evaluate(x).catch((err) => {
|
| 782 |
+
this.logger.warn(`Error in evaluation of frame scripts`, { err });
|
| 783 |
+
})))
|
| 784 |
+
);
|
| 785 |
+
});
|
| 786 |
+
}
|
| 787 |
const sn = this.snMap.get(page);
|
| 788 |
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
| 789 |
|
|
|
|
| 872 |
goToOptions.referer = options.referer;
|
| 873 |
}
|
| 874 |
|
| 875 |
+
const delayPromise = delay(timeout);
|
| 876 |
const gotoPromise = page.goto(url, goToOptions)
|
| 877 |
.catch((err) => {
|
| 878 |
if (err instanceof TimeoutError) {
|
|
|
|
| 903 |
throw stuff;
|
| 904 |
}
|
| 905 |
}
|
| 906 |
+
await Promise.race([Promise.allSettled([...pageScriptEvaluations, ...frameScriptEvaluations]), delayPromise])
|
| 907 |
+
.catch(() => void 0);
|
| 908 |
try {
|
| 909 |
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 910 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
backend/functions/src/services/snapshot-formatter.ts
CHANGED
|
@@ -316,7 +316,7 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 316 |
}
|
| 317 |
} while (false);
|
| 318 |
|
| 319 |
-
const cleanText = (contentText || '').trim();
|
| 320 |
|
| 321 |
const formatted: FormattedPage = {
|
| 322 |
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
|
|
|
| 316 |
}
|
| 317 |
} while (false);
|
| 318 |
|
| 319 |
+
const cleanText = contentText?.includes('return') ? contentText.trimEnd() : (contentText || '').trim();
|
| 320 |
|
| 321 |
const formatted: FormattedPage = {
|
| 322 |
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|