nomagick committed on
Commit
22647a0
·
unverified ·
1 Parent(s): bd629a8

feat: script injecting and tools

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -23,7 +23,7 @@ import { JSDomControl } from '../services/jsdom';
23
  import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
24
 
25
  export interface ExtraScrappingOptions extends ScrappingOptions {
26
- withIframe?: boolean;
27
  withShadowDom?: boolean;
28
  targetSelector?: string | string[];
29
  removeSelector?: string | string[];
@@ -69,6 +69,10 @@ export class CrawlerHost extends RPCHost {
69
  // Potential privacy issue, dont cache if cookies are used
70
  return;
71
  }
 
 
 
 
72
  if (options.locale) {
73
  Reflect.set(snapshot, 'locale', options.locale);
74
  }
@@ -237,7 +241,7 @@ export class CrawlerHost extends RPCHost {
237
  throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
238
  }
239
  }
240
- const crawlOpts = this.configure(crawlerOptions);
241
 
242
 
243
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
@@ -284,7 +288,7 @@ export class CrawlerHost extends RPCHost {
284
  const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
285
  chargeAmount = this.assignChargeAmount(formatted);
286
 
287
- if (crawlerOptions.timeout === undefined) {
288
  return formatted;
289
  }
290
 
@@ -315,7 +319,7 @@ export class CrawlerHost extends RPCHost {
315
  const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
316
  chargeAmount = this.assignChargeAmount(formatted);
317
 
318
- if (crawlerOptions.timeout === undefined) {
319
  if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
320
 
321
  return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
@@ -557,8 +561,8 @@ export class CrawlerHost extends RPCHost {
557
 
558
  let cache;
559
 
560
- const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
561
- if (cacheTolerance && !crawlOpts?.cookies?.length) {
562
  cache = await this.queryCache(urlToCrawl, cacheTolerance);
563
  }
564
 
@@ -665,7 +669,7 @@ export class CrawlerHost extends RPCHost {
665
  }
666
  }
667
 
668
- configure(opts: CrawlerOptions) {
669
 
670
  this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt);
671
  this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
@@ -697,6 +701,30 @@ export class CrawlerHost extends RPCHost {
697
  crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
698
  }
699
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
700
  return crawlOpts;
701
  }
702
 
 
23
  import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
24
 
25
  export interface ExtraScrappingOptions extends ScrappingOptions {
26
+ withIframe?: boolean | 'quoted';
27
  withShadowDom?: boolean;
28
  targetSelector?: string | string[];
29
  removeSelector?: string | string[];
 
69
  // Potential privacy issue, dont cache if cookies are used
70
  return;
71
  }
72
+ if (options.injectFrameScripts?.length || options.injectPageScripts?.length) {
73
+ // Potentially mangeled content, dont cache if scripts are injected
74
+ return;
75
+ }
76
  if (options.locale) {
77
  Reflect.set(snapshot, 'locale', options.locale);
78
  }
 
241
  throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
242
  }
243
  }
244
+ const crawlOpts = await this.configure(crawlerOptions);
245
 
246
 
247
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
 
288
  const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
289
  chargeAmount = this.assignChargeAmount(formatted);
290
 
291
+ if (crawlerOptions.isEarlyReturnApplicable()) {
292
  return formatted;
293
  }
294
 
 
319
  const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
320
  chargeAmount = this.assignChargeAmount(formatted);
321
 
322
+ if (crawlerOptions.isEarlyReturnApplicable()) {
323
  if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
324
 
325
  return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
 
561
 
562
  let cache;
563
 
564
+ if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
565
+ const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
566
  cache = await this.queryCache(urlToCrawl, cacheTolerance);
567
  }
568
 
 
669
  }
670
  }
671
 
672
+ async configure(opts: CrawlerOptions) {
673
 
674
  this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt);
675
  this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
 
701
  crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
702
  }
703
 
704
+ if (opts.injectFrameScript?.length) {
705
+ crawlOpts.injectFrameScripts = (await Promise.all(
706
+ opts.injectFrameScript.map((x) => {
707
+ if (URL.canParse(x)) {
708
+ return fetch(x).then((r) => r.text());
709
+ }
710
+
711
+ return x;
712
+ })
713
+ )).filter(Boolean);
714
+ }
715
+
716
+ if (opts.injectPageScript?.length) {
717
+ crawlOpts.injectPageScripts = (await Promise.all(
718
+ opts.injectPageScript.map((x) => {
719
+ if (URL.canParse(x)) {
720
+ return fetch(x).then((r) => r.text());
721
+ }
722
+
723
+ return x;
724
+ })
725
+ )).filter(Boolean);
726
+ }
727
+
728
  return crawlOpts;
729
  }
730
 
backend/functions/src/cloud-functions/searcher.ts CHANGED
@@ -140,7 +140,7 @@ export class SearcherHost extends RPCHost {
140
 
141
  delete crawlerOptions.html;
142
 
143
- const crawlOpts = this.crawler.configure(crawlerOptions);
144
  const searchQuery = braveSearchExplicitOperators.addTo(q || noSlashPath);
145
  const r = await this.cachedWebSearch({
146
  q: searchQuery,
@@ -156,7 +156,7 @@ export class SearcherHost extends RPCHost {
156
  }
157
 
158
  const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
159
- { ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs },
160
  count,
161
  );
162
 
 
140
 
141
  delete crawlerOptions.html;
142
 
143
+ const crawlOpts = await this.crawler.configure(crawlerOptions);
144
  const searchQuery = braveSearchExplicitOperators.addTo(q || noSlashPath);
145
  const r = await this.cachedWebSearch({
146
  q: searchQuery,
 
156
  }
157
 
158
  const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
159
+ CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
160
  count,
161
  );
162
 
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -192,8 +192,9 @@ export class CrawlerOptions extends AutoCastable {
192
 
193
  @Prop({
194
  default: false,
 
195
  })
196
- withIframe!: boolean;
197
 
198
  @Prop({
199
  default: false,
@@ -211,6 +212,16 @@ export class CrawlerOptions extends AutoCastable {
211
  @Prop()
212
  userAgent?: string;
213
 
 
 
 
 
 
 
 
 
 
 
214
  @Prop({
215
  validate: (v: number) => v > 0 && v <= 180,
216
  type: Number,
@@ -293,7 +304,7 @@ export class CrawlerOptions extends AutoCastable {
293
  }
294
  const withIframe = ctx?.req.get('x-with-iframe');
295
  if (withIframe !== undefined) {
296
- instance.withIframe = Boolean(withIframe);
297
  }
298
  if (instance.withIframe) {
299
  instance.timeout ??= null;
@@ -330,6 +341,37 @@ export class CrawlerOptions extends AutoCastable {
330
 
331
  return instance;
332
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  }
334
 
335
  export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
@@ -347,14 +389,14 @@ function filterSelector(s?: string | string[]) {
347
  return s;
348
  }
349
  const sr = Array.isArray(s) ? s : [s];
350
- const selectors = sr.filter((i)=> {
351
  const innerSelectors = i.split(',').map((s) => s.trim());
352
  const someViolation = innerSelectors.find((x) => x.startsWith('*') || x.startsWith(':') || x.includes('*:'));
353
  if (someViolation) {
354
  return false;
355
  }
356
  return true;
357
- })
358
 
359
  return selectors;
360
  };
 
192
 
193
  @Prop({
194
  default: false,
195
+ type: [String, Boolean]
196
  })
197
+ withIframe!: boolean | 'quoted';
198
 
199
  @Prop({
200
  default: false,
 
212
  @Prop()
213
  userAgent?: string;
214
 
215
+ @Prop({
216
+ arrayOf: String,
217
+ })
218
+ injectPageScript?: string[];
219
+
220
+ @Prop({
221
+ arrayOf: String,
222
+ })
223
+ injectFrameScript?: string[];
224
+
225
  @Prop({
226
  validate: (v: number) => v > 0 && v <= 180,
227
  type: Number,
 
304
  }
305
  const withIframe = ctx?.req.get('x-with-iframe');
306
  if (withIframe !== undefined) {
307
+ instance.withIframe = withIframe.toLowerCase() === 'quoted' ? 'quoted' : Boolean(withIframe);
308
  }
309
  if (instance.withIframe) {
310
  instance.timeout ??= null;
 
341
 
342
  return instance;
343
  }
344
+
345
+ isEarlyReturnApplicable() {
346
+ if (this.timeout !== undefined) {
347
+ return false;
348
+ }
349
+ if (this.waitForSelector?.length) {
350
+ return false;
351
+ }
352
+ if (this.injectFrameScript?.length || this.injectPageScript?.length) {
353
+ return false;
354
+ }
355
+
356
+ return true;
357
+ }
358
+
359
+ isCacheQueryApplicable() {
360
+ if (this.noCache) {
361
+ return false;
362
+ }
363
+ if (this.cacheTolerance === 0) {
364
+ return false;
365
+ }
366
+ if (this.setCookies?.length) {
367
+ return false;
368
+ }
369
+ if (this.injectFrameScript?.length || this.injectPageScript?.length) {
370
+ return false;
371
+ }
372
+
373
+ return true;
374
+ }
375
  }
376
 
377
  export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
 
389
  return s;
390
  }
391
  const sr = Array.isArray(s) ? s : [s];
392
+ const selectors = sr.filter((i) => {
393
  const innerSelectors = i.split(',').map((s) => s.trim());
394
  const someViolation = innerSelectors.find((x) => x.startsWith('*') || x.startsWith(':') || x.includes('*:'));
395
  if (someViolation) {
396
  return false;
397
  }
398
  return true;
399
+ });
400
 
401
  return selectors;
402
  };
backend/functions/src/services/jsdom.ts CHANGED
@@ -53,7 +53,13 @@ export class JSDomControl extends AsyncService {
53
  jsdom.window.document.querySelectorAll('iframe[src],frame[src]').forEach((x) => {
54
  const src = x.getAttribute('src');
55
  const thisSnapshot = snapshot.childFrames?.find((f) => f.href === src);
56
- if (thisSnapshot?.html) {
 
 
 
 
 
 
57
  x.innerHTML = thisSnapshot.html;
58
  x.querySelectorAll('script, style').forEach((s) => s.remove());
59
  x.querySelectorAll('[src]').forEach((el) => {
 
53
  jsdom.window.document.querySelectorAll('iframe[src],frame[src]').forEach((x) => {
54
  const src = x.getAttribute('src');
55
  const thisSnapshot = snapshot.childFrames?.find((f) => f.href === src);
56
+ if (options?.withIframe === 'quoted') {
57
+ const blockquoteElem = jsdom.window.document.createElement('blockquote');
58
+ const preElem = jsdom.window.document.createElement('pre');
59
+ preElem.innerHTML = thisSnapshot?.text || '';
60
+ blockquoteElem.appendChild(preElem);
61
+ x.replaceWith(blockquoteElem);
62
+ } else if (thisSnapshot?.html) {
63
  x.innerHTML = thisSnapshot.html;
64
  x.querySelectorAll('script, style').forEach((s) => s.remove());
65
  x.querySelectorAll('[src]').forEach((el) => {
backend/functions/src/services/puppeteer.ts CHANGED
@@ -76,6 +76,8 @@ export interface ScrappingOptions {
76
  locale?: string;
77
  referer?: string;
78
  extraHeaders?: Record<string, string>;
 
 
79
  }
80
 
81
 
@@ -95,9 +97,135 @@ puppeteer.use(puppeteerPageProxy({
95
  interceptResolutionPriority: 1,
96
  }));
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  const SCRIPT_TO_INJECT_INTO_FRAME = `
99
  ${READABILITY_JS}
 
 
100
 
 
101
  function briefImgs(elem) {
102
  const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]'));
103
 
@@ -271,6 +399,30 @@ function giveSnapshot(stopActiveSnapshot) {
271
 
272
  return r;
273
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  `;
275
 
276
  @singleton()
@@ -479,26 +631,29 @@ export class PuppeteerControl extends AsyncService {
479
  });
480
 
481
  await page.evaluateOnNewDocument(`
482
- if (window.self === window.top) {
483
- let lastTextLength = 0;
484
- const handlePageLoad = () => {
485
- if (window.haltSnapshot) {
486
- return;
487
- }
488
- const thisTextLength = (document.body.innerText || '').length;
489
- const deltaLength = Math.abs(thisTextLength - lastTextLength);
490
- if (10 * deltaLength < lastTextLength) {
491
- // Change is not significant
492
- return;
493
- }
494
- const r = giveSnapshot();
495
- window.reportSnapshot(r);
496
- lastTextLength = thisTextLength;
497
- };
498
- setInterval(handlePageLoad, 800);
499
- document.addEventListener('readystatechange', handlePageLoad);
500
- document.addEventListener('load', handlePageLoad);
501
- }
 
 
 
502
  `);
503
 
504
  this.snMap.set(page, sn);
@@ -550,9 +705,13 @@ if (window.self === window.top) {
550
  await Promise.race([
551
  (async () => {
552
  const ctx = page.browserContext();
553
- await page.close();
554
- await ctx.close();
555
- })(), delay(5000)
 
 
 
 
556
  ]).catch((err) => {
557
  this.logger.error(`Failed to destroy page ${sn}`, { err: marshalErrorLike(err) });
558
  });
@@ -601,6 +760,30 @@ if (window.self === window.top) {
601
  return req.continue(continueArgs[0], continueArgs[1]);
602
  });
603
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
604
  const sn = this.snMap.get(page);
605
  this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
606
 
@@ -689,6 +872,7 @@ if (window.self === window.top) {
689
  goToOptions.referer = options.referer;
690
  }
691
 
 
692
  const gotoPromise = page.goto(url, goToOptions)
693
  .catch((err) => {
694
  if (err instanceof TimeoutError) {
@@ -719,6 +903,8 @@ if (window.self === window.top) {
719
  throw stuff;
720
  }
721
  }
 
 
722
  try {
723
  const pSubFrameSnapshots = this.snapshotChildFrames(page);
724
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
 
76
  locale?: string;
77
  referer?: string;
78
  extraHeaders?: Record<string, string>;
79
+ injectFrameScripts?: string[];
80
+ injectPageScripts?: string[];
81
  }
82
 
83
 
 
97
  interceptResolutionPriority: 1,
98
  }));
99
 
100
+ const SIMULATE_SCROLL = `
101
+ (function () {
102
+ function createIntersectionObserverEntry(target, isIntersecting, timestamp) {
103
+ const targetRect = target.getBoundingClientRect();
104
+ const record = {
105
+ target,
106
+ isIntersecting,
107
+ time: timestamp,
108
+ // If intersecting, intersectionRect matches boundingClientRect
109
+ // If not intersecting, intersectionRect is empty (0x0)
110
+ intersectionRect: isIntersecting
111
+ ? targetRect
112
+ : new DOMRectReadOnly(0, 0, 0, 0),
113
+ // Current bounding client rect of the target
114
+ boundingClientRect: targetRect,
115
+ // Intersection ratio is either 0 (not intersecting) or 1 (fully intersecting)
116
+ intersectionRatio: isIntersecting ? 1 : 0,
117
+ // Root bounds (viewport in our case)
118
+ rootBounds: new DOMRectReadOnly(
119
+ 0,
120
+ 0,
121
+ window.innerWidth,
122
+ window.innerHeight
123
+ )
124
+ };
125
+ Object.setPrototypeOf(record, window.IntersectionObserverEntry.prototype);
126
+ return record;
127
+ }
128
+ function cloneIntersectionObserverEntry(entry) {
129
+ const record = {
130
+ target: entry.target,
131
+ isIntersecting: entry.isIntersecting,
132
+ time: entry.time,
133
+ intersectionRect: entry.intersectionRect,
134
+ boundingClientRect: entry.boundingClientRect,
135
+ intersectionRatio: entry.intersectionRatio,
136
+ rootBounds: entry.rootBounds
137
+ };
138
+ Object.setPrototypeOf(record, window.IntersectionObserverEntry.prototype);
139
+ return record;
140
+ }
141
+ const orig = window.IntersectionObserver;
142
+ const kCallback = Symbol('callback');
143
+ const kLastEntryMap = Symbol('lastEntryMap');
144
+ const liveObservers = new Map();
145
+ class MangledIntersectionObserver extends orig {
146
+ constructor(callback, options) {
147
+ super((entries, observer) => {
148
+ const lastEntryMap = observer[kLastEntryMap];
149
+ const lastEntry = entries[entries.length - 1];
150
+ lastEntryMap.set(lastEntry.target, lastEntry);
151
+ return callback(entries, observer);
152
+ }, options);
153
+ this[kCallback] = callback;
154
+ this[kLastEntryMap] = new WeakMap();
155
+ liveObservers.set(this, new Set());
156
+ }
157
+ disconnect() {
158
+ liveObservers.get(this)?.clear();
159
+ liveObservers.delete(this);
160
+ return super.disconnect();
161
+ }
162
+ observe(target) {
163
+ const observer = liveObservers.get(this);
164
+ observer?.add(target);
165
+ return super.observe(target);
166
+ }
167
+ unobserve(target) {
168
+ const observer = liveObservers.get(this);
169
+ observer?.delete(target);
170
+ return super.unobserve(target);
171
+ }
172
+ }
173
+ Object.defineProperty(MangledIntersectionObserver, 'name', { value: 'IntersectionObserver', writable: false });
174
+ window.IntersectionObserver = MangledIntersectionObserver;
175
+ function simulateScroll() {
176
+ for (const [observer, targets] of liveObservers.entries()) {
177
+ const t0 = performance.now();
178
+ for (const target of targets) {
179
+ const entry = createIntersectionObserverEntry(target, true, t0);
180
+ observer[kCallback]([entry], observer);
181
+ setTimeout(() => {
182
+ const t1 = performance.now();
183
+ const lastEntry = observer[kLastEntryMap].get(target);
184
+ if (!lastEntry) {
185
+ return;
186
+ }
187
+ const entry2 = { ...cloneIntersectionObserverEntry(lastEntry), time: t1 };
188
+ observer[kCallback]([entry2], observer);
189
+ });
190
+ }
191
+ }
192
+ }
193
+ window.simulateScroll = simulateScroll;
194
+ })();
195
+ `;
196
+
197
+ const MUTATION_IDLE_WATCH = `
198
+ (function () {
199
+ let timeout;
200
+ const sendMsg = ()=> {
201
+ document.dispatchEvent(new CustomEvent('mutationIdle'));
202
+ };
203
+
204
+ const cb = () => {
205
+ if (timeout) {
206
+ clearTimeout(timeout);
207
+ timeout = setTimeout(sendMsg, 200);
208
+ }
209
+ };
210
+ const mutationObserver = new MutationObserver(cb);
211
+
212
+ document.addEventListener('DOMContentLoaded', () => {
213
+ mutationObserver.observe(document.documentElement, {
214
+ childList: true,
215
+ subtree: true,
216
+ });
217
+ timeout = setTimeout(sendMsg, 200);
218
+ }, { once: true })
219
+ })();
220
+ `;
221
+
222
+
223
  const SCRIPT_TO_INJECT_INTO_FRAME = `
224
  ${READABILITY_JS}
225
+ ${SIMULATE_SCROLL}
226
+ ${MUTATION_IDLE_WATCH}
227
 
228
+ (function(){
229
  function briefImgs(elem) {
230
  const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]'));
231
 
 
399
 
400
  return r;
401
  }
402
+ function waitForSelector(selectorText) {
403
+ return new Promise((resolve) => {
404
+ const existing = document.querySelector(selectorText);
405
+ if (existing) {
406
+ resolve(existing);
407
+ return;
408
+ }
409
+ const observer = new MutationObserver(() => {
410
+ const elem = document.querySelector(selectorText);
411
+ if (elem) {
412
+ resolve(document.querySelector(selectorText));
413
+ observer.disconnect();
414
+ }
415
+ });
416
+ observer.observe(document.documentElement, {
417
+ childList: true,
418
+ subtree: true
419
+ });
420
+ });
421
+ }
422
+ window.waitForSelector = waitForSelector;
423
+ window.giveSnapshot = giveSnapshot;
424
+ window.briefImgs = briefImgs;
425
+ })();
426
  `;
427
 
428
  @singleton()
 
631
  });
632
 
633
  await page.evaluateOnNewDocument(`
634
+ (function () {
635
+ if (window.self === window.top) {
636
+ let lastTextLength = 0;
637
+ const handlePageLoad = () => {
638
+ const thisTextLength = (document.body.innerText || '').length;
639
+ const deltaLength = Math.abs(thisTextLength - lastTextLength);
640
+ if (10 * deltaLength < lastTextLength) {
641
+ // Change is not significant
642
+ return;
643
+ }
644
+ lastTextLength = thisTextLength;
645
+ if (window.haltSnapshot) {
646
+ return;
647
+ }
648
+ const r = giveSnapshot();
649
+ window.reportSnapshot(r);
650
+ };
651
+ document.addEventListener('readystatechange', handlePageLoad);
652
+ document.addEventListener('load', handlePageLoad);
653
+ document.addEventListener('mutationIdle', handlePageLoad);
654
+ }
655
+ document.addEventListener('DOMContentLoaded', ()=> window.simulateScroll(), { once: true });
656
+ })();
657
  `);
658
 
659
  this.snMap.set(page, sn);
 
705
  await Promise.race([
706
  (async () => {
707
  const ctx = page.browserContext();
708
+ try {
709
+ await page.close();
710
+ } finally {
711
+ await ctx.close();
712
+ }
713
+ })(),
714
+ delay(5000)
715
  ]).catch((err) => {
716
  this.logger.error(`Failed to destroy page ${sn}`, { err: marshalErrorLike(err) });
717
  });
 
760
  return req.continue(continueArgs[0], continueArgs[1]);
761
  });
762
  }
763
+ let pageScriptEvaluations: Promise<unknown>[] = [];
764
+ let frameScriptEvaluations: Promise<unknown>[] = [];
765
+ if (options?.injectPageScripts?.length) {
766
+ page.on('framenavigated', (frame) => {
767
+ if (frame !== page.mainFrame()) {
768
+ return;
769
+ }
770
+
771
+ pageScriptEvaluations.push(
772
+ Promise.allSettled(options.injectPageScripts!.map((x) => frame.evaluate(x).catch((err) => {
773
+ this.logger.warn(`Error in evaluation of page scripts`, { err });
774
+ })))
775
+ );
776
+ });
777
+ }
778
+ if (options?.injectFrameScripts?.length) {
779
+ page.on('framenavigated', (frame) => {
780
+ frameScriptEvaluations.push(
781
+ Promise.allSettled(options.injectFrameScripts!.map((x) => frame.evaluate(x).catch((err) => {
782
+ this.logger.warn(`Error in evaluation of frame scripts`, { err });
783
+ })))
784
+ );
785
+ });
786
+ }
787
  const sn = this.snMap.get(page);
788
  this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
789
 
 
872
  goToOptions.referer = options.referer;
873
  }
874
 
875
+ const delayPromise = delay(timeout);
876
  const gotoPromise = page.goto(url, goToOptions)
877
  .catch((err) => {
878
  if (err instanceof TimeoutError) {
 
903
  throw stuff;
904
  }
905
  }
906
+ await Promise.race([Promise.allSettled([...pageScriptEvaluations, ...frameScriptEvaluations]), delayPromise])
907
+ .catch(() => void 0);
908
  try {
909
  const pSubFrameSnapshots = this.snapshotChildFrames(page);
910
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
backend/functions/src/services/snapshot-formatter.ts CHANGED
@@ -316,7 +316,7 @@ export class SnapshotFormatter extends AsyncService {
316
  }
317
  } while (false);
318
 
319
- const cleanText = (contentText || '').trim();
320
 
321
  const formatted: FormattedPage = {
322
  title: (snapshot.parsed?.title || snapshot.title || '').trim(),
 
316
  }
317
  } while (false);
318
 
319
+ const cleanText = contentText?.includes('return') ? contentText.trimEnd() : (contentText || '').trim();
320
 
321
  const formatted: FormattedPage = {
322
  title: (snapshot.parsed?.title || snapshot.title || '').trim(),