nomagick committed on
Commit
1bcb5a7
·
unverified ·
1 Parent(s): 57641c4

fix: dos abuse

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -881,7 +881,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
881
 
882
  yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
883
  } catch (err: any) {
884
- if (cache) {
885
  this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
886
  yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts);
887
  return;
 
881
 
882
  yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
883
  } catch (err: any) {
884
+ if (cache && !(err instanceof SecurityCompromiseError)) {
885
  this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
886
  yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts);
887
  return;
backend/functions/src/services/puppeteer.ts CHANGED
@@ -52,6 +52,7 @@ export interface PageSnapshot {
52
  screenshot?: Buffer;
53
  imgs?: ImgBrief[];
54
  pdfs?: string[];
 
55
  }
56
 
57
  export interface ExtendedSnapshot extends PageSnapshot {
@@ -235,6 +236,32 @@ function briefPDFs() {
235
  return x.src === 'about:blank' ? document.location.href : x.src;
236
  });
237
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  function giveSnapshot(stopActiveSnapshot) {
239
  if (stopActiveSnapshot) {
240
  window.haltSnapshot = true;
@@ -254,6 +281,7 @@ function giveSnapshot(stopActiveSnapshot) {
254
  parsed: parsed,
255
  imgs: [],
256
  pdfs: briefPDFs(),
 
257
  };
258
  if (parsed && parsed.content) {
259
  const elem = document.createElement('div');
@@ -277,7 +305,7 @@ function giveSnapshot(stopActiveSnapshot) {
277
 
278
  const domainSet = new Set<string>();
279
  let reqCounter = 0;
280
- const t0 = Date.now();
281
  let halt = false;
282
 
283
  page.on('request', (req) => {
@@ -285,6 +313,7 @@ function giveSnapshot(stopActiveSnapshot) {
285
  if (halt) {
286
  return req.abort('blockedbyclient', 1000);
287
  }
 
288
  const requestUrl = req.url();
289
  if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
290
  return req.abort('blockedbyclient', 1000);
@@ -446,6 +475,10 @@ document.addEventListener('load', handlePageLoad);
446
  if (snapshot === s) {
447
  return;
448
  }
 
 
 
 
449
  snapshot = s;
450
  nextSnapshotDeferred.resolve(s);
451
  nextSnapshotDeferred = Defer();
@@ -516,7 +549,7 @@ document.addEventListener('load', handlePageLoad);
516
  ckpt.push(delay(options.minIntervalMs));
517
  }
518
  let error;
519
- await Promise.race(ckpt).catch((err)=> error = err);
520
  if (finalized) {
521
  yield { ...snapshot, screenshot } as PageSnapshot;
522
  break;
 
52
  screenshot?: Buffer;
53
  imgs?: ImgBrief[];
54
  pdfs?: string[];
55
+ maxElemDepth?: number;
56
  }
57
 
58
  export interface ExtendedSnapshot extends PageSnapshot {
 
236
  return x.src === 'about:blank' ? document.location.href : x.src;
237
  });
238
  }
239
+ function getMaxDepthUsingTreeWalker(root) {
240
+ let maxDepth = 0;
241
+ let currentDepth = 0;
242
+
243
+ const treeWalker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, null, false);
244
+
245
+ while (true) {
246
+ maxDepth = Math.max(maxDepth, currentDepth);
247
+
248
+ if (treeWalker.firstChild()) {
249
+ currentDepth++;
250
+ } else {
251
+ while (!treeWalker.nextSibling() && currentDepth > 0) {
252
+ treeWalker.parentNode();
253
+ currentDepth--;
254
+ }
255
+
256
+ if (currentDepth <= 0) {
257
+ break;
258
+ }
259
+ }
260
+ }
261
+
262
+ return maxDepth + 1;
263
+ }
264
+
265
  function giveSnapshot(stopActiveSnapshot) {
266
  if (stopActiveSnapshot) {
267
  window.haltSnapshot = true;
 
281
  parsed: parsed,
282
  imgs: [],
283
  pdfs: briefPDFs(),
284
+ maxElemDepth: getMaxDepthUsingTreeWalker(document.documentElement)
285
  };
286
  if (parsed && parsed.content) {
287
  const elem = document.createElement('div');
 
305
 
306
  const domainSet = new Set<string>();
307
  let reqCounter = 0;
308
+ let t0: number | undefined;
309
  let halt = false;
310
 
311
  page.on('request', (req) => {
 
313
  if (halt) {
314
  return req.abort('blockedbyclient', 1000);
315
  }
316
+ t0 ??= Date.now();
317
  const requestUrl = req.url();
318
  if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
319
  return req.abort('blockedbyclient', 1000);
 
475
  if (snapshot === s) {
476
  return;
477
  }
478
+ if (s?.maxElemDepth && s.maxElemDepth > 256) {
479
+ page.emit('abuse', { url, page, sn, reason: `DoS attack suspected: DOM tree too deep` });
480
+ return;
481
+ }
482
  snapshot = s;
483
  nextSnapshotDeferred.resolve(s);
484
  nextSnapshotDeferred = Defer();
 
549
  ckpt.push(delay(options.minIntervalMs));
550
  }
551
  let error;
552
+ await Promise.race(ckpt).catch((err) => error = err);
553
  if (finalized) {
554
  yield { ...snapshot, screenshot } as PageSnapshot;
555
  break;