nomagick committed on
Commit
f0668a9
·
unverified ·
1 Parent(s): be91371

fix: potential circular crawling

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -581,6 +581,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
581
  { contentType: 'text/plain', envelope: null }
582
  );
583
  }
 
 
 
 
 
 
584
  if (uid) {
585
  const user = await auth.assertUser();
586
  if (!(user.wallet.total_balance > 0)) {
@@ -638,15 +644,17 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
638
  path: 'url'
639
  });
640
  }
641
- const blockade = (await DomainBlockade.fromFirestoreQuery(
642
- DomainBlockade.COLLECTION
643
- .where('domain', '==', urlToCrawl.hostname.toLowerCase())
644
- .where('expireAt', '>=', new Date())
645
- .limit(1)
646
- ))[0];
647
 
648
- if (blockade && !uid) {
649
- throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
 
 
 
 
 
 
 
 
650
  }
651
 
652
  const crawlOpts = this.configure(crawlerOptions);
 
581
  { contentType: 'text/plain', envelope: null }
582
  );
583
  }
584
+
585
+ // Prevent circular crawling
586
+ this.puppeteerControl.circuitBreakerHosts.add(
587
+ ctx.req.hostname.toLowerCase()
588
+ );
589
+
590
  if (uid) {
591
  const user = await auth.assertUser();
592
  if (!(user.wallet.total_balance > 0)) {
 
644
  path: 'url'
645
  });
646
  }
 
 
 
 
 
 
647
 
648
+ if (!uid) {
649
+ const blockade = (await DomainBlockade.fromFirestoreQuery(
650
+ DomainBlockade.COLLECTION
651
+ .where('domain', '==', urlToCrawl.hostname.toLowerCase())
652
+ .where('expireAt', '>=', new Date())
653
+ .limit(1)
654
+ ))[0];
655
+ if (blockade) {
656
+ throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
657
+ }
658
  }
659
 
660
  const crawlOpts = this.configure(crawlerOptions);
backend/functions/src/services/puppeteer.ts CHANGED
@@ -100,6 +100,8 @@ export class PuppeteerControl extends AsyncService {
100
  livePages = new Set<Page>();
101
  lastPageCratedAt: number = 0;
102
 
 
 
103
  constructor(
104
  protected globalLogger: Logger,
105
  ) {
@@ -285,6 +287,12 @@ function giveSnapshot(stopActiveSnapshot) {
285
 
286
  const parsedUrl = new URL(requestUrl);
287
 
 
 
 
 
 
 
288
  if (
289
  parsedUrl.hostname === 'localhost' ||
290
  parsedUrl.hostname.startsWith('127.')
 
100
  livePages = new Set<Page>();
101
  lastPageCratedAt: number = 0;
102
 
103
+ circuitBreakerHosts: Set<string> = new Set();
104
+
105
  constructor(
106
  protected globalLogger: Logger,
107
  ) {
 
287
 
288
  const parsedUrl = new URL(requestUrl);
289
 
290
+ if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {
291
+ page.emit('abuse', { url: requestUrl, page, sn, reason: `Abusive request: ${requestUrl}` });
292
+
293
+ return req.abort('blockedbyclient', 1000);
294
+ }
295
+
296
  if (
297
  parsedUrl.hostname === 'localhost' ||
298
  parsedUrl.hostname.startsWith('127.')