nomagick committed on
Commit
43dee08
·
unverified ·
1 Parent(s): 908157b

security: detect abuse

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -5,7 +5,7 @@ import {
5
  AssertionFailureError, ParamValidationError, Defer,
6
  } from 'civkit';
7
  import { singleton } from 'tsyringe';
8
- import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
9
  import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
10
  import _ from 'lodash';
11
  import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
@@ -22,6 +22,7 @@ import { countGPTToken as estimateToken } from '../shared/utils/openai';
22
  import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
23
  import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
24
  import { PDFExtractor } from '../services/pdf-extract';
 
25
 
26
  const md5Hasher = new HashManager('md5', 'hex');
27
 
@@ -64,6 +65,7 @@ export class CrawlerHost extends RPCHost {
64
  cacheRetentionMs = 1000 * 3600 * 24 * 7;
65
  cacheValidMs = 1000 * 3600;
66
  urlValidMs = 1000 * 3600 * 4;
 
67
 
68
  constructor(
69
  protected globalLogger: Logger,
@@ -87,6 +89,21 @@ export class CrawlerHost extends RPCHost {
87
 
88
  await this.setToCache(options.url, snapshot);
89
  });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  }
91
 
92
  override async init() {
@@ -617,6 +634,13 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
617
  path: 'url'
618
  });
619
  }
 
 
 
 
 
 
 
620
 
621
  const crawlOpts = this.configure(crawlerOptions);
622
 
 
5
  AssertionFailureError, ParamValidationError, Defer,
6
  } from 'civkit';
7
  import { singleton } from 'tsyringe';
8
+ import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
9
  import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
10
  import _ from 'lodash';
11
  import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
 
22
  import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
23
  import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
24
  import { PDFExtractor } from '../services/pdf-extract';
25
+ import { DomainBlockade } from '../db/domain-blockade';
26
 
27
  const md5Hasher = new HashManager('md5', 'hex');
28
 
 
65
  cacheRetentionMs = 1000 * 3600 * 24 * 7;
66
  cacheValidMs = 1000 * 3600;
67
  urlValidMs = 1000 * 3600 * 4;
68
+ abuseBlockMs = 1000 * 3600 * 24;
69
 
70
  constructor(
71
  protected globalLogger: Logger,
 
89
 
90
  await this.setToCache(options.url, snapshot);
91
  });
92
+
93
+ puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
94
+ this.logger.warn(`Abuse detected on ${abuseEvent.url}, blocking ${abuseEvent.url.hostname}`, { reason: abuseEvent.reason, sn: abuseEvent.sn });
95
+
96
+ await DomainBlockade.save(DomainBlockade.from({
97
+ domain: abuseEvent.url.hostname.toLowerCase(),
98
+ triggerReason: `${abuseEvent.reason}`,
99
+ triggerUrl: abuseEvent.url.toString(),
100
+ createdAt: new Date(),
101
+ expireAt: new Date(Date.now() + this.abuseBlockMs),
102
+ })).catch((err) => {
103
+ this.logger.warn(`Failed to save domain blockade for ${abuseEvent.url.hostname}`, { err: marshalErrorLike(err) });
104
+ });
105
+
106
+ });
107
  }
108
 
109
  override async init() {
 
634
  path: 'url'
635
  });
636
  }
637
+ const blockade = (await DomainBlockade.fromFirestoreQuery(
638
+ DomainBlockade.COLLECTION.where('domain', '==', urlToCrawl.hostname.toLowerCase()).limit(1)
639
+ ))[0];
640
+
641
+ if (blockade) {
642
+ throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
643
+ }
644
 
645
  const crawlOpts = this.configure(crawlerOptions);
646
 
backend/functions/src/db/domain-blockade.ts ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Also, Prop } from 'civkit';
2
+ import { FirestoreRecord } from '../shared/lib/firestore';
3
+
4
+ @Also({
5
+ dictOf: Object
6
+ })
7
+ export class DomainBlockade extends FirestoreRecord {
8
+ static override collectionName = 'domainBlockades';
9
+
10
+ override _id!: string;
11
+
12
+ @Prop({
13
+ required: true
14
+ })
15
+ domain!: string;
16
+
17
+ @Prop({ required: true })
18
+ triggerReason!: string;
19
+
20
+ @Prop()
21
+ triggerUrl?: string;
22
+
23
+ @Prop()
24
+ createdAt!: Date;
25
+
26
+ @Prop()
27
+ expireAt?: Date;
28
+
29
+ [k: string]: any;
30
+ }
backend/functions/src/services/puppeteer.ts CHANGED
@@ -10,7 +10,7 @@ import puppeteer from 'puppeteer-extra';
10
 
11
  import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
12
  import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
13
- import { ServiceCrashedError } from '../shared/lib/errors';
14
  import { Readability } from '@mozilla/readability';
15
 
16
  const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
@@ -270,17 +270,36 @@ function giveSnapshot(stopActiveSnapshot) {
270
 
271
  await page.goto('about:blank', { waitUntil: 'domcontentloaded' });
272
 
 
 
 
273
  page.on('request', (req) => {
 
274
  const requestUrl = req.url();
275
  if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
276
  return req.abort('blockedbyclient', 1000);
277
  }
278
  const parsedUrl = new URL(requestUrl);
 
279
 
280
  if (
281
  parsedUrl.hostname === 'localhost' ||
282
  parsedUrl.hostname.startsWith('127.')
283
  ) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  return req.abort('blockedbyclient', 1000);
285
  }
286
 
@@ -408,6 +427,12 @@ document.addEventListener('load', handlePageLoad);
408
  });
409
  };
410
  page.on('snapshot', hdl);
 
 
 
 
 
 
411
 
412
  const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
413
  .catch((err) => {
 
10
 
11
  import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
12
  import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
13
+ import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
14
  import { Readability } from '@mozilla/readability';
15
 
16
  const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
 
270
 
271
  await page.goto('about:blank', { waitUntil: 'domcontentloaded' });
272
 
273
+ const domainSet = new Set<string>();
274
+ let reqCounter = 0;
275
+
276
  page.on('request', (req) => {
277
+ reqCounter++;
278
  const requestUrl = req.url();
279
  if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
280
  return req.abort('blockedbyclient', 1000);
281
  }
282
  const parsedUrl = new URL(requestUrl);
283
+ domainSet.add(parsedUrl.hostname);
284
 
285
  if (
286
  parsedUrl.hostname === 'localhost' ||
287
  parsedUrl.hostname.startsWith('127.')
288
  ) {
289
+ page.emit('abuse', { url: requestUrl, page, sn, reason: `Suspicious action: Request to localhost: ${requestUrl}` });
290
+
291
+ return req.abort('blockedbyclient', 1000);
292
+ }
293
+
294
+ if (reqCounter > 200) {
295
+ page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many requests: ${reqCounter}` });
296
+
297
+ return req.abort('blockedbyclient', 1000);
298
+ }
299
+
300
+ if (domainSet.size > 21) {
301
+ page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many domains (${domainSet.size})` });
302
+
303
  return req.abort('blockedbyclient', 1000);
304
  }
305
 
 
427
  });
428
  };
429
  page.on('snapshot', hdl);
430
+ page.once('abuse', (event: any) => {
431
+ this.emit('abuse', { ...event, url: parsedUrl });
432
+ nextSnapshotDeferred.reject(
433
+ new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
434
+ );
435
+ });
436
 
437
  const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
438
  .catch((err) => {
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit a3a13b13fbef8e9f5d388bde6fca6b459e6f92a6
 
1
+ Subproject commit d360d01c19b34499e564315b5b5935df17c62cc1