Spaces:
Build error
Build error
security: detect abuse
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -5,7 +5,7 @@ import {
|
|
| 5 |
AssertionFailureError, ParamValidationError, Defer,
|
| 6 |
} from 'civkit';
|
| 7 |
import { singleton } from 'tsyringe';
|
| 8 |
-
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
| 9 |
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
| 10 |
import _ from 'lodash';
|
| 11 |
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
|
@@ -22,6 +22,7 @@ import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
|
| 22 |
import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
|
| 23 |
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
| 24 |
import { PDFExtractor } from '../services/pdf-extract';
|
|
|
|
| 25 |
|
| 26 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 27 |
|
|
@@ -64,6 +65,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 64 |
cacheRetentionMs = 1000 * 3600 * 24 * 7;
|
| 65 |
cacheValidMs = 1000 * 3600;
|
| 66 |
urlValidMs = 1000 * 3600 * 4;
|
|
|
|
| 67 |
|
| 68 |
constructor(
|
| 69 |
protected globalLogger: Logger,
|
|
@@ -87,6 +89,21 @@ export class CrawlerHost extends RPCHost {
|
|
| 87 |
|
| 88 |
await this.setToCache(options.url, snapshot);
|
| 89 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
}
|
| 91 |
|
| 92 |
override async init() {
|
|
@@ -617,6 +634,13 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 617 |
path: 'url'
|
| 618 |
});
|
| 619 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 620 |
|
| 621 |
const crawlOpts = this.configure(crawlerOptions);
|
| 622 |
|
|
|
|
| 5 |
AssertionFailureError, ParamValidationError, Defer,
|
| 6 |
} from 'civkit';
|
| 7 |
import { singleton } from 'tsyringe';
|
| 8 |
+
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
|
| 9 |
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
| 10 |
import _ from 'lodash';
|
| 11 |
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
|
|
|
| 22 |
import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
|
| 23 |
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
| 24 |
import { PDFExtractor } from '../services/pdf-extract';
|
| 25 |
+
import { DomainBlockade } from '../db/domain-blockade';
|
| 26 |
|
| 27 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 28 |
|
|
|
|
| 65 |
cacheRetentionMs = 1000 * 3600 * 24 * 7;
|
| 66 |
cacheValidMs = 1000 * 3600;
|
| 67 |
urlValidMs = 1000 * 3600 * 4;
|
| 68 |
+
abuseBlockMs = 1000 * 3600 * 24;
|
| 69 |
|
| 70 |
constructor(
|
| 71 |
protected globalLogger: Logger,
|
|
|
|
| 89 |
|
| 90 |
await this.setToCache(options.url, snapshot);
|
| 91 |
});
|
| 92 |
+
|
| 93 |
+
// Listener for 'abuse' events emitted by puppeteerControl.
// It logs a warning naming the offending URL and hostname, then saves a
// DomainBlockade record for that hostname.
// The event payload is typed inline as { url: URL; reason: string; sn: number }.
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
|
| 94 |
+
this.logger.warn(`Abuse detected on ${abuseEvent.url}, blocking ${abuseEvent.url.hostname}`, { reason: abuseEvent.reason, sn: abuseEvent.sn });
|
| 95 |
+
|
| 96 |
+
// The save is awaited, but its rejection is handled by the .catch() below,
// so a failed write is logged and never propagates out of this listener.
await DomainBlockade.save(DomainBlockade.from({
|
| 97 |
+
// The lookup performed before crawling also lowercases the hostname it compares against.
domain: abuseEvent.url.hostname.toLowerCase(),
|
| 98 |
+
triggerReason: `${abuseEvent.reason}`,
|
| 99 |
+
triggerUrl: abuseEvent.url.toString(),
|
| 100 |
+
createdAt: new Date(),
|
| 101 |
+
// Expiry is `abuseBlockMs` from now; the field is initialised to 1000 * 3600 * 24 ms (24 hours).
expireAt: new Date(Date.now() + this.abuseBlockMs),
|
| 102 |
+
})).catch((err) => {
|
| 103 |
+
this.logger.warn(`Failed to save domain blockade for ${abuseEvent.url.hostname}`, { err: marshalErrorLike(err) });
|
| 104 |
+
});
|
| 105 |
+
|
| 106 |
+
});
|
| 107 |
}
|
| 108 |
|
| 109 |
override async init() {
|
|
|
|
| 634 |
path: 'url'
|
| 635 |
});
|
| 636 |
}
|
| 637 |
+
// Refuse to crawl a host that has a recorded blockade: fetch at most one
// DomainBlockade whose `domain` equals the lowercased target hostname.
const blockade = (await DomainBlockade.fromFirestoreQuery(
|
| 638 |
+
DomainBlockade.COLLECTION.where('domain', '==', urlToCrawl.hostname.toLowerCase()).limit(1)
|
| 639 |
+
))[0];
|
| 640 |
+
|
| 641 |
+
// NOTE(review): the query matches on `domain` only. `expireAt` is interpolated
// into the error message but never compared with the current time here, so a
// record whose expiry has passed still blocks the request unless it is removed
// elsewhere (e.g. by a Firestore TTL policy on `expireAt`) — confirm.
if (blockade) {
|
| 642 |
+
// A record without `expireAt` is reported as blocked 'Eternally'; a record
// without `triggerUrl` is reported as abuse found on 'site'.
throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
|
| 643 |
+
}
|
| 644 |
|
| 645 |
const crawlOpts = this.configure(crawlerOptions);
|
| 646 |
|
backend/functions/src/db/domain-blockade.ts
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { Also, Prop } from 'civkit';
|
| 2 |
+
import { FirestoreRecord } from '../shared/lib/firestore';
|
| 3 |
+
|
| 4 |
+
/**
 * Firestore record describing a domain that is blocked from being crawled.
 * Records live in the `domainBlockades` collection.
 *
 * The crawler code in this same change writes one record per detected abuse
 * (lowercased hostname, reason, triggering URL, creation time, expiry time)
 * and queries the collection by `domain` before crawling a URL.
 */
@Also({
|
| 5 |
+
dictOf: Object
|
| 6 |
+
})
|
| 7 |
+
export class DomainBlockade extends FirestoreRecord {
|
| 8 |
+
static override collectionName = 'domainBlockades';
|
| 9 |
+
|
| 10 |
+
// Redeclares the inherited `_id` as a string.
override _id!: string;
|
| 11 |
+
|
| 12 |
+
// Blocked hostname (required). The crawler stores and queries it lowercased.
@Prop({
|
| 13 |
+
required: true
|
| 14 |
+
})
|
| 15 |
+
domain!: string;
|
| 16 |
+
|
| 17 |
+
// Human-readable reason the blockade was created (required).
@Prop({ required: true })
|
| 18 |
+
triggerReason!: string;
|
| 19 |
+
|
| 20 |
+
// URL on which the abuse was detected (optional).
@Prop()
|
| 21 |
+
triggerUrl?: string;
|
| 22 |
+
|
| 23 |
+
// Time the record was created.
@Prop()
|
| 24 |
+
createdAt!: Date;
|
| 25 |
+
|
| 26 |
+
// Time the blockade is meant to end (optional). The crawler's error message
// reports a record without it as blocked 'Eternally'.
// NOTE(review): nothing in this class enforces the expiry — confirm where
// expired records are filtered out or deleted.
@Prop()
|
| 27 |
+
expireAt?: Date;
|
| 28 |
+
|
| 29 |
+
// Index signature: any additional string-keyed property is allowed, typed as `any`.
[k: string]: any;
|
| 30 |
+
}
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -10,7 +10,7 @@ import puppeteer from 'puppeteer-extra';
|
|
| 10 |
|
| 11 |
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
| 12 |
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
| 13 |
-
import { ServiceCrashedError } from '../shared/lib/errors';
|
| 14 |
import { Readability } from '@mozilla/readability';
|
| 15 |
|
| 16 |
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
|
@@ -270,17 +270,36 @@ function giveSnapshot(stopActiveSnapshot) {
|
|
| 270 |
|
| 271 |
await page.goto('about:blank', { waitUntil: 'domcontentloaded' });
|
| 272 |
|
|
|
|
|
|
|
|
|
|
| 273 |
page.on('request', (req) => {
|
|
|
|
| 274 |
const requestUrl = req.url();
|
| 275 |
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
|
| 276 |
return req.abort('blockedbyclient', 1000);
|
| 277 |
}
|
| 278 |
const parsedUrl = new URL(requestUrl);
|
|
|
|
| 279 |
|
| 280 |
if (
|
| 281 |
parsedUrl.hostname === 'localhost' ||
|
| 282 |
parsedUrl.hostname.startsWith('127.')
|
| 283 |
) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
return req.abort('blockedbyclient', 1000);
|
| 285 |
}
|
| 286 |
|
|
@@ -408,6 +427,12 @@ document.addEventListener('load', handlePageLoad);
|
|
| 408 |
});
|
| 409 |
};
|
| 410 |
page.on('snapshot', hdl);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 411 |
|
| 412 |
const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
|
| 413 |
.catch((err) => {
|
|
|
|
| 10 |
|
| 11 |
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
| 12 |
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
| 13 |
+
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
|
| 14 |
import { Readability } from '@mozilla/readability';
|
| 15 |
|
| 16 |
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
|
|
|
| 270 |
|
| 271 |
await page.goto('about:blank', { waitUntil: 'domcontentloaded' });
|
| 272 |
|
| 273 |
+
const domainSet = new Set<string>();
|
| 274 |
+
let reqCounter = 0;
|
| 275 |
+
|
| 276 |
page.on('request', (req) => {
|
| 277 |
+
reqCounter++;
|
| 278 |
const requestUrl = req.url();
|
| 279 |
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
|
| 280 |
return req.abort('blockedbyclient', 1000);
|
| 281 |
}
|
| 282 |
const parsedUrl = new URL(requestUrl);
|
| 283 |
+
domainSet.add(parsedUrl.hostname);
|
| 284 |
|
| 285 |
if (
|
| 286 |
parsedUrl.hostname === 'localhost' ||
|
| 287 |
parsedUrl.hostname.startsWith('127.')
|
| 288 |
) {
|
| 289 |
+
page.emit('abuse', { url: requestUrl, page, sn, reason: `Suspicious action: Request to localhost: ${requestUrl}` });
|
| 290 |
+
|
| 291 |
+
return req.abort('blockedbyclient', 1000);
|
| 292 |
+
}
|
| 293 |
+
|
| 294 |
+
if (reqCounter > 200) {
|
| 295 |
+
page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many requests: ${reqCounter}` });
|
| 296 |
+
|
| 297 |
+
return req.abort('blockedbyclient', 1000);
|
| 298 |
+
}
|
| 299 |
+
|
| 300 |
+
if (domainSet.size > 21) {
|
| 301 |
+
page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many domains (${domainSet.size})` });
|
| 302 |
+
|
| 303 |
return req.abort('blockedbyclient', 1000);
|
| 304 |
}
|
| 305 |
|
|
|
|
| 427 |
});
|
| 428 |
};
|
| 429 |
page.on('snapshot', hdl);
|
| 430 |
+
// Handle the first 'abuse' event raised on this page. It is registered with
// `once`, so later 'abuse' events from the same page are not handled by this listener.
page.once('abuse', (event: any) => {
|
| 431 |
+
// Re-emit on this object, replacing the event's `url` with `parsedUrl`.
// The request handler emits the offending request URL string as `url`;
// `parsedUrl` is defined outside this hunk — presumably the URL being
// scraped; confirm against the enclosing method.
this.emit('abuse', { ...event, url: parsedUrl });
|
| 432 |
+
// Reject the pending snapshot deferred with a SecurityCompromiseError
// that carries the event's reason.
nextSnapshotDeferred.reject(
|
| 433 |
+
new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
|
| 434 |
+
);
|
| 435 |
+
});
|
| 436 |
|
| 437 |
const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
|
| 438 |
.catch((err) => {
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit d360d01c19b34499e564315b5b5935df17c62cc1
|