Spaces:
Build error
Build error
fix: potential circular crawling
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -581,6 +581,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 581 |
{ contentType: 'text/plain', envelope: null }
|
| 582 |
);
|
| 583 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 584 |
if (uid) {
|
| 585 |
const user = await auth.assertUser();
|
| 586 |
if (!(user.wallet.total_balance > 0)) {
|
|
@@ -638,15 +644,17 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 638 |
path: 'url'
|
| 639 |
});
|
| 640 |
}
|
| 641 |
-
const blockade = (await DomainBlockade.fromFirestoreQuery(
|
| 642 |
-
DomainBlockade.COLLECTION
|
| 643 |
-
.where('domain', '==', urlToCrawl.hostname.toLowerCase())
|
| 644 |
-
.where('expireAt', '>=', new Date())
|
| 645 |
-
.limit(1)
|
| 646 |
-
))[0];
|
| 647 |
|
| 648 |
-
if (
|
| 649 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 650 |
}
|
| 651 |
|
| 652 |
const crawlOpts = this.configure(crawlerOptions);
|
|
|
|
| 581 |
{ contentType: 'text/plain', envelope: null }
|
| 582 |
);
|
| 583 |
}
|
| 584 |
+
|
| 585 |
+
// Prevent circular crawling: register the host this request arrived on so the browser aborts any page request back to it
|
| 586 |
+
this.puppeteerControl.circuitBreakerHosts.add(
|
| 587 |
+
ctx.req.hostname.toLowerCase()
|
| 588 |
+
);
|
| 589 |
+
|
| 590 |
if (uid) {
|
| 591 |
const user = await auth.assertUser();
|
| 592 |
if (!(user.wallet.total_balance > 0)) {
|
|
|
|
| 644 |
path: 'url'
|
| 645 |
});
|
| 646 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 647 |
|
| 648 |
+
if (!uid) {
|
| 649 |
+
const blockade = (await DomainBlockade.fromFirestoreQuery(
|
| 650 |
+
DomainBlockade.COLLECTION
|
| 651 |
+
.where('domain', '==', urlToCrawl.hostname.toLowerCase())
|
| 652 |
+
.where('expireAt', '>=', new Date())
|
| 653 |
+
.limit(1)
|
| 654 |
+
))[0];
|
| 655 |
+
if (blockade) {
|
| 656 |
+
throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
|
| 657 |
+
}
|
| 658 |
}
|
| 659 |
|
| 660 |
const crawlOpts = this.configure(crawlerOptions);
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -100,6 +100,8 @@ export class PuppeteerControl extends AsyncService {
|
|
| 100 |
livePages = new Set<Page>();
|
| 101 |
lastPageCratedAt: number = 0;
|
| 102 |
|
|
|
|
|
|
|
| 103 |
constructor(
|
| 104 |
protected globalLogger: Logger,
|
| 105 |
) {
|
|
@@ -285,6 +287,12 @@ function giveSnapshot(stopActiveSnapshot) {
|
|
| 285 |
|
| 286 |
const parsedUrl = new URL(requestUrl);
|
| 287 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
if (
|
| 289 |
parsedUrl.hostname === 'localhost' ||
|
| 290 |
parsedUrl.hostname.startsWith('127.')
|
|
|
|
| 100 |
livePages = new Set<Page>();
|
| 101 |
lastPageCratedAt: number = 0;
|
| 102 |
|
| 103 |
+
circuitBreakerHosts: Set<string> = new Set();
|
| 104 |
+
|
| 105 |
constructor(
|
| 106 |
protected globalLogger: Logger,
|
| 107 |
) {
|
|
|
|
| 287 |
|
| 288 |
const parsedUrl = new URL(requestUrl);
|
| 289 |
|
| 290 |
+
if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {
|
| 291 |
+
page.emit('abuse', { url: requestUrl, page, sn, reason: `Abusive request: ${requestUrl}` });
|
| 292 |
+
|
| 293 |
+
return req.abort('blockedbyclient', 1000);
|
| 294 |
+
}
|
| 295 |
+
|
| 296 |
if (
|
| 297 |
parsedUrl.hostname === 'localhost' ||
|
| 298 |
parsedUrl.hostname.startsWith('127.')
|