Spaces:
Build error
Build error
fix: dos abuse
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -881,7 +881,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 881 |
|
| 882 |
yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
|
| 883 |
} catch (err: any) {
|
| 884 |
-
if (cache) {
|
| 885 |
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
|
| 886 |
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts);
|
| 887 |
return;
|
|
|
|
| 881 |
|
| 882 |
yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
|
| 883 |
} catch (err: any) {
|
| 884 |
+
if (cache && !(err instanceof SecurityCompromiseError)) {
|
| 885 |
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
|
| 886 |
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts);
|
| 887 |
return;
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -52,6 +52,7 @@ export interface PageSnapshot {
|
|
| 52 |
screenshot?: Buffer;
|
| 53 |
imgs?: ImgBrief[];
|
| 54 |
pdfs?: string[];
|
|
|
|
| 55 |
}
|
| 56 |
|
| 57 |
export interface ExtendedSnapshot extends PageSnapshot {
|
|
@@ -235,6 +236,32 @@ function briefPDFs() {
|
|
| 235 |
return x.src === 'about:blank' ? document.location.href : x.src;
|
| 236 |
});
|
| 237 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
function giveSnapshot(stopActiveSnapshot) {
|
| 239 |
if (stopActiveSnapshot) {
|
| 240 |
window.haltSnapshot = true;
|
|
@@ -254,6 +281,7 @@ function giveSnapshot(stopActiveSnapshot) {
|
|
| 254 |
parsed: parsed,
|
| 255 |
imgs: [],
|
| 256 |
pdfs: briefPDFs(),
|
|
|
|
| 257 |
};
|
| 258 |
if (parsed && parsed.content) {
|
| 259 |
const elem = document.createElement('div');
|
|
@@ -277,7 +305,7 @@ function giveSnapshot(stopActiveSnapshot) {
|
|
| 277 |
|
| 278 |
const domainSet = new Set<string>();
|
| 279 |
let reqCounter = 0;
|
| 280 |
-
|
| 281 |
let halt = false;
|
| 282 |
|
| 283 |
page.on('request', (req) => {
|
|
@@ -285,6 +313,7 @@ function giveSnapshot(stopActiveSnapshot) {
|
|
| 285 |
if (halt) {
|
| 286 |
return req.abort('blockedbyclient', 1000);
|
| 287 |
}
|
|
|
|
| 288 |
const requestUrl = req.url();
|
| 289 |
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
|
| 290 |
return req.abort('blockedbyclient', 1000);
|
|
@@ -446,6 +475,10 @@ document.addEventListener('load', handlePageLoad);
|
|
| 446 |
if (snapshot === s) {
|
| 447 |
return;
|
| 448 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
snapshot = s;
|
| 450 |
nextSnapshotDeferred.resolve(s);
|
| 451 |
nextSnapshotDeferred = Defer();
|
|
@@ -516,7 +549,7 @@ document.addEventListener('load', handlePageLoad);
|
|
| 516 |
ckpt.push(delay(options.minIntervalMs));
|
| 517 |
}
|
| 518 |
let error;
|
| 519 |
-
await Promise.race(ckpt).catch((err)=> error = err);
|
| 520 |
if (finalized) {
|
| 521 |
yield { ...snapshot, screenshot } as PageSnapshot;
|
| 522 |
break;
|
|
|
|
| 52 |
screenshot?: Buffer;
|
| 53 |
imgs?: ImgBrief[];
|
| 54 |
pdfs?: string[];
|
| 55 |
+
maxElemDepth?: number;
|
| 56 |
}
|
| 57 |
|
| 58 |
export interface ExtendedSnapshot extends PageSnapshot {
|
|
|
|
| 236 |
return x.src === 'about:blank' ? document.location.href : x.src;
|
| 237 |
});
|
| 238 |
}
|
| 239 |
+
function getMaxDepthUsingTreeWalker(root) {
|
| 240 |
+
let maxDepth = 0;
|
| 241 |
+
let currentDepth = 0;
|
| 242 |
+
|
| 243 |
+
const treeWalker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, null, false);
|
| 244 |
+
|
| 245 |
+
while (true) {
|
| 246 |
+
maxDepth = Math.max(maxDepth, currentDepth);
|
| 247 |
+
|
| 248 |
+
if (treeWalker.firstChild()) {
|
| 249 |
+
currentDepth++;
|
| 250 |
+
} else {
|
| 251 |
+
while (!treeWalker.nextSibling() && currentDepth > 0) {
|
| 252 |
+
treeWalker.parentNode();
|
| 253 |
+
currentDepth--;
|
| 254 |
+
}
|
| 255 |
+
|
| 256 |
+
if (currentDepth <= 0) {
|
| 257 |
+
break;
|
| 258 |
+
}
|
| 259 |
+
}
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
return maxDepth + 1;
|
| 263 |
+
}
|
| 264 |
+
|
| 265 |
function giveSnapshot(stopActiveSnapshot) {
|
| 266 |
if (stopActiveSnapshot) {
|
| 267 |
window.haltSnapshot = true;
|
|
|
|
| 281 |
parsed: parsed,
|
| 282 |
imgs: [],
|
| 283 |
pdfs: briefPDFs(),
|
| 284 |
+
maxElemDepth: getMaxDepthUsingTreeWalker(document.documentElement)
|
| 285 |
};
|
| 286 |
if (parsed && parsed.content) {
|
| 287 |
const elem = document.createElement('div');
|
|
|
|
| 305 |
|
| 306 |
const domainSet = new Set<string>();
|
| 307 |
let reqCounter = 0;
|
| 308 |
+
let t0: number | undefined;
|
| 309 |
let halt = false;
|
| 310 |
|
| 311 |
page.on('request', (req) => {
|
|
|
|
| 313 |
if (halt) {
|
| 314 |
return req.abort('blockedbyclient', 1000);
|
| 315 |
}
|
| 316 |
+
t0 ??= Date.now();
|
| 317 |
const requestUrl = req.url();
|
| 318 |
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
|
| 319 |
return req.abort('blockedbyclient', 1000);
|
|
|
|
| 475 |
if (snapshot === s) {
|
| 476 |
return;
|
| 477 |
}
|
| 478 |
+
if (s?.maxElemDepth && s.maxElemDepth > 256) {
|
| 479 |
+
page.emit('abuse', { url, page, sn, reason: `DoS attack suspected: DOM tree too deep` });
|
| 480 |
+
return;
|
| 481 |
+
}
|
| 482 |
snapshot = s;
|
| 483 |
nextSnapshotDeferred.resolve(s);
|
| 484 |
nextSnapshotDeferred = Defer();
|
|
|
|
| 549 |
ckpt.push(delay(options.minIntervalMs));
|
| 550 |
}
|
| 551 |
let error;
|
| 552 |
+
await Promise.race(ckpt).catch((err) => error = err);
|
| 553 |
if (finalized) {
|
| 554 |
yield { ...snapshot, screenshot } as PageSnapshot;
|
| 555 |
break;
|