Spaces:
Build error
Build error
fix: cache use and edge cases
Browse files- src/api/crawler.ts +62 -41
src/api/crawler.ts
CHANGED
|
@@ -107,6 +107,9 @@ export class CrawlerHost extends RPCHost {
|
|
| 107 |
// Potentially mangled content; don't cache if scripts are injected
|
| 108 |
return;
|
| 109 |
}
|
|
|
|
|
|
|
|
|
|
| 110 |
if (options.locale) {
|
| 111 |
Reflect.set(snapshot, 'locale', options.locale);
|
| 112 |
}
|
|
@@ -360,27 +363,36 @@ export class CrawlerHost extends RPCHost {
|
|
| 360 |
|
| 361 |
let lastScrapped;
|
| 362 |
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
|
| 372 |
-
|
| 373 |
-
|
| 374 |
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
|
| 383 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
}
|
| 385 |
|
| 386 |
if (!lastScrapped) {
|
|
@@ -406,33 +418,42 @@ export class CrawlerHost extends RPCHost {
|
|
| 406 |
});
|
| 407 |
}
|
| 408 |
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 423 |
|
| 424 |
-
|
| 425 |
-
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
| 426 |
-
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
| 427 |
-
);
|
| 428 |
}
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
);
|
| 433 |
}
|
| 434 |
-
|
| 435 |
-
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
|
| 436 |
}
|
| 437 |
|
| 438 |
if (!lastScrapped) {
|
|
@@ -733,7 +754,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 733 |
|
| 734 |
let cache = (await cacheIt.next()).value;
|
| 735 |
if (cache?.htmlSignificantlyModifiedByJs === false) {
|
| 736 |
-
if (crawlerOpts) {
|
| 737 |
crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML;
|
| 738 |
}
|
| 739 |
}
|
|
|
|
| 107 |
// Potentially mangled content; don't cache if scripts are injected
|
| 108 |
return;
|
| 109 |
}
|
| 110 |
+
if (snapshot.isIntermediate) {
|
| 111 |
+
return;
|
| 112 |
+
}
|
| 113 |
if (options.locale) {
|
| 114 |
Reflect.set(snapshot, 'locale', options.locale);
|
| 115 |
}
|
|
|
|
| 363 |
|
| 364 |
let lastScrapped;
|
| 365 |
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
|
| 366 |
+
try {
|
| 367 |
+
for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
|
| 368 |
+
lastScrapped = scrapped;
|
| 369 |
+
if (rpcReflect.signal.aborted) {
|
| 370 |
+
break;
|
| 371 |
+
}
|
| 372 |
+
if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
|
| 373 |
+
continue;
|
| 374 |
+
}
|
| 375 |
+
if (!scrapped.title) {
|
| 376 |
+
continue;
|
| 377 |
+
}
|
| 378 |
|
| 379 |
+
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 380 |
+
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
| 381 |
|
| 382 |
+
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 383 |
+
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 384 |
+
}
|
| 385 |
|
| 386 |
+
if (scrapped?.pdfs?.length && !chargeAmount) {
|
| 387 |
+
continue;
|
| 388 |
+
}
|
| 389 |
|
| 390 |
+
return formatted;
|
| 391 |
+
}
|
| 392 |
+
} catch (err) {
|
| 393 |
+
if (!lastScrapped) {
|
| 394 |
+
throw err;
|
| 395 |
+
}
|
| 396 |
}
|
| 397 |
|
| 398 |
if (!lastScrapped) {
|
|
|
|
| 418 |
});
|
| 419 |
}
|
| 420 |
|
| 421 |
+
try {
|
| 422 |
+
for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
|
| 423 |
+
lastScrapped = scrapped;
|
| 424 |
+
if (rpcReflect.signal.aborted) {
|
| 425 |
+
break;
|
| 426 |
+
}
|
| 427 |
+
if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
|
| 428 |
+
continue;
|
| 429 |
+
}
|
| 430 |
+
if (!scrapped.title) {
|
| 431 |
+
continue;
|
| 432 |
+
}
|
| 433 |
|
| 434 |
+
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 435 |
+
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
| 436 |
+
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 437 |
+
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 438 |
+
}
|
| 439 |
+
|
| 440 |
+
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 441 |
+
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
| 442 |
+
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
| 443 |
+
);
|
| 444 |
+
}
|
| 445 |
+
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
| 446 |
+
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
| 447 |
+
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
| 448 |
+
);
|
| 449 |
+
}
|
| 450 |
|
| 451 |
+
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
|
|
|
|
|
|
|
|
|
|
| 452 |
}
|
| 453 |
+
} catch (err) {
|
| 454 |
+
if (!lastScrapped) {
|
| 455 |
+
throw err;
|
|
|
|
| 456 |
}
|
|
|
|
|
|
|
| 457 |
}
|
| 458 |
|
| 459 |
if (!lastScrapped) {
|
|
|
|
| 754 |
|
| 755 |
let cache = (await cacheIt.next()).value;
|
| 756 |
if (cache?.htmlSignificantlyModifiedByJs === false) {
|
| 757 |
+
if (crawlerOpts && crawlerOpts.timeout === undefined) {
|
| 758 |
crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML;
|
| 759 |
}
|
| 760 |
}
|