Spaces:
Build error
Build error
fix: readerlm params
Browse files
- src/api/crawler.ts +11 -8
src/api/crawler.ts
CHANGED
|
@@ -338,7 +338,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 338 |
}
|
| 339 |
|
| 340 |
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 341 |
-
chargeAmount = this.assignChargeAmount(formatted,
|
| 342 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 343 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 344 |
}
|
|
@@ -379,7 +379,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 379 |
}
|
| 380 |
|
| 381 |
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 382 |
-
chargeAmount = this.assignChargeAmount(formatted,
|
| 383 |
|
| 384 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 385 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
|
@@ -405,7 +405,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 405 |
}
|
| 406 |
|
| 407 |
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 408 |
-
chargeAmount = this.assignChargeAmount(formatted,
|
| 409 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 410 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 411 |
}
|
|
@@ -434,7 +434,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 434 |
}
|
| 435 |
|
| 436 |
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 437 |
-
chargeAmount = this.assignChargeAmount(formatted,
|
| 438 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 439 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 440 |
}
|
|
@@ -466,7 +466,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 466 |
}
|
| 467 |
|
| 468 |
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 469 |
-
chargeAmount = this.assignChargeAmount(formatted,
|
| 470 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 471 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 472 |
}
|
|
@@ -674,7 +674,10 @@ export class CrawlerHost extends RPCHost {
|
|
| 674 |
const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
|
| 675 |
...crawlOpts,
|
| 676 |
engine: crawlOpts?.engine || ENGINE_TYPE.AUTO,
|
| 677 |
-
},
|
|
|
|
|
|
|
|
|
|
| 678 |
|
| 679 |
if (!finalAutoSnapshot?.html) {
|
| 680 |
throw new AssertionFailureError(`Unexpected non HTML content for ReaderLM: ${urlToCrawl}`);
|
|
@@ -890,7 +893,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 890 |
}
|
| 891 |
}
|
| 892 |
|
| 893 |
-
assignChargeAmount(formatted: FormattedPage,
|
| 894 |
if (!formatted) {
|
| 895 |
return 0;
|
| 896 |
}
|
|
@@ -898,7 +901,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 898 |
let amount = 0;
|
| 899 |
if (formatted.content) {
|
| 900 |
const x1 = estimateToken(formatted.content);
|
| 901 |
-
if (
|
| 902 |
amount += x1 * 2;
|
| 903 |
}
|
| 904 |
amount += x1;
|
|
|
|
| 338 |
}
|
| 339 |
|
| 340 |
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 341 |
+
chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
|
| 342 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 343 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 344 |
}
|
|
|
|
| 379 |
}
|
| 380 |
|
| 381 |
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 382 |
+
chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
|
| 383 |
|
| 384 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 385 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
|
|
|
| 405 |
}
|
| 406 |
|
| 407 |
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 408 |
+
chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
|
| 409 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 410 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 411 |
}
|
|
|
|
| 434 |
}
|
| 435 |
|
| 436 |
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 437 |
+
chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
|
| 438 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 439 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 440 |
}
|
|
|
|
| 466 |
}
|
| 467 |
|
| 468 |
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 469 |
+
chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
|
| 470 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 471 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 472 |
}
|
|
|
|
| 674 |
const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
|
| 675 |
...crawlOpts,
|
| 676 |
engine: crawlOpts?.engine || ENGINE_TYPE.AUTO,
|
| 677 |
+
}, CrawlerOptions.from({
|
| 678 |
+
...crawlerOpts,
|
| 679 |
+
respondWith: 'html',
|
| 680 |
+
}));
|
| 681 |
|
| 682 |
if (!finalAutoSnapshot?.html) {
|
| 683 |
throw new AssertionFailureError(`Unexpected non HTML content for ReaderLM: ${urlToCrawl}`);
|
|
|
|
| 893 |
}
|
| 894 |
}
|
| 895 |
|
| 896 |
+
assignChargeAmount(formatted: FormattedPage, crawlerOptions?: CrawlerOptions) {
|
| 897 |
if (!formatted) {
|
| 898 |
return 0;
|
| 899 |
}
|
|
|
|
| 901 |
let amount = 0;
|
| 902 |
if (formatted.content) {
|
| 903 |
const x1 = estimateToken(formatted.content);
|
| 904 |
+
if (crawlerOptions?.respondWith?.toLowerCase().includes('lm')) {
|
| 905 |
amount += x1 * 2;
|
| 906 |
}
|
| 907 |
amount += x1;
|