nomagick committed on
Commit
f7f6a98
·
unverified ·
1 Parent(s): f3654a7

fix: readerlm params

Browse files
Files changed (1) hide show
  1. src/api/crawler.ts +11 -8
src/api/crawler.ts CHANGED
@@ -338,7 +338,7 @@ export class CrawlerHost extends RPCHost {
338
  }
339
 
340
  const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
341
- chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
342
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
343
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
344
  }
@@ -379,7 +379,7 @@ export class CrawlerHost extends RPCHost {
379
  }
380
 
381
  const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
382
- chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
383
 
384
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
385
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@@ -405,7 +405,7 @@ export class CrawlerHost extends RPCHost {
405
  }
406
 
407
  const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
408
- chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
409
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
410
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
411
  }
@@ -434,7 +434,7 @@ export class CrawlerHost extends RPCHost {
434
  }
435
 
436
  const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
437
- chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
438
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
439
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
440
  }
@@ -466,7 +466,7 @@ export class CrawlerHost extends RPCHost {
466
  }
467
 
468
  const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
469
- chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
470
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
471
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
472
  }
@@ -674,7 +674,10 @@ export class CrawlerHost extends RPCHost {
674
  const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
675
  ...crawlOpts,
676
  engine: crawlOpts?.engine || ENGINE_TYPE.AUTO,
677
- }, crawlerOpts);
 
 
 
678
 
679
  if (!finalAutoSnapshot?.html) {
680
  throw new AssertionFailureError(`Unexpected non HTML content for ReaderLM: ${urlToCrawl}`);
@@ -890,7 +893,7 @@ export class CrawlerHost extends RPCHost {
890
  }
891
  }
892
 
893
- assignChargeAmount(formatted: FormattedPage, scrappingOptions?: ExtraScrappingOptions) {
894
  if (!formatted) {
895
  return 0;
896
  }
@@ -898,7 +901,7 @@ export class CrawlerHost extends RPCHost {
898
  let amount = 0;
899
  if (formatted.content) {
900
  const x1 = estimateToken(formatted.content);
901
- if (scrappingOptions?.engine?.toLowerCase().includes('lm')) {
902
  amount += x1 * 2;
903
  }
904
  amount += x1;
 
338
  }
339
 
340
  const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
341
+ chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
342
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
343
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
344
  }
 
379
  }
380
 
381
  const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
382
+ chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
383
 
384
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
385
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
 
405
  }
406
 
407
  const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
408
+ chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
409
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
410
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
411
  }
 
434
  }
435
 
436
  const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
437
+ chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
438
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
439
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
440
  }
 
466
  }
467
 
468
  const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
469
+ chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
470
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
471
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
472
  }
 
674
  const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
675
  ...crawlOpts,
676
  engine: crawlOpts?.engine || ENGINE_TYPE.AUTO,
677
+ }, CrawlerOptions.from({
678
+ ...crawlerOpts,
679
+ respondWith: 'html',
680
+ }));
681
 
682
  if (!finalAutoSnapshot?.html) {
683
  throw new AssertionFailureError(`Unexpected non HTML content for ReaderLM: ${urlToCrawl}`);
 
893
  }
894
  }
895
 
896
+ assignChargeAmount(formatted: FormattedPage, crawlerOptions?: CrawlerOptions) {
897
  if (!formatted) {
898
  return 0;
899
  }
 
901
  let amount = 0;
902
  if (formatted.content) {
903
  const x1 = estimateToken(formatted.content);
904
+ if (crawlerOptions?.respondWith?.toLowerCase().includes('lm')) {
905
  amount += x1 * 2;
906
  }
907
  amount += x1;