nomagick committed on
Commit
3a40db2
·
unverified ·
1 Parent(s): b4b99f0

fix: cache use and edge cases

Browse files
Files changed (1) hide show
  1. src/api/crawler.ts +62 -41
src/api/crawler.ts CHANGED
@@ -107,6 +107,9 @@ export class CrawlerHost extends RPCHost {
107
  // Potentially mangled content; don't cache if scripts are injected
108
  return;
109
  }
 
 
 
110
  if (options.locale) {
111
  Reflect.set(snapshot, 'locale', options.locale);
112
  }
@@ -360,27 +363,36 @@ export class CrawlerHost extends RPCHost {
360
 
361
  let lastScrapped;
362
  if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
363
- for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
364
- lastScrapped = scrapped;
365
- if (rpcReflect.signal.aborted) {
366
- break;
367
- }
368
- if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
369
- continue;
370
- }
 
 
 
 
371
 
372
- const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
373
- chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
374
 
375
- if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
376
- throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
377
- }
378
 
379
- if (scrapped?.pdfs?.length && !chargeAmount) {
380
- continue;
381
- }
382
 
383
- return formatted;
 
 
 
 
 
384
  }
385
 
386
  if (!lastScrapped) {
@@ -406,33 +418,42 @@ export class CrawlerHost extends RPCHost {
406
  });
407
  }
408
 
409
- for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
410
- lastScrapped = scrapped;
411
- if (rpcReflect.signal.aborted) {
412
- break;
413
- }
414
- if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
415
- continue;
416
- }
 
 
 
 
417
 
418
- const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
419
- chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
420
- if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
421
- throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
422
- }
 
 
 
 
 
 
 
 
 
 
 
423
 
424
- if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
425
- return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
426
- { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
427
- );
428
  }
429
- if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
430
- return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
431
- { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
432
- );
433
  }
434
-
435
- return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
436
  }
437
 
438
  if (!lastScrapped) {
@@ -733,7 +754,7 @@ export class CrawlerHost extends RPCHost {
733
 
734
  let cache = (await cacheIt.next()).value;
735
  if (cache?.htmlSignificantlyModifiedByJs === false) {
736
- if (crawlerOpts) {
737
  crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML;
738
  }
739
  }
 
107
  // Potentially mangled content; don't cache if scripts are injected
108
  return;
109
  }
110
+ if (snapshot.isIntermediate) {
111
+ return;
112
+ }
113
  if (options.locale) {
114
  Reflect.set(snapshot, 'locale', options.locale);
115
  }
 
363
 
364
  let lastScrapped;
365
  if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
366
+ try {
367
+ for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
368
+ lastScrapped = scrapped;
369
+ if (rpcReflect.signal.aborted) {
370
+ break;
371
+ }
372
+ if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
373
+ continue;
374
+ }
375
+ if (!scrapped.title) {
376
+ continue;
377
+ }
378
 
379
+ const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
380
+ chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
381
 
382
+ if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
383
+ throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
384
+ }
385
 
386
+ if (scrapped?.pdfs?.length && !chargeAmount) {
387
+ continue;
388
+ }
389
 
390
+ return formatted;
391
+ }
392
+ } catch (err) {
393
+ if (!lastScrapped) {
394
+ throw err;
395
+ }
396
  }
397
 
398
  if (!lastScrapped) {
 
418
  });
419
  }
420
 
421
+ try {
422
+ for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
423
+ lastScrapped = scrapped;
424
+ if (rpcReflect.signal.aborted) {
425
+ break;
426
+ }
427
+ if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
428
+ continue;
429
+ }
430
+ if (!scrapped.title) {
431
+ continue;
432
+ }
433
 
434
+ const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
435
+ chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
436
+ if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
437
+ throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
438
+ }
439
+
440
+ if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
441
+ return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
442
+ { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
443
+ );
444
+ }
445
+ if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
446
+ return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
447
+ { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
448
+ );
449
+ }
450
 
451
+ return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
 
 
 
452
  }
453
+ } catch (err) {
454
+ if (!lastScrapped) {
455
+ throw err;
 
456
  }
 
 
457
  }
458
 
459
  if (!lastScrapped) {
 
754
 
755
  let cache = (await cacheIt.next()).value;
756
  if (cache?.htmlSignificantlyModifiedByJs === false) {
757
+ if (crawlerOpts && crawlerOpts.timeout === undefined) {
758
  crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML;
759
  }
760
  }