nomagick committed on
Commit
2a30fce
·
unverified ·
1 Parent(s): 3a40db2

fix: bring back content based respond timing

Browse files
Files changed (2) hide show
  1. src/api/crawler.ts +10 -7
  2. src/dto/crawler-options.ts +11 -6
src/api/crawler.ts CHANGED
@@ -793,7 +793,7 @@ export class CrawlerHost extends RPCHost {
793
  if (!sideLoaded.file) {
794
  throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
795
  }
796
- let draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(
797
  urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName
798
  ).catch((err) => {
799
  if (err instanceof ApplicationError) {
@@ -809,6 +809,9 @@ export class CrawlerHost extends RPCHost {
809
  let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
810
  draftSnapshot.title ??= analyzed.title;
811
  draftSnapshot.isIntermediate = true;
 
 
 
812
  let fallbackProxyIsUsed = false;
813
  if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
814
  (analyzed.tokens < 42 || sideLoaded.status !== 200)
@@ -825,19 +828,19 @@ export class CrawlerHost extends RPCHost {
825
  }
826
  return Promise.reject(err);
827
  });
 
 
828
  analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
829
  if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
830
- draftSnapshot = proxySnapshot;
831
- draftSnapshot.isIntermediate = true;
 
 
832
  sideLoaded = proxyLoaded;
833
  fallbackProxyIsUsed = true;
834
  }
835
  }
836
 
837
- if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
838
- yield draftSnapshot;
839
- }
840
-
841
  if (crawlOpts && (sideLoaded.status === 200 || analyzed.tokens >= 200 || crawlOpts.allocProxy)) {
842
  this.logger.info(`Side load seems to work, applying to crawler.`, { url: urlToCrawl.href });
843
  crawlOpts.sideLoad ??= sideLoaded.sideLoadOpts;
 
793
  if (!sideLoaded.file) {
794
  throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
795
  }
796
+ const draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(
797
  urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName
798
  ).catch((err) => {
799
  if (err instanceof ApplicationError) {
 
809
  let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
810
  draftSnapshot.title ??= analyzed.title;
811
  draftSnapshot.isIntermediate = true;
812
+ if (crawlerOpts?.browserIsNotRequired()) {
813
+ yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
814
+ }
815
  let fallbackProxyIsUsed = false;
816
  if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
817
  (analyzed.tokens < 42 || sideLoaded.status !== 200)
 
828
  }
829
  return Promise.reject(err);
830
  });
831
+ if (proxyLoaded.status === 200 && crawlerOpts?.browserIsNotRequired()) {
832
+ }
833
  analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
834
  if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
835
+ proxySnapshot.isIntermediate = true;
836
+ if (crawlerOpts?.browserIsNotRequired()) {
837
+ yield this.jsdomControl.narrowSnapshot(proxySnapshot, crawlOpts);
838
+ }
839
  sideLoaded = proxyLoaded;
840
  fallbackProxyIsUsed = true;
841
  }
842
  }
843
 
 
 
 
 
844
  if (crawlOpts && (sideLoaded.status === 200 || analyzed.tokens >= 200 || crawlOpts.allocProxy)) {
845
  this.logger.info(`Side load seems to work, applying to crawler.`, { url: urlToCrawl.href });
846
  crawlOpts.sideLoad ??= sideLoaded.sideLoadOpts;
src/dto/crawler-options.ts CHANGED
@@ -25,6 +25,7 @@ export enum ENGINE_TYPE {
25
 
26
  export enum RESPOND_TIMING {
27
  HTML = 'html',
 
28
  MUTATION_IDLE = 'mutation-idle',
29
  RESOURCE_IDLE = 'resource-idle',
30
  MEDIA_IDLE = 'media-idle',
@@ -222,11 +223,12 @@ class Viewport extends AutoCastable {
222
  },
223
  'X-Respond-Timing': {
224
  description: `Explicitly specify the respond timing. One of the following:\n\n` +
225
- `- html: unrendered HTML is enough to return\n` +
 
226
  `- mutation-idle: wait for DOM mutations to settle and remain unchanged for at least 0.2s\n` +
227
- `- resource-idle: wait for no additional resources that would affect page logic and content SUCCEEDED loading for at least 0.5s\n` +
228
- `- media-idle: wait for no additional resources, including media resources, SUCCEEDED loading for at least 0.5s\n` +
229
- `- network-idle: wait for full load of webpage, as usual.\n\n`,
230
  in: 'header',
231
  schema: { type: 'string' }
232
  },
@@ -600,7 +602,7 @@ export class CrawlerOptions extends AutoCastable {
600
  if (this.respondTiming) {
601
  return this.respondTiming;
602
  }
603
- if (this.timeout) {
604
  return RESPOND_TIMING.NETWORK_IDLE;
605
  }
606
  if (this.respondWith.includes('shot') || this.respondWith.includes('vlm')) {
@@ -636,6 +638,9 @@ export class CrawlerOptions extends AutoCastable {
636
  if (this.injectFrameScript?.length || this.injectPageScript?.length) {
637
  return false;
638
  }
 
 
 
639
  if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
640
  return true;
641
  }
@@ -677,7 +682,7 @@ export class CrawlerOptions extends AutoCastable {
677
  }
678
 
679
  browserIsNotRequired() {
680
- if (this.respondTiming && this.respondTiming !== RESPOND_TIMING.HTML) {
681
  return false;
682
  }
683
  if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
 
25
 
26
  export enum RESPOND_TIMING {
27
  HTML = 'html',
28
+ VISIBLE_CONTENT = 'visible-content',
29
  MUTATION_IDLE = 'mutation-idle',
30
  RESOURCE_IDLE = 'resource-idle',
31
  MEDIA_IDLE = 'media-idle',
 
223
  },
224
  'X-Respond-Timing': {
225
  description: `Explicitly specify the respond timing. One of the following:\n\n` +
226
+ `- html: directly return unrendered HTML\n` +
227
+ `- visible-content: return immediately when any content becomes available\n` +
228
  `- mutation-idle: wait for DOM mutations to settle and remain unchanged for at least 0.2s\n` +
229
+ `- resource-idle: wait for no additional resources that would affect page logic and content has SUCCEEDED loading in 0.5s\n` +
230
+ `- media-idle: wait for no additional resources, including media resources, has SUCCEEDED loading in 0.5s\n` +
231
+ `- network-idle: wait for full load of webpage, also known as networkidle0.\n\n`,
232
  in: 'header',
233
  schema: { type: 'string' }
234
  },
 
602
  if (this.respondTiming) {
603
  return this.respondTiming;
604
  }
605
+ if (this.timeout && this.timeout >= 20) {
606
  return RESPOND_TIMING.NETWORK_IDLE;
607
  }
608
  if (this.respondWith.includes('shot') || this.respondWith.includes('vlm')) {
 
638
  if (this.injectFrameScript?.length || this.injectPageScript?.length) {
639
  return false;
640
  }
641
+ if (presumedTiming === RESPOND_TIMING.VISIBLE_CONTENT && snapshot.parsed?.content) {
642
+ return true;
643
+ }
644
  if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
645
  return true;
646
  }
 
682
  }
683
 
684
  browserIsNotRequired() {
685
+ if (this.respondTiming && ![RESPOND_TIMING.HTML, RESPOND_TIMING.VISIBLE_CONTENT].includes(this.respondTiming)) {
686
  return false;
687
  }
688
  if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {