nomagick committed on
Commit
6be6051
·
unverified ·
1 Parent(s): 06f3593
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -590,33 +590,26 @@ export class CrawlerHost extends RPCHost {
590
  }
591
 
592
  async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
 
593
  if (crawlerOpts?.html) {
594
- const fakeSnapshot = {
595
  href: urlToCrawl.toString(),
596
  html: crawlerOpts.html,
597
  title: '',
598
  text: '',
599
  } as PageSnapshot;
600
-
601
- yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
602
-
603
- return;
604
  }
605
 
606
  if (crawlerOpts?.pdf) {
607
  const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
608
  const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
609
- const fakeSnapshot = {
610
  href: urlToCrawl.toString(),
611
  html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
612
  title: '',
613
  text: '',
614
  pdfs: [pdfDataUrl],
615
  } as PageSnapshot;
616
-
617
- yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
618
-
619
- return;
620
  }
621
 
622
  if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
@@ -668,6 +661,12 @@ export class CrawlerHost extends RPCHost {
668
  return;
669
  }
670
 
 
 
 
 
 
 
671
  let cache;
672
 
673
  if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
 
590
  }
591
 
592
  async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
593
+ let overrideFinalSnapshot;
594
  if (crawlerOpts?.html) {
595
+ overrideFinalSnapshot = {
596
  href: urlToCrawl.toString(),
597
  html: crawlerOpts.html,
598
  title: '',
599
  text: '',
600
  } as PageSnapshot;
 
 
 
 
601
  }
602
 
603
  if (crawlerOpts?.pdf) {
604
  const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
605
  const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
606
+ overrideFinalSnapshot = {
607
  href: urlToCrawl.toString(),
608
  html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
609
  title: '',
610
  text: '',
611
  pdfs: [pdfDataUrl],
612
  } as PageSnapshot;
 
 
 
 
613
  }
614
 
615
  if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
 
661
  return;
662
  }
663
 
664
+ if (overrideFinalSnapshot) {
665
+ yield this.jsdomControl.narrowSnapshot(overrideFinalSnapshot, crawlOpts);
666
+
667
+ return;
668
+ }
669
+
670
  let cache;
671
 
672
  if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {