Spaces:
Build error
Build error
fix
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -590,33 +590,26 @@ export class CrawlerHost extends RPCHost {
|
|
| 590 |
}
|
| 591 |
|
| 592 |
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
|
|
|
|
| 593 |
if (crawlerOpts?.html) {
|
| 594 |
-
const fakeSnapshot = {
|
| 595 |
href: urlToCrawl.toString(),
|
| 596 |
html: crawlerOpts.html,
|
| 597 |
title: '',
|
| 598 |
text: '',
|
| 599 |
} as PageSnapshot;
|
| 600 |
-
|
| 601 |
-
yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
|
| 602 |
-
|
| 603 |
-
return;
|
| 604 |
}
|
| 605 |
|
| 606 |
if (crawlerOpts?.pdf) {
|
| 607 |
const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
|
| 608 |
const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
|
| 609 |
-
const fakeSnapshot = {
|
| 610 |
href: urlToCrawl.toString(),
|
| 611 |
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
|
| 612 |
title: '',
|
| 613 |
text: '',
|
| 614 |
pdfs: [pdfDataUrl],
|
| 615 |
} as PageSnapshot;
|
| 616 |
-
|
| 617 |
-
yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
|
| 618 |
-
|
| 619 |
-
return;
|
| 620 |
}
|
| 621 |
|
| 622 |
if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
|
|
@@ -668,6 +661,12 @@ export class CrawlerHost extends RPCHost {
|
|
| 668 |
return;
|
| 669 |
}
|
| 670 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 671 |
let cache;
|
| 672 |
|
| 673 |
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
|
|
|
|
| 590 |
}
|
| 591 |
|
| 592 |
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
|
| 593 |
+
let overrideFinalSnapshot;
|
| 594 |
if (crawlerOpts?.html) {
|
| 595 |
+
overrideFinalSnapshot = {
|
| 596 |
href: urlToCrawl.toString(),
|
| 597 |
html: crawlerOpts.html,
|
| 598 |
title: '',
|
| 599 |
text: '',
|
| 600 |
} as PageSnapshot;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
}
|
| 602 |
|
| 603 |
if (crawlerOpts?.pdf) {
|
| 604 |
const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
|
| 605 |
const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
|
| 606 |
+
overrideFinalSnapshot = {
|
| 607 |
href: urlToCrawl.toString(),
|
| 608 |
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
|
| 609 |
title: '',
|
| 610 |
text: '',
|
| 611 |
pdfs: [pdfDataUrl],
|
| 612 |
} as PageSnapshot;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 613 |
}
|
| 614 |
|
| 615 |
if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
|
|
|
|
| 661 |
return;
|
| 662 |
}
|
| 663 |
|
| 664 |
+
if (overrideFinalSnapshot) {
|
| 665 |
+
yield this.jsdomControl.narrowSnapshot(overrideFinalSnapshot, crawlOpts);
|
| 666 |
+
|
| 667 |
+
return;
|
| 668 |
+
}
|
| 669 |
+
|
| 670 |
let cache;
|
| 671 |
|
| 672 |
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
|