Spaces:
Build error
Build error
fix: skip side-loading for chromewebstore.google.com, which (for reasons not yet understood) crashes the browser every time
Browse files- src/api/crawler.ts +58 -48
src/api/crawler.ts
CHANGED
|
@@ -735,62 +735,64 @@ export class CrawlerHost extends RPCHost {
|
|
| 735 |
return;
|
| 736 |
}
|
| 737 |
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
this.
|
| 744 |
-
|
| 745 |
-
if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
|
| 746 |
-
return Promise.reject(err);
|
| 747 |
-
}
|
| 748 |
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
|
| 753 |
-
}
|
| 754 |
-
let draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName);
|
| 755 |
-
if (sideLoaded.status == 200 && !sideLoaded.contentType.startsWith('text/html')) {
|
| 756 |
-
yield draftSnapshot;
|
| 757 |
-
return;
|
| 758 |
-
}
|
| 759 |
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
|
| 764 |
-
(analyzed.tokens < 42 || sideLoaded.status !== 200)
|
| 765 |
-
) {
|
| 766 |
-
const proxyLoaded = await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
|
| 767 |
-
if (!proxyLoaded.file) {
|
| 768 |
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
|
| 769 |
}
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
|
| 774 |
-
sideLoaded = proxyLoaded;
|
| 775 |
-
fallbackProxyIsUsed = true;
|
| 776 |
}
|
| 777 |
-
}
|
| 778 |
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 782 |
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
|
| 787 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 788 |
}
|
| 789 |
-
}
|
| 790 |
-
} catch (err: any) {
|
| 791 |
-
this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
|
| 792 |
-
if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
|
| 793 |
-
throw err;
|
| 794 |
}
|
| 795 |
}
|
| 796 |
|
|
@@ -1192,4 +1194,12 @@ export class CrawlerHost extends RPCHost {
|
|
| 1192 |
|
| 1193 |
return { ...r, proxy };
|
| 1194 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1195 |
}
|
|
|
|
| 735 |
return;
|
| 736 |
}
|
| 737 |
|
| 738 |
+
if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && !this.knownUrlThatSideLoadingWouldCrashTheBrowser(urlToCrawl)) {
|
| 739 |
+
try {
|
| 740 |
+
const altOpts = { ...crawlOpts };
|
| 741 |
+
let sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
|
| 742 |
+
await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts) :
|
| 743 |
+
await this.curlControl.sideLoad(urlToCrawl, altOpts).catch((err) => {
|
| 744 |
+
this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
|
|
|
|
|
|
|
|
|
|
| 745 |
|
| 746 |
+
if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
|
| 747 |
+
return Promise.reject(err);
|
| 748 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 749 |
|
| 750 |
+
return this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
|
| 751 |
+
});
|
| 752 |
+
if (!sideLoaded.file) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 753 |
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
|
| 754 |
}
|
| 755 |
+
let draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName);
|
| 756 |
+
if (sideLoaded.status == 200 && !sideLoaded.contentType.startsWith('text/html')) {
|
| 757 |
+
yield draftSnapshot;
|
| 758 |
+
return;
|
|
|
|
|
|
|
| 759 |
}
|
|
|
|
| 760 |
|
| 761 |
+
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
|
| 762 |
+
draftSnapshot.title ??= analyzed.title;
|
| 763 |
+
let fallbackProxyIsUsed = false;
|
| 764 |
+
if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
|
| 765 |
+
(analyzed.tokens < 42 || sideLoaded.status !== 200)
|
| 766 |
+
) {
|
| 767 |
+
const proxyLoaded = await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
|
| 768 |
+
if (!proxyLoaded.file) {
|
| 769 |
+
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
|
| 770 |
+
}
|
| 771 |
+
const proxySnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, proxyLoaded.file, proxyLoaded.contentType, proxyLoaded.fileName);
|
| 772 |
+
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
|
| 773 |
+
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
|
| 774 |
+
draftSnapshot = proxySnapshot;
|
| 775 |
+
sideLoaded = proxyLoaded;
|
| 776 |
+
fallbackProxyIsUsed = true;
|
| 777 |
+
}
|
| 778 |
+
}
|
| 779 |
|
| 780 |
+
if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
|
| 781 |
+
yield draftSnapshot;
|
| 782 |
+
}
|
| 783 |
+
|
| 784 |
+
if (crawlOpts && (sideLoaded.status === 200 || analyzed.tokens >= 200 || crawlOpts.allocProxy)) {
|
| 785 |
+
this.logger.info(`Side load seems to work, applying to crawler.`, { url: urlToCrawl.href });
|
| 786 |
+
crawlOpts.sideLoad ??= sideLoaded.sideLoadOpts;
|
| 787 |
+
if (fallbackProxyIsUsed) {
|
| 788 |
+
this.logger.info(`Proxy seems to salvage the page`, { url: urlToCrawl.href });
|
| 789 |
+
}
|
| 790 |
+
}
|
| 791 |
+
} catch (err: any) {
|
| 792 |
+
this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
|
| 793 |
+
if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
|
| 794 |
+
throw err;
|
| 795 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 796 |
}
|
| 797 |
}
|
| 798 |
|
|
|
|
| 1194 |
|
| 1195 |
return { ...r, proxy };
|
| 1196 |
}
|
| 1197 |
+
|
| 1198 |
+
/**
 * Tells whether `url` points at a host for which side-loaded content must
 * not be handed to the browser.
 *
 * The crawl path consults this before attempting a side load and skips the
 * whole side-load step when it returns `true`.
 *
 * @param url - The URL about to be crawled.
 * @returns `true` when the host is on the known-bad list, `false` otherwise.
 */
knownUrlThatSideLoadingWouldCrashTheBrowser(url: URL): boolean {
    // `URL#hostname` keeps a trailing root dot ("example.com."), which names
    // the same host; strip it so the fully-qualified spelling cannot slip
    // past the check.
    const hostname = url.hostname.replace(/\.$/, '');

    if (hostname === 'chromewebstore.google.com') {
        return true;
    }

    return false;
}
|
| 1205 |
}
|