nomagick committed on
Commit
8456fce
·
unverified ·
1 Parent(s): 440ff4d

fix: somehow side-loading chromewebstore would 100% crash the browser

Browse files
Files changed (1) hide show
  1. src/api/crawler.ts +58 -48
src/api/crawler.ts CHANGED
@@ -735,62 +735,64 @@ export class CrawlerHost extends RPCHost {
735
  return;
736
  }
737
 
738
- try {
739
- const altOpts = { ...crawlOpts };
740
- let sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
741
- await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts) :
742
- await this.curlControl.sideLoad(urlToCrawl, altOpts).catch((err) => {
743
- this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
744
-
745
- if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
746
- return Promise.reject(err);
747
- }
748
 
749
- return this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
750
- });
751
- if (!sideLoaded.file) {
752
- throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
753
- }
754
- let draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName);
755
- if (sideLoaded.status == 200 && !sideLoaded.contentType.startsWith('text/html')) {
756
- yield draftSnapshot;
757
- return;
758
- }
759
 
760
- let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
761
- draftSnapshot.title ??= analyzed.title;
762
- let fallbackProxyIsUsed = false;
763
- if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
764
- (analyzed.tokens < 42 || sideLoaded.status !== 200)
765
- ) {
766
- const proxyLoaded = await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
767
- if (!proxyLoaded.file) {
768
  throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
769
  }
770
- const proxySnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, proxyLoaded.file, proxyLoaded.contentType, proxyLoaded.fileName);
771
- analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
772
- if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
773
- draftSnapshot = proxySnapshot;
774
- sideLoaded = proxyLoaded;
775
- fallbackProxyIsUsed = true;
776
  }
777
- }
778
 
779
- if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
780
- yield draftSnapshot;
781
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
782
 
783
- if (crawlOpts && (sideLoaded.status === 200 || analyzed.tokens >= 200 || crawlOpts.allocProxy)) {
784
- this.logger.info(`Side load seems to work, applying to crawler.`, { url: urlToCrawl.href });
785
- crawlOpts.sideLoad ??= sideLoaded.sideLoadOpts;
786
- if (fallbackProxyIsUsed) {
787
- this.logger.info(`Proxy seems to salvage the page`, { url: urlToCrawl.href });
 
 
 
 
 
 
 
 
 
 
788
  }
789
- }
790
- } catch (err: any) {
791
- this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
792
- if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
793
- throw err;
794
  }
795
  }
796
 
@@ -1192,4 +1194,12 @@ export class CrawlerHost extends RPCHost {
1192
 
1193
  return { ...r, proxy };
1194
  }
 
 
 
 
 
 
 
 
1195
  }
 
735
  return;
736
  }
737
 
738
+ if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && !this.knownUrlThatSideLoadingWouldCrashTheBrowser(urlToCrawl)) {
739
+ try {
740
+ const altOpts = { ...crawlOpts };
741
+ let sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
742
+ await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts) :
743
+ await this.curlControl.sideLoad(urlToCrawl, altOpts).catch((err) => {
744
+ this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
 
 
 
745
 
746
+ if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
747
+ return Promise.reject(err);
748
+ }
 
 
 
 
 
 
 
749
 
750
+ return this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
751
+ });
752
+ if (!sideLoaded.file) {
 
 
 
 
 
753
  throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
754
  }
755
+ let draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName);
756
+ if (sideLoaded.status == 200 && !sideLoaded.contentType.startsWith('text/html')) {
757
+ yield draftSnapshot;
758
+ return;
 
 
759
  }
 
760
 
761
+ let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
762
+ draftSnapshot.title ??= analyzed.title;
763
+ let fallbackProxyIsUsed = false;
764
+ if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
765
+ (analyzed.tokens < 42 || sideLoaded.status !== 200)
766
+ ) {
767
+ const proxyLoaded = await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
768
+ if (!proxyLoaded.file) {
769
+ throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
770
+ }
771
+ const proxySnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, proxyLoaded.file, proxyLoaded.contentType, proxyLoaded.fileName);
772
+ analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
773
+ if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
774
+ draftSnapshot = proxySnapshot;
775
+ sideLoaded = proxyLoaded;
776
+ fallbackProxyIsUsed = true;
777
+ }
778
+ }
779
 
780
+ if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
781
+ yield draftSnapshot;
782
+ }
783
+
784
+ if (crawlOpts && (sideLoaded.status === 200 || analyzed.tokens >= 200 || crawlOpts.allocProxy)) {
785
+ this.logger.info(`Side load seems to work, applying to crawler.`, { url: urlToCrawl.href });
786
+ crawlOpts.sideLoad ??= sideLoaded.sideLoadOpts;
787
+ if (fallbackProxyIsUsed) {
788
+ this.logger.info(`Proxy seems to salvage the page`, { url: urlToCrawl.href });
789
+ }
790
+ }
791
+ } catch (err: any) {
792
+ this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
793
+ if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
794
+ throw err;
795
  }
 
 
 
 
 
796
  }
797
  }
798
 
 
1194
 
1195
  return { ...r, proxy };
1196
  }
1197
+
1198
+ knownUrlThatSideLoadingWouldCrashTheBrowser(url: URL) {
1199
+ if (url.hostname === 'chromewebstore.google.com') {
1200
+ return true;
1201
+ }
1202
+
1203
+ return false;
1204
+ }
1205
  }