mapleeit committed on
Commit
6900e02
·
unverified ·
1 Parent(s): 9a514cd

feat: allow passing pdf without url param (#111)

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -625,9 +625,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
625
  ) {
626
  const uid = await auth.solveUID();
627
  let chargeAmount = 0;
628
- const noSlashURL = ctx.req.url.slice(1);
629
  const crawlerOptions = ctx.req.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed;
630
- if (!noSlashURL && !crawlerOptions.url) {
 
 
631
  const latestUser = uid ? await auth.assertUser() : undefined;
632
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
633
  return this.getIndex(latestUser);
@@ -691,48 +692,20 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
691
  });
692
  }
693
 
694
- let urlToCrawl;
695
- const normalizeUrl = (await pNormalizeUrl).default;
696
- try {
697
- urlToCrawl = new URL(
698
- normalizeUrl(
699
- (crawlerOptions.url || noSlashURL).trim(),
700
- {
701
- stripWWW: false,
702
- removeTrailingSlash: false,
703
- removeSingleSlash: false,
704
- sortQueryParameters: false,
705
- }
706
- )
707
- );
708
- } catch (err) {
709
- throw new ParamValidationError({
710
- message: `${err}`,
711
- path: 'url'
712
- });
713
- }
714
- if (urlToCrawl.protocol !== 'http:' && urlToCrawl.protocol !== 'https:') {
715
- throw new ParamValidationError({
716
- message: `Invalid protocol ${urlToCrawl.protocol}`,
717
- path: 'url'
718
- });
719
- }
720
-
721
  if (!uid) {
722
- if (urlToCrawl.protocol === 'http:' && (!urlToCrawl.pathname || urlToCrawl.pathname === '/') &&
723
  crawlerOptions.respondWith !== 'default') {
724
  throw new SecurityCompromiseError(`Your request is categorized as abuse. Please don't abuse our service. If you are sure you are not abusing, please authenticate yourself with an API key.`);
725
  }
726
  const blockade = (await DomainBlockade.fromFirestoreQuery(
727
  DomainBlockade.COLLECTION
728
- .where('domain', '==', urlToCrawl.hostname.toLowerCase())
729
  .where('expireAt', '>=', new Date())
730
  .limit(1)
731
  ))[0];
732
  if (blockade) {
733
- throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
734
  }
735
-
736
  }
737
  const crawlOpts = this.configure(crawlerOptions);
738
 
@@ -742,12 +715,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
742
  rpcReflect.return(sseStream);
743
 
744
  try {
745
- for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
746
  if (!scrapped) {
747
  continue;
748
  }
749
 
750
- const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, urlToCrawl);
751
  chargeAmount = this.assignChargeAmount(formatted);
752
  sseStream.write({
753
  event: 'data',
@@ -758,7 +731,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
758
  }
759
  }
760
  } catch (err: any) {
761
- this.logger.error(`Failed to crawl ${urlToCrawl}`, { err: marshalErrorLike(err) });
762
  sseStream.write({
763
  event: 'error',
764
  data: marshalErrorLike(err),
@@ -772,13 +745,13 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
772
 
773
  let lastScrapped;
774
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
775
- for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
776
  lastScrapped = scrapped;
777
  if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
778
  continue;
779
  }
780
 
781
- const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, urlToCrawl);
782
  chargeAmount = this.assignChargeAmount(formatted);
783
 
784
  if (crawlerOptions.timeout === undefined) {
@@ -791,22 +764,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
791
  }
792
 
793
  if (!lastScrapped) {
794
- throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
795
  }
796
 
797
- const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, urlToCrawl);
798
  chargeAmount = this.assignChargeAmount(formatted);
799
 
800
  return formatted;
801
  }
802
 
803
- for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
804
  lastScrapped = scrapped;
805
  if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
806
  continue;
807
  }
808
 
809
- const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, urlToCrawl);
810
  chargeAmount = this.assignChargeAmount(formatted);
811
 
812
  if (crawlerOptions.timeout === undefined) {
@@ -828,10 +801,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
828
  }
829
 
830
  if (!lastScrapped) {
831
- throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
832
  }
833
 
834
- const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, urlToCrawl);
835
  chargeAmount = this.assignChargeAmount(formatted);
836
  if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
837
 
@@ -849,6 +822,51 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
849
  return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
850
  }
851
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
852
  getUrlDigest(urlToCrawl: URL) {
853
  const normalizedURL = new URL(urlToCrawl);
854
  if (!normalizedURL.hash.startsWith('#/')) {
 
625
  ) {
626
  const uid = await auth.solveUID();
627
  let chargeAmount = 0;
 
628
  const crawlerOptions = ctx.req.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed;
629
+
630
+ const targetUrl = await this.getTargetUrl(ctx.req.url, crawlerOptions);
631
+ if (!targetUrl) {
632
  const latestUser = uid ? await auth.assertUser() : undefined;
633
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
634
  return this.getIndex(latestUser);
 
692
  });
693
  }
694
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
695
  if (!uid) {
696
+ if (targetUrl.protocol === 'http:' && (!targetUrl.pathname || targetUrl.pathname === '/') &&
697
  crawlerOptions.respondWith !== 'default') {
698
  throw new SecurityCompromiseError(`Your request is categorized as abuse. Please don't abuse our service. If you are sure you are not abusing, please authenticate yourself with an API key.`);
699
  }
700
  const blockade = (await DomainBlockade.fromFirestoreQuery(
701
  DomainBlockade.COLLECTION
702
+ .where('domain', '==', targetUrl.hostname.toLowerCase())
703
  .where('expireAt', '>=', new Date())
704
  .limit(1)
705
  ))[0];
706
  if (blockade) {
707
+ throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
708
  }
 
709
  }
710
  const crawlOpts = this.configure(crawlerOptions);
711
 
 
715
  rpcReflect.return(sseStream);
716
 
717
  try {
718
+ for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
719
  if (!scrapped) {
720
  continue;
721
  }
722
 
723
+ const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
724
  chargeAmount = this.assignChargeAmount(formatted);
725
  sseStream.write({
726
  event: 'data',
 
731
  }
732
  }
733
  } catch (err: any) {
734
+ this.logger.error(`Failed to crawl ${targetUrl}`, { err: marshalErrorLike(err) });
735
  sseStream.write({
736
  event: 'error',
737
  data: marshalErrorLike(err),
 
745
 
746
  let lastScrapped;
747
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
748
+ for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
749
  lastScrapped = scrapped;
750
  if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
751
  continue;
752
  }
753
 
754
+ const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
755
  chargeAmount = this.assignChargeAmount(formatted);
756
 
757
  if (crawlerOptions.timeout === undefined) {
 
764
  }
765
 
766
  if (!lastScrapped) {
767
+ throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
768
  }
769
 
770
+ const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl);
771
  chargeAmount = this.assignChargeAmount(formatted);
772
 
773
  return formatted;
774
  }
775
 
776
+ for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
777
  lastScrapped = scrapped;
778
  if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
779
  continue;
780
  }
781
 
782
+ const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
783
  chargeAmount = this.assignChargeAmount(formatted);
784
 
785
  if (crawlerOptions.timeout === undefined) {
 
801
  }
802
 
803
  if (!lastScrapped) {
804
+ throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
805
  }
806
 
807
+ const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl);
808
  chargeAmount = this.assignChargeAmount(formatted);
809
  if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
810
 
 
822
  return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
823
  }
824
 
825
+ async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
826
+ let url: string;
827
+
828
+ const targetUrlFromGet = originPath.slice(1);
829
+ if (crawlerOptions.pdf) {
830
+ url = `file://pdf.${md5Hasher.hash(crawlerOptions.pdf)}`;
831
+ } else if (targetUrlFromGet) {
832
+ url = targetUrlFromGet.trim();
833
+ } else if (crawlerOptions.url) {
834
+ url = crawlerOptions.url.trim();
835
+ } else {
836
+ return null;
837
+ }
838
+
839
+ let result: URL;
840
+ const normalizeUrl = (await pNormalizeUrl).default;
841
+ try {
842
+ result = new URL(
843
+ normalizeUrl(
844
+ url,
845
+ {
846
+ stripWWW: false,
847
+ removeTrailingSlash: false,
848
+ removeSingleSlash: false,
849
+ sortQueryParameters: false,
850
+ }
851
+ )
852
+ );
853
+ } catch (err) {
854
+ throw new ParamValidationError({
855
+ message: `${err}`,
856
+ path: 'url'
857
+ });
858
+ }
859
+
860
+ if (!['http:', 'https:', 'file:'].includes(result.protocol)) {
861
+ throw new ParamValidationError({
862
+ message: `Invalid protocol ${result.protocol}`,
863
+ path: 'url'
864
+ });
865
+ }
866
+
867
+ return result;
868
+ }
869
+
870
  getUrlDigest(urlToCrawl: URL) {
871
  const normalizedURL = new URL(urlToCrawl);
872
  if (!normalizedURL.hash.startsWith('#/')) {