Spaces:
Build error
Build error
feat: allow passing pdf without url param (#111)
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -625,9 +625,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 625 |
) {
|
| 626 |
const uid = await auth.solveUID();
|
| 627 |
let chargeAmount = 0;
|
| 628 |
-
const noSlashURL = ctx.req.url.slice(1);
|
| 629 |
const crawlerOptions = ctx.req.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed;
|
| 630 |
-
|
|
|
|
|
|
|
| 631 |
const latestUser = uid ? await auth.assertUser() : undefined;
|
| 632 |
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
| 633 |
return this.getIndex(latestUser);
|
|
@@ -691,48 +692,20 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 691 |
});
|
| 692 |
}
|
| 693 |
|
| 694 |
-
let urlToCrawl;
|
| 695 |
-
const normalizeUrl = (await pNormalizeUrl).default;
|
| 696 |
-
try {
|
| 697 |
-
urlToCrawl = new URL(
|
| 698 |
-
normalizeUrl(
|
| 699 |
-
(crawlerOptions.url || noSlashURL).trim(),
|
| 700 |
-
{
|
| 701 |
-
stripWWW: false,
|
| 702 |
-
removeTrailingSlash: false,
|
| 703 |
-
removeSingleSlash: false,
|
| 704 |
-
sortQueryParameters: false,
|
| 705 |
-
}
|
| 706 |
-
)
|
| 707 |
-
);
|
| 708 |
-
} catch (err) {
|
| 709 |
-
throw new ParamValidationError({
|
| 710 |
-
message: `${err}`,
|
| 711 |
-
path: 'url'
|
| 712 |
-
});
|
| 713 |
-
}
|
| 714 |
-
if (urlToCrawl.protocol !== 'http:' && urlToCrawl.protocol !== 'https:') {
|
| 715 |
-
throw new ParamValidationError({
|
| 716 |
-
message: `Invalid protocol ${urlToCrawl.protocol}`,
|
| 717 |
-
path: 'url'
|
| 718 |
-
});
|
| 719 |
-
}
|
| 720 |
-
|
| 721 |
if (!uid) {
|
| 722 |
-
if (
|
| 723 |
crawlerOptions.respondWith !== 'default') {
|
| 724 |
throw new SecurityCompromiseError(`Your request is categorized as abuse. Please don't abuse our service. If you are sure you are not abusing, please authenticate yourself with an API key.`);
|
| 725 |
}
|
| 726 |
const blockade = (await DomainBlockade.fromFirestoreQuery(
|
| 727 |
DomainBlockade.COLLECTION
|
| 728 |
-
.where('domain', '==',
|
| 729 |
.where('expireAt', '>=', new Date())
|
| 730 |
.limit(1)
|
| 731 |
))[0];
|
| 732 |
if (blockade) {
|
| 733 |
-
throw new SecurityCompromiseError(`Domain ${
|
| 734 |
}
|
| 735 |
-
|
| 736 |
}
|
| 737 |
const crawlOpts = this.configure(crawlerOptions);
|
| 738 |
|
|
@@ -742,12 +715,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 742 |
rpcReflect.return(sseStream);
|
| 743 |
|
| 744 |
try {
|
| 745 |
-
for await (const scrapped of this.cachedScrap(
|
| 746 |
if (!scrapped) {
|
| 747 |
continue;
|
| 748 |
}
|
| 749 |
|
| 750 |
-
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped,
|
| 751 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 752 |
sseStream.write({
|
| 753 |
event: 'data',
|
|
@@ -758,7 +731,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 758 |
}
|
| 759 |
}
|
| 760 |
} catch (err: any) {
|
| 761 |
-
this.logger.error(`Failed to crawl ${
|
| 762 |
sseStream.write({
|
| 763 |
event: 'error',
|
| 764 |
data: marshalErrorLike(err),
|
|
@@ -772,13 +745,13 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 772 |
|
| 773 |
let lastScrapped;
|
| 774 |
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
| 775 |
-
for await (const scrapped of this.cachedScrap(
|
| 776 |
lastScrapped = scrapped;
|
| 777 |
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
| 778 |
continue;
|
| 779 |
}
|
| 780 |
|
| 781 |
-
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped,
|
| 782 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 783 |
|
| 784 |
if (crawlerOptions.timeout === undefined) {
|
|
@@ -791,22 +764,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 791 |
}
|
| 792 |
|
| 793 |
if (!lastScrapped) {
|
| 794 |
-
throw new AssertionFailureError(`No content available for URL ${
|
| 795 |
}
|
| 796 |
|
| 797 |
-
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped,
|
| 798 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 799 |
|
| 800 |
return formatted;
|
| 801 |
}
|
| 802 |
|
| 803 |
-
for await (const scrapped of this.cachedScrap(
|
| 804 |
lastScrapped = scrapped;
|
| 805 |
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
| 806 |
continue;
|
| 807 |
}
|
| 808 |
|
| 809 |
-
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped,
|
| 810 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 811 |
|
| 812 |
if (crawlerOptions.timeout === undefined) {
|
|
@@ -828,10 +801,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 828 |
}
|
| 829 |
|
| 830 |
if (!lastScrapped) {
|
| 831 |
-
throw new AssertionFailureError(`No content available for URL ${
|
| 832 |
}
|
| 833 |
|
| 834 |
-
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped,
|
| 835 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 836 |
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 837 |
|
|
@@ -849,6 +822,51 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 849 |
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
| 850 |
}
|
| 851 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 852 |
getUrlDigest(urlToCrawl: URL) {
|
| 853 |
const normalizedURL = new URL(urlToCrawl);
|
| 854 |
if (!normalizedURL.hash.startsWith('#/')) {
|
|
|
|
| 625 |
) {
|
| 626 |
const uid = await auth.solveUID();
|
| 627 |
let chargeAmount = 0;
|
|
|
|
| 628 |
const crawlerOptions = ctx.req.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed;
|
| 629 |
+
|
| 630 |
+
const targetUrl = await this.getTargetUrl(ctx.req.url, crawlerOptions);
|
| 631 |
+
if (!targetUrl) {
|
| 632 |
const latestUser = uid ? await auth.assertUser() : undefined;
|
| 633 |
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
| 634 |
return this.getIndex(latestUser);
|
|
|
|
| 692 |
});
|
| 693 |
}
|
| 694 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 695 |
if (!uid) {
|
| 696 |
+
if (targetUrl.protocol === 'http:' && (!targetUrl.pathname || targetUrl.pathname === '/') &&
|
| 697 |
crawlerOptions.respondWith !== 'default') {
|
| 698 |
throw new SecurityCompromiseError(`Your request is categorized as abuse. Please don't abuse our service. If you are sure you are not abusing, please authenticate yourself with an API key.`);
|
| 699 |
}
|
| 700 |
const blockade = (await DomainBlockade.fromFirestoreQuery(
|
| 701 |
DomainBlockade.COLLECTION
|
| 702 |
+
.where('domain', '==', targetUrl.hostname.toLowerCase())
|
| 703 |
.where('expireAt', '>=', new Date())
|
| 704 |
.limit(1)
|
| 705 |
))[0];
|
| 706 |
if (blockade) {
|
| 707 |
+
throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
|
| 708 |
}
|
|
|
|
| 709 |
}
|
| 710 |
const crawlOpts = this.configure(crawlerOptions);
|
| 711 |
|
|
|
|
| 715 |
rpcReflect.return(sseStream);
|
| 716 |
|
| 717 |
try {
|
| 718 |
+
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
|
| 719 |
if (!scrapped) {
|
| 720 |
continue;
|
| 721 |
}
|
| 722 |
|
| 723 |
+
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
|
| 724 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 725 |
sseStream.write({
|
| 726 |
event: 'data',
|
|
|
|
| 731 |
}
|
| 732 |
}
|
| 733 |
} catch (err: any) {
|
| 734 |
+
this.logger.error(`Failed to crawl ${targetUrl}`, { err: marshalErrorLike(err) });
|
| 735 |
sseStream.write({
|
| 736 |
event: 'error',
|
| 737 |
data: marshalErrorLike(err),
|
|
|
|
| 745 |
|
| 746 |
let lastScrapped;
|
| 747 |
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
| 748 |
+
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
|
| 749 |
lastScrapped = scrapped;
|
| 750 |
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
| 751 |
continue;
|
| 752 |
}
|
| 753 |
|
| 754 |
+
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
|
| 755 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 756 |
|
| 757 |
if (crawlerOptions.timeout === undefined) {
|
|
|
|
| 764 |
}
|
| 765 |
|
| 766 |
if (!lastScrapped) {
|
| 767 |
+
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
| 768 |
}
|
| 769 |
|
| 770 |
+
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl);
|
| 771 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 772 |
|
| 773 |
return formatted;
|
| 774 |
}
|
| 775 |
|
| 776 |
+
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
|
| 777 |
lastScrapped = scrapped;
|
| 778 |
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
| 779 |
continue;
|
| 780 |
}
|
| 781 |
|
| 782 |
+
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
|
| 783 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 784 |
|
| 785 |
if (crawlerOptions.timeout === undefined) {
|
|
|
|
| 801 |
}
|
| 802 |
|
| 803 |
if (!lastScrapped) {
|
| 804 |
+
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
| 805 |
}
|
| 806 |
|
| 807 |
+
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl);
|
| 808 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 809 |
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 810 |
|
|
|
|
| 822 |
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
| 823 |
}
|
| 824 |
|
| 825 |
+
/**
 * Resolves the URL that should be crawled for the current request.
 *
 * Source precedence:
 *   1. `crawlerOptions.pdf` – an inline PDF payload; a synthetic
 *      `file://pdf.<hash>` URL is built from `md5Hasher.hash` of the payload.
 *   2. `originPath` with its first character removed (presumably the
 *      leading `/` of the request path — confirm against the caller).
 *   3. `crawlerOptions.url`.
 *
 * @param originPath - Raw request URL/path as received by the handler.
 * @param crawlerOptions - Parsed crawler options for the request.
 * @returns The normalized target URL, or `null` when no source supplied one.
 * @throws ParamValidationError (path `url`) when the value cannot be
 *   normalized/parsed, or when its protocol is not permitted.
 */
async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions): Promise<URL | null> {
    let url: string;
    // `file:` is only legitimate for the synthetic URL this method builds
    // for an inline PDF. Caller-supplied URLs must never be allowed to use it,
    // otherwise a request for `file:///…` would pass validation.
    let isSyntheticPdfUrl = false;

    const targetUrlFromGet = originPath.slice(1);
    if (crawlerOptions.pdf) {
        url = `file://pdf.${md5Hasher.hash(crawlerOptions.pdf)}`;
        isSyntheticPdfUrl = true;
    } else if (targetUrlFromGet) {
        url = targetUrlFromGet.trim();
    } else if (crawlerOptions.url) {
        url = crawlerOptions.url.trim();
    } else {
        return null;
    }

    let result: URL;
    const normalizeUrl = (await pNormalizeUrl).default;
    try {
        result = new URL(
            normalizeUrl(
                url,
                {
                    // Keep the URL as close to what the caller sent as possible.
                    stripWWW: false,
                    removeTrailingSlash: false,
                    removeSingleSlash: false,
                    sortQueryParameters: false,
                }
            )
        );
    } catch (err) {
        // Surface normalization/parsing failures as a validation error on `url`.
        throw new ParamValidationError({
            message: `${err}`,
            path: 'url'
        });
    }

    const allowedProtocols = isSyntheticPdfUrl
        ? ['http:', 'https:', 'file:']
        : ['http:', 'https:'];
    if (!allowedProtocols.includes(result.protocol)) {
        throw new ParamValidationError({
            message: `Invalid protocol ${result.protocol}`,
            path: 'url'
        });
    }

    return result;
}
|
| 869 |
+
|
| 870 |
getUrlDigest(urlToCrawl: URL) {
|
| 871 |
const normalizedURL = new URL(urlToCrawl);
|
| 872 |
if (!normalizedURL.hash.startsWith('#/')) {
|