Spaces:
Build error
Build error
feat: bring your own html
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -686,7 +686,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 686 |
rpcReflect.return(sseStream);
|
| 687 |
|
| 688 |
try {
|
| 689 |
-
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
|
| 690 |
if (!scrapped) {
|
| 691 |
continue;
|
| 692 |
}
|
|
@@ -713,7 +713,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 713 |
|
| 714 |
let lastScrapped;
|
| 715 |
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
| 716 |
-
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
|
| 717 |
lastScrapped = scrapped;
|
| 718 |
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
| 719 |
continue;
|
|
@@ -737,7 +737,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 737 |
return formatted;
|
| 738 |
}
|
| 739 |
|
| 740 |
-
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
|
| 741 |
lastScrapped = scrapped;
|
| 742 |
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
| 743 |
continue;
|
|
@@ -880,8 +880,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 880 |
return r;
|
| 881 |
}
|
| 882 |
|
| 883 |
-
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 884 |
let cache;
|
|
|
|
|
|
|
| 885 |
if (cacheTolerance && !crawlOpts?.cookies?.length) {
|
| 886 |
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
| 887 |
}
|
|
@@ -934,8 +948,8 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 934 |
}
|
| 935 |
|
| 936 |
|
| 937 |
-
async *scrapMany(urls: URL[], options?: ExtraScrappingOptions,
|
| 938 |
-
const iterators = urls.map((url) => this.cachedScrap(url, options,
|
| 939 |
|
| 940 |
const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
|
| 941 |
|
|
|
|
| 686 |
rpcReflect.return(sseStream);
|
| 687 |
|
| 688 |
try {
|
| 689 |
+
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
|
| 690 |
if (!scrapped) {
|
| 691 |
continue;
|
| 692 |
}
|
|
|
|
| 713 |
|
| 714 |
let lastScrapped;
|
| 715 |
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
| 716 |
+
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
|
| 717 |
lastScrapped = scrapped;
|
| 718 |
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
| 719 |
continue;
|
|
|
|
| 737 |
return formatted;
|
| 738 |
}
|
| 739 |
|
| 740 |
+
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
|
| 741 |
lastScrapped = scrapped;
|
| 742 |
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
| 743 |
continue;
|
|
|
|
| 880 |
return r;
|
| 881 |
}
|
| 882 |
|
| 883 |
+
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
|
| 884 |
+
if (crawlerOpts?.html) {
|
| 885 |
+
const fakeSnapshot = {
|
| 886 |
+
href: urlToCrawl.toString(),
|
| 887 |
+
html: crawlerOpts.html,
|
| 888 |
+
title: '',
|
| 889 |
+
text: '',
|
| 890 |
+
} as PageSnapshot;
|
| 891 |
+
|
| 892 |
+
yield this.puppeteerControl.narrowSnapshot(fakeSnapshot, crawlOpts);
|
| 893 |
+
|
| 894 |
+
return;
|
| 895 |
+
}
|
| 896 |
let cache;
|
| 897 |
+
|
| 898 |
+
const cacheTolerance = crawlerOpts?.cacheTolerance || this.cacheValidMs;
|
| 899 |
if (cacheTolerance && !crawlOpts?.cookies?.length) {
|
| 900 |
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
| 901 |
}
|
|
|
|
| 948 |
}
|
| 949 |
|
| 950 |
|
| 951 |
+
async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
|
| 952 |
+
const iterators = urls.map((url) => this.cachedScrap(url, options, crawlerOpts));
|
| 953 |
|
| 954 |
const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
|
| 955 |
|
backend/functions/src/cloud-functions/searcher.ts
CHANGED
|
@@ -142,6 +142,8 @@ export class SearcherHost extends RPCHost {
|
|
| 142 |
});
|
| 143 |
}
|
| 144 |
|
|
|
|
|
|
|
| 145 |
const crawlOpts = this.crawler.configure(crawlerOptions);
|
| 146 |
const cookies: CookieParam[] = [];
|
| 147 |
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
|
@@ -171,7 +173,7 @@ export class SearcherHost extends RPCHost {
|
|
| 171 |
}
|
| 172 |
|
| 173 |
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
|
| 174 |
-
crawlerOptions.cacheTolerance || this.pageCacheToleranceMs
|
| 175 |
);
|
| 176 |
|
| 177 |
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
|
@@ -308,13 +310,13 @@ export class SearcherHost extends RPCHost {
|
|
| 308 |
mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
|
| 309 |
searchResults?: WebSearchResult[],
|
| 310 |
options?: ExtraScrappingOptions,
|
| 311 |
-
|
| 312 |
) {
|
| 313 |
if (!searchResults) {
|
| 314 |
return;
|
| 315 |
}
|
| 316 |
const urls = searchResults.map((x) => new URL(x.url));
|
| 317 |
-
for await (const scrapped of this.crawler.scrapMany(urls, options,
|
| 318 |
const mapped = scrapped.map((x, i) => {
|
| 319 |
const upstreamSearchResult = searchResults[i];
|
| 320 |
if (!x || (!x.parsed && mode !== 'markdown')) {
|
|
|
|
| 142 |
});
|
| 143 |
}
|
| 144 |
|
| 145 |
+
delete crawlerOptions.html;
|
| 146 |
+
|
| 147 |
const crawlOpts = this.crawler.configure(crawlerOptions);
|
| 148 |
const cookies: CookieParam[] = [];
|
| 149 |
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
|
|
|
| 173 |
}
|
| 174 |
|
| 175 |
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
|
| 176 |
+
{ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance || this.pageCacheToleranceMs }
|
| 177 |
);
|
| 178 |
|
| 179 |
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
|
|
|
| 310 |
mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
|
| 311 |
searchResults?: WebSearchResult[],
|
| 312 |
options?: ExtraScrappingOptions,
|
| 313 |
+
crawlerOptions?: CrawlerOptions,
|
| 314 |
) {
|
| 315 |
if (!searchResults) {
|
| 316 |
return;
|
| 317 |
}
|
| 318 |
const urls = searchResults.map((x) => new URL(x.url));
|
| 319 |
+
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
|
| 320 |
const mapped = scrapped.map((x, i) => {
|
| 321 |
const upstreamSearchResult = searchResults[i];
|
| 322 |
if (!x || (!x.parsed && mode !== 'markdown')) {
|
backend/functions/src/dto/scrapping-options.ts
CHANGED
|
@@ -119,6 +119,9 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 119 |
@Prop()
|
| 120 |
url?: string;
|
| 121 |
|
|
|
|
|
|
|
|
|
|
| 122 |
@Prop({
|
| 123 |
default: 'default',
|
| 124 |
})
|
|
|
|
| 119 |
@Prop()
|
| 120 |
url?: string;
|
| 121 |
|
| 122 |
+
@Prop()
|
| 123 |
+
html?: string;
|
| 124 |
+
|
| 125 |
@Prop({
|
| 126 |
default: 'default',
|
| 127 |
})
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -653,7 +653,7 @@ document.addEventListener('load', handlePageLoad);
|
|
| 653 |
targetSelector?: string | string[];
|
| 654 |
removeSelector?: string | string[];
|
| 655 |
}): PageSnapshot | undefined {
|
| 656 |
-
if (!options?.targetSelector && !options?.removeSelector) {
|
| 657 |
return snapshot;
|
| 658 |
}
|
| 659 |
if (!snapshot?.html) {
|
|
@@ -663,15 +663,15 @@ document.addEventListener('load', handlePageLoad);
|
|
| 663 |
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
| 664 |
const allNodes: Node[] = [];
|
| 665 |
|
| 666 |
-
if (Array.isArray(options.removeSelector)) {
|
| 667 |
for (const rl of options.removeSelector) {
|
| 668 |
jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
|
| 669 |
}
|
| 670 |
-
} else if (options.removeSelector) {
|
| 671 |
jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
|
| 672 |
}
|
| 673 |
|
| 674 |
-
if (Array.isArray(options.targetSelector)) {
|
| 675 |
for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
|
| 676 |
x.forEach((el) => {
|
| 677 |
if (!allNodes.includes(el)) {
|
|
@@ -679,7 +679,7 @@ document.addEventListener('load', handlePageLoad);
|
|
| 679 |
}
|
| 680 |
});
|
| 681 |
}
|
| 682 |
-
} else if (options.targetSelector) {
|
| 683 |
jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
|
| 684 |
if (!allNodes.includes(el)) {
|
| 685 |
allNodes.push(el);
|
|
@@ -738,6 +738,7 @@ document.addEventListener('load', handlePageLoad);
|
|
| 738 |
|
| 739 |
const r = {
|
| 740 |
...snapshot,
|
|
|
|
| 741 |
parsed,
|
| 742 |
html: rootDoc.documentElement.outerHTML,
|
| 743 |
text: cleanedText,
|
|
|
|
| 653 |
targetSelector?: string | string[];
|
| 654 |
removeSelector?: string | string[];
|
| 655 |
}): PageSnapshot | undefined {
|
| 656 |
+
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector) {
|
| 657 |
return snapshot;
|
| 658 |
}
|
| 659 |
if (!snapshot?.html) {
|
|
|
|
| 663 |
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
| 664 |
const allNodes: Node[] = [];
|
| 665 |
|
| 666 |
+
if (Array.isArray(options?.removeSelector)) {
|
| 667 |
for (const rl of options.removeSelector) {
|
| 668 |
jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
|
| 669 |
}
|
| 670 |
+
} else if (options?.removeSelector) {
|
| 671 |
jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
|
| 672 |
}
|
| 673 |
|
| 674 |
+
if (Array.isArray(options?.targetSelector)) {
|
| 675 |
for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
|
| 676 |
x.forEach((el) => {
|
| 677 |
if (!allNodes.includes(el)) {
|
|
|
|
| 679 |
}
|
| 680 |
});
|
| 681 |
}
|
| 682 |
+
} else if (options?.targetSelector) {
|
| 683 |
jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
|
| 684 |
if (!allNodes.includes(el)) {
|
| 685 |
allNodes.push(el);
|
|
|
|
| 738 |
|
| 739 |
const r = {
|
| 740 |
...snapshot,
|
| 741 |
+
title: snapshot.title || jsdom.window.document.title,
|
| 742 |
parsed,
|
| 743 |
html: rootDoc.documentElement.outerHTML,
|
| 744 |
text: cleanedText,
|