nomagick committed on
Commit
1c4b64f
·
unverified ·
1 Parent(s): 78ea13b

feat: bring your own html

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -686,7 +686,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
686
  rpcReflect.return(sseStream);
687
 
688
  try {
689
- for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
690
  if (!scrapped) {
691
  continue;
692
  }
@@ -713,7 +713,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
713
 
714
  let lastScrapped;
715
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
716
- for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
717
  lastScrapped = scrapped;
718
  if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
719
  continue;
@@ -737,7 +737,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
737
  return formatted;
738
  }
739
 
740
- for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
741
  lastScrapped = scrapped;
742
  if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
743
  continue;
@@ -880,8 +880,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
880
  return r;
881
  }
882
 
883
- async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
 
 
 
 
 
 
 
 
 
 
 
 
884
  let cache;
 
 
885
  if (cacheTolerance && !crawlOpts?.cookies?.length) {
886
  cache = await this.queryCache(urlToCrawl, cacheTolerance);
887
  }
@@ -934,8 +948,8 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
934
  }
935
 
936
 
937
- async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, cacheTolerance?: number) {
938
- const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance));
939
 
940
  const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
941
 
 
686
  rpcReflect.return(sseStream);
687
 
688
  try {
689
+ for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
690
  if (!scrapped) {
691
  continue;
692
  }
 
713
 
714
  let lastScrapped;
715
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
716
+ for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
717
  lastScrapped = scrapped;
718
  if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
719
  continue;
 
737
  return formatted;
738
  }
739
 
740
+ for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
741
  lastScrapped = scrapped;
742
  if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
743
  continue;
 
880
  return r;
881
  }
882
 
883
+ async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
884
+ if (crawlerOpts?.html) {
885
+ const fakeSnapshot = {
886
+ href: urlToCrawl.toString(),
887
+ html: crawlerOpts.html,
888
+ title: '',
889
+ text: '',
890
+ } as PageSnapshot;
891
+
892
+ yield this.puppeteerControl.narrowSnapshot(fakeSnapshot, crawlOpts);
893
+
894
+ return;
895
+ }
896
  let cache;
897
+
898
+ const cacheTolerance = crawlerOpts?.cacheTolerance || this.cacheValidMs;
899
  if (cacheTolerance && !crawlOpts?.cookies?.length) {
900
  cache = await this.queryCache(urlToCrawl, cacheTolerance);
901
  }
 
948
  }
949
 
950
 
951
+ async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
952
+ const iterators = urls.map((url) => this.cachedScrap(url, options, crawlerOpts));
953
 
954
  const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
955
 
backend/functions/src/cloud-functions/searcher.ts CHANGED
@@ -142,6 +142,8 @@ export class SearcherHost extends RPCHost {
142
  });
143
  }
144
 
 
 
145
  const crawlOpts = this.crawler.configure(crawlerOptions);
146
  const cookies: CookieParam[] = [];
147
  const setCookieHeaders = ctx.req.headers['x-set-cookie'];
@@ -171,7 +173,7 @@ export class SearcherHost extends RPCHost {
171
  }
172
 
173
  const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
174
- crawlerOptions.cacheTolerance || this.pageCacheToleranceMs
175
  );
176
 
177
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
@@ -308,13 +310,13 @@ export class SearcherHost extends RPCHost {
308
  mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
309
  searchResults?: WebSearchResult[],
310
  options?: ExtraScrappingOptions,
311
- pageCacheTolerance?: number
312
  ) {
313
  if (!searchResults) {
314
  return;
315
  }
316
  const urls = searchResults.map((x) => new URL(x.url));
317
- for await (const scrapped of this.crawler.scrapMany(urls, options, pageCacheTolerance)) {
318
  const mapped = scrapped.map((x, i) => {
319
  const upstreamSearchResult = searchResults[i];
320
  if (!x || (!x.parsed && mode !== 'markdown')) {
 
142
  });
143
  }
144
 
145
+ delete crawlerOptions.html;
146
+
147
  const crawlOpts = this.crawler.configure(crawlerOptions);
148
  const cookies: CookieParam[] = [];
149
  const setCookieHeaders = ctx.req.headers['x-set-cookie'];
 
173
  }
174
 
175
  const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
176
+ { ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance || this.pageCacheToleranceMs }
177
  );
178
 
179
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
 
310
  mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
311
  searchResults?: WebSearchResult[],
312
  options?: ExtraScrappingOptions,
313
+ crawlerOptions?: CrawlerOptions,
314
  ) {
315
  if (!searchResults) {
316
  return;
317
  }
318
  const urls = searchResults.map((x) => new URL(x.url));
319
+ for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
320
  const mapped = scrapped.map((x, i) => {
321
  const upstreamSearchResult = searchResults[i];
322
  if (!x || (!x.parsed && mode !== 'markdown')) {
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -119,6 +119,9 @@ export class CrawlerOptions extends AutoCastable {
119
  @Prop()
120
  url?: string;
121
 
 
 
 
122
  @Prop({
123
  default: 'default',
124
  })
 
119
  @Prop()
120
  url?: string;
121
 
122
+ @Prop()
123
+ html?: string;
124
+
125
  @Prop({
126
  default: 'default',
127
  })
backend/functions/src/services/puppeteer.ts CHANGED
@@ -653,7 +653,7 @@ document.addEventListener('load', handlePageLoad);
653
  targetSelector?: string | string[];
654
  removeSelector?: string | string[];
655
  }): PageSnapshot | undefined {
656
- if (!options?.targetSelector && !options?.removeSelector) {
657
  return snapshot;
658
  }
659
  if (!snapshot?.html) {
@@ -663,15 +663,15 @@ document.addEventListener('load', handlePageLoad);
663
  const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
664
  const allNodes: Node[] = [];
665
 
666
- if (Array.isArray(options.removeSelector)) {
667
  for (const rl of options.removeSelector) {
668
  jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
669
  }
670
- } else if (options.removeSelector) {
671
  jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
672
  }
673
 
674
- if (Array.isArray(options.targetSelector)) {
675
  for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
676
  x.forEach((el) => {
677
  if (!allNodes.includes(el)) {
@@ -679,7 +679,7 @@ document.addEventListener('load', handlePageLoad);
679
  }
680
  });
681
  }
682
- } else if (options.targetSelector) {
683
  jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
684
  if (!allNodes.includes(el)) {
685
  allNodes.push(el);
@@ -738,6 +738,7 @@ document.addEventListener('load', handlePageLoad);
738
 
739
  const r = {
740
  ...snapshot,
 
741
  parsed,
742
  html: rootDoc.documentElement.outerHTML,
743
  text: cleanedText,
 
653
  targetSelector?: string | string[];
654
  removeSelector?: string | string[];
655
  }): PageSnapshot | undefined {
656
+ if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector) {
657
  return snapshot;
658
  }
659
  if (!snapshot?.html) {
 
663
  const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
664
  const allNodes: Node[] = [];
665
 
666
+ if (Array.isArray(options?.removeSelector)) {
667
  for (const rl of options.removeSelector) {
668
  jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
669
  }
670
+ } else if (options?.removeSelector) {
671
  jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
672
  }
673
 
674
+ if (Array.isArray(options?.targetSelector)) {
675
  for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
676
  x.forEach((el) => {
677
  if (!allNodes.includes(el)) {
 
679
  }
680
  });
681
  }
682
+ } else if (options?.targetSelector) {
683
  jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
684
  if (!allNodes.includes(el)) {
685
  allNodes.push(el);
 
738
 
739
  const r = {
740
  ...snapshot,
741
+ title: snapshot.title || jsdom.window.document.title,
742
  parsed,
743
  html: rootDoc.documentElement.outerHTML,
744
  text: cleanedText,