nomagick committed on
Commit
c36aa73
·
unverified ·
1 Parent(s): e27bcac

fix: target selector

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -276,7 +276,7 @@ export class CrawlerHost extends RPCHost {
276
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
277
  for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
278
  lastScrapped = scrapped;
279
- if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
280
  continue;
281
  }
282
 
@@ -287,12 +287,15 @@ export class CrawlerHost extends RPCHost {
287
  return formatted;
288
  }
289
 
290
- if (chargeAmount && scrapped.pdfs?.length) {
291
  return formatted;
292
  }
293
  }
294
 
295
  if (!lastScrapped) {
 
 
 
296
  throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
297
  }
298
 
@@ -304,7 +307,7 @@ export class CrawlerHost extends RPCHost {
304
 
305
  for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
306
  lastScrapped = scrapped;
307
- if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
308
  continue;
309
  }
310
 
@@ -330,6 +333,9 @@ export class CrawlerHost extends RPCHost {
330
  }
331
 
332
  if (!lastScrapped) {
 
 
 
333
  throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
334
  }
335
 
 
276
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
277
  for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
278
  lastScrapped = scrapped;
279
+ if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
280
  continue;
281
  }
282
 
 
287
  return formatted;
288
  }
289
 
290
+ if (chargeAmount && scrapped?.pdfs?.length) {
291
  return formatted;
292
  }
293
  }
294
 
295
  if (!lastScrapped) {
296
+ if (crawlOpts.targetSelector) {
297
+ throw new AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`);
298
+ }
299
  throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
300
  }
301
 
 
307
 
308
  for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
309
  lastScrapped = scrapped;
310
+ if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
311
  continue;
312
  }
313
 
 
333
  }
334
 
335
  if (!lastScrapped) {
336
+ if (crawlOpts.targetSelector) {
337
+ throw new AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`);
338
+ }
339
  throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
340
  }
341
 
backend/functions/src/services/jsdom.ts CHANGED
@@ -78,7 +78,9 @@ export class JSDomControl extends AsyncService {
78
  jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
79
  }
80
 
 
81
  if (Array.isArray(options?.targetSelector)) {
 
82
  for (const x of options!.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
83
  x.forEach((el) => {
84
  if (!allNodes.includes(el)) {
@@ -87,6 +89,7 @@ export class JSDomControl extends AsyncService {
87
  });
88
  }
89
  } else if (options?.targetSelector) {
 
90
  jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
91
  if (!allNodes.includes(el)) {
92
  allNodes.push(el);
@@ -97,6 +100,11 @@ export class JSDomControl extends AsyncService {
97
  }
98
 
99
  if (!allNodes.length) {
 
 
 
 
 
100
  return snapshot;
101
  }
102
  const textChunks: string[] = [];
 
78
  jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
79
  }
80
 
81
+ let bewareTargetContentDoesNotExist = false;
82
  if (Array.isArray(options?.targetSelector)) {
83
+ bewareTargetContentDoesNotExist = true;
84
  for (const x of options!.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
85
  x.forEach((el) => {
86
  if (!allNodes.includes(el)) {
 
89
  });
90
  }
91
  } else if (options?.targetSelector) {
92
+ bewareTargetContentDoesNotExist = true;
93
  jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
94
  if (!allNodes.includes(el)) {
95
  allNodes.push(el);
 
100
  }
101
 
102
  if (!allNodes.length) {
103
+
104
+ if (bewareTargetContentDoesNotExist) {
105
+ return undefined;
106
+ }
107
+
108
  return snapshot;
109
  }
110
  const textChunks: string[] = [];
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 9258853d626758cb14dce55ae4aeaaca9fc4cfd2
 
1
+ Subproject commit 4532694d769f75aabffa465565d6427a544c0d6a