Spaces: Build error
fix: target selector
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -276,7 +276,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 276 |
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
| 277 |
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
|
| 278 |
lastScrapped = scrapped;
|
| 279 |
-
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
| 280 |
continue;
|
| 281 |
}
|
| 282 |
|
|
@@ -287,12 +287,15 @@ export class CrawlerHost extends RPCHost {
|
|
| 287 |
return formatted;
|
| 288 |
}
|
| 289 |
|
| 290 |
-
if (chargeAmount && scrapped.pdfs?.length) {
|
| 291 |
return formatted;
|
| 292 |
}
|
| 293 |
}
|
| 294 |
|
| 295 |
if (!lastScrapped) {
|
|
|
|
|
|
|
|
|
|
| 296 |
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
| 297 |
}
|
| 298 |
|
|
@@ -304,7 +307,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 304 |
|
| 305 |
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
|
| 306 |
lastScrapped = scrapped;
|
| 307 |
-
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
| 308 |
continue;
|
| 309 |
}
|
| 310 |
|
|
@@ -330,6 +333,9 @@ export class CrawlerHost extends RPCHost {
|
|
| 330 |
}
|
| 331 |
|
| 332 |
if (!lastScrapped) {
|
|
|
|
|
|
|
|
|
|
| 333 |
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
| 334 |
}
|
| 335 |
|
|
|
|
| 276 |
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
| 277 |
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
|
| 278 |
lastScrapped = scrapped;
|
| 279 |
+
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
|
| 280 |
continue;
|
| 281 |
}
|
| 282 |
|
|
|
|
| 287 |
return formatted;
|
| 288 |
}
|
| 289 |
|
| 290 |
+
if (chargeAmount && scrapped?.pdfs?.length) {
|
| 291 |
return formatted;
|
| 292 |
}
|
| 293 |
}
|
| 294 |
|
| 295 |
if (!lastScrapped) {
|
| 296 |
+
if (crawlOpts.targetSelector) {
|
| 297 |
+
throw new AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`);
|
| 298 |
+
}
|
| 299 |
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
| 300 |
}
|
| 301 |
|
|
|
|
| 307 |
|
| 308 |
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
|
| 309 |
lastScrapped = scrapped;
|
| 310 |
+
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
|
| 311 |
continue;
|
| 312 |
}
|
| 313 |
|
|
|
|
| 333 |
}
|
| 334 |
|
| 335 |
if (!lastScrapped) {
|
| 336 |
+
if (crawlOpts.targetSelector) {
|
| 337 |
+
throw new AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`);
|
| 338 |
+
}
|
| 339 |
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
| 340 |
}
|
| 341 |
|
backend/functions/src/services/jsdom.ts
CHANGED
|
@@ -78,7 +78,9 @@ export class JSDomControl extends AsyncService {
|
|
| 78 |
jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
|
| 79 |
}
|
| 80 |
|
|
|
|
| 81 |
if (Array.isArray(options?.targetSelector)) {
|
|
|
|
| 82 |
for (const x of options!.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
|
| 83 |
x.forEach((el) => {
|
| 84 |
if (!allNodes.includes(el)) {
|
|
@@ -87,6 +89,7 @@ export class JSDomControl extends AsyncService {
|
|
| 87 |
});
|
| 88 |
}
|
| 89 |
} else if (options?.targetSelector) {
|
|
|
|
| 90 |
jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
|
| 91 |
if (!allNodes.includes(el)) {
|
| 92 |
allNodes.push(el);
|
|
@@ -97,6 +100,11 @@ export class JSDomControl extends AsyncService {
|
|
| 97 |
}
|
| 98 |
|
| 99 |
if (!allNodes.length) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
return snapshot;
|
| 101 |
}
|
| 102 |
const textChunks: string[] = [];
|
|
|
|
| 78 |
jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
|
| 79 |
}
|
| 80 |
|
| 81 |
+
let bewareTargetContentDoesNotExist = false;
|
| 82 |
if (Array.isArray(options?.targetSelector)) {
|
| 83 |
+
bewareTargetContentDoesNotExist = true;
|
| 84 |
for (const x of options!.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
|
| 85 |
x.forEach((el) => {
|
| 86 |
if (!allNodes.includes(el)) {
|
|
|
|
| 89 |
});
|
| 90 |
}
|
| 91 |
} else if (options?.targetSelector) {
|
| 92 |
+
bewareTargetContentDoesNotExist = true;
|
| 93 |
jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
|
| 94 |
if (!allNodes.includes(el)) {
|
| 95 |
allNodes.push(el);
|
|
|
|
| 100 |
}
|
| 101 |
|
| 102 |
if (!allNodes.length) {
|
| 103 |
+
|
| 104 |
+
if (bewareTargetContentDoesNotExist) {
|
| 105 |
+
return undefined;
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
return snapshot;
|
| 109 |
}
|
| 110 |
const textChunks: string[] = [];
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit 4532694d769f75aabffa465565d6427a544c0d6a
|