Spaces:
Build error
Build error
fix: several crash cases
Browse files
- src/api/crawler.ts +6 -2
- src/services/pdf-extract.ts +21 -21
- src/services/puppeteer.ts +2 -2
- src/services/snapshot-formatter.ts +45 -25
src/api/crawler.ts
CHANGED
|
@@ -8,6 +8,7 @@ import {
|
|
| 8 |
AssertionFailureError, ParamValidationError,
|
| 9 |
RawString,
|
| 10 |
ApplicationError,
|
|
|
|
| 11 |
} from 'civkit/civ-rpc';
|
| 12 |
import { marshalErrorLike } from 'civkit/lang';
|
| 13 |
import { Defer } from 'civkit/defer';
|
|
@@ -817,7 +818,10 @@ export class CrawlerHost extends RPCHost {
|
|
| 817 |
}
|
| 818 |
} catch (err: any) {
|
| 819 |
this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
|
| 820 |
-
if (err instanceof ApplicationError &&
|
|
|
|
|
|
|
|
|
|
| 821 |
throw err;
|
| 822 |
}
|
| 823 |
}
|
|
@@ -968,7 +972,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 968 |
crawlOpts.targetSelector = [crawlOpts.targetSelector];
|
| 969 |
}
|
| 970 |
for (const s of crawlOpts.targetSelector) {
|
| 971 |
-
for (const e of s.split(',').map((x)=> x.trim())) {
|
| 972 |
if (e.startsWith('*') || e.startsWith(':') || e.includes('*:')) {
|
| 973 |
throw new ParamValidationError({
|
| 974 |
message: `Unacceptable selector: '${e}'. We cannot accept match-all selector for performance reasons. Sorry.`,
|
|
|
|
| 8 |
AssertionFailureError, ParamValidationError,
|
| 9 |
RawString,
|
| 10 |
ApplicationError,
|
| 11 |
+
DataStreamBrokenError,
|
| 12 |
} from 'civkit/civ-rpc';
|
| 13 |
import { marshalErrorLike } from 'civkit/lang';
|
| 14 |
import { Defer } from 'civkit/defer';
|
|
|
|
| 818 |
}
|
| 819 |
} catch (err: any) {
|
| 820 |
this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
|
| 821 |
+
if (err instanceof ApplicationError &&
|
| 822 |
+
!(err instanceof ServiceBadAttemptError) &&
|
| 823 |
+
!(err instanceof DataStreamBrokenError)
|
| 824 |
+
) {
|
| 825 |
throw err;
|
| 826 |
}
|
| 827 |
}
|
|
|
|
| 972 |
crawlOpts.targetSelector = [crawlOpts.targetSelector];
|
| 973 |
}
|
| 974 |
for (const s of crawlOpts.targetSelector) {
|
| 975 |
+
for (const e of s.split(',').map((x) => x.trim())) {
|
| 976 |
if (e.startsWith('*') || e.startsWith(':') || e.includes('*:')) {
|
| 977 |
throw new ParamValidationError({
|
| 978 |
message: `Unacceptable selector: '${e}'. We cannot accept match-all selector for performance reasons. Sorry.`,
|
src/services/pdf-extract.ts
CHANGED
|
@@ -2,7 +2,7 @@ import 'core-js/actual/promise/with-resolvers';
|
|
| 2 |
import { singleton } from 'tsyringe';
|
| 3 |
import _ from 'lodash';
|
| 4 |
import { TextItem } from 'pdfjs-dist/types/src/display/api';
|
| 5 |
-
import { AsyncService, HashManager } from 'civkit';
|
| 6 |
import { GlobalLogger } from './logger';
|
| 7 |
import { PDFContent } from '../db/pdf';
|
| 8 |
import dayjs from 'dayjs';
|
|
@@ -325,27 +325,27 @@ export class PDFExtractor extends AsyncService {
|
|
| 325 |
|
| 326 |
try {
|
| 327 |
extracted = await this.extract(data);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
}
|
| 346 |
-
} catch (err) {
|
| 347 |
-
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err });
|
| 348 |
-
throw err;
|
| 349 |
}
|
| 350 |
|
| 351 |
return extracted;
|
|
|
|
| 2 |
import { singleton } from 'tsyringe';
|
| 3 |
import _ from 'lodash';
|
| 4 |
import { TextItem } from 'pdfjs-dist/types/src/display/api';
|
| 5 |
+
import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
|
| 6 |
import { GlobalLogger } from './logger';
|
| 7 |
import { PDFContent } from '../db/pdf';
|
| 8 |
import dayjs from 'dayjs';
|
|
|
|
| 325 |
|
| 326 |
try {
|
| 327 |
extracted = await this.extract(data);
|
| 328 |
+
} catch (err: any) {
|
| 329 |
+
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err, url, nameUrl });
|
| 330 |
+
throw new AssertionFailureError(`Unable to process ${nameUrl} as pdf: ${err?.message}`);
|
| 331 |
+
}
|
| 332 |
|
| 333 |
+
if (!this.asyncLocalContext.ctx.DNT) {
|
| 334 |
+
const theID = randomUUID();
|
| 335 |
+
await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
|
| 336 |
+
Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
|
| 337 |
+
PDFContent.save(
|
| 338 |
+
PDFContent.from({
|
| 339 |
+
_id: theID,
|
| 340 |
+
src: nameUrl,
|
| 341 |
+
meta: extracted?.meta || {},
|
| 342 |
+
urlDigest: digest,
|
| 343 |
+
createdAt: new Date(),
|
| 344 |
+
expireAt: new Date(Date.now() + this.cacheRetentionMs)
|
| 345 |
+
}).degradeForFireStore()
|
| 346 |
+
).catch((r) => {
|
| 347 |
+
this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
|
| 348 |
+
});
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
}
|
| 350 |
|
| 351 |
return extracted;
|
src/services/puppeteer.ts
CHANGED
|
@@ -846,7 +846,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 846 |
const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
|
| 847 |
const ctx = this.lifeCycleTrack.get(page);
|
| 848 |
if (proxy && ctx) {
|
| 849 |
-
return this.asyncLocalContext.bridge(ctx, async () => {
|
| 850 |
try {
|
| 851 |
const curled = await this.curlControl.sideLoad(reqUrlParsed, {
|
| 852 |
...options,
|
|
@@ -890,7 +890,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 890 |
headers: _.omit(firstReq, 'result'),
|
| 891 |
}, 999);
|
| 892 |
} catch (err: any) {
|
| 893 |
-
this.logger.warn(`Failed to sideload ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err
|
| 894 |
}
|
| 895 |
if (req.isInterceptResolutionHandled()) {
|
| 896 |
return;
|
|
|
|
| 846 |
const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
|
| 847 |
const ctx = this.lifeCycleTrack.get(page);
|
| 848 |
if (proxy && ctx) {
|
| 849 |
+
return await this.asyncLocalContext.bridge(ctx, async () => {
|
| 850 |
try {
|
| 851 |
const curled = await this.curlControl.sideLoad(reqUrlParsed, {
|
| 852 |
...options,
|
|
|
|
| 890 |
headers: _.omit(firstReq, 'result'),
|
| 891 |
}, 999);
|
| 892 |
} catch (err: any) {
|
| 893 |
+
this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
|
| 894 |
}
|
| 895 |
if (req.isInterceptResolutionHandled()) {
|
| 896 |
return;
|
src/services/snapshot-formatter.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import { randomUUID } from 'crypto';
|
| 2 |
import { container, singleton } from 'tsyringe';
|
| 3 |
-
import { AssertionFailureError, AsyncService, FancyFile, HashManager, marshalErrorLike } from 'civkit';
|
| 4 |
import TurndownService, { Filter, Rule } from 'turndown';
|
| 5 |
import { GlobalLogger } from './logger';
|
| 6 |
import { PageSnapshot } from './puppeteer';
|
|
@@ -406,7 +406,7 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 406 |
const text = snapshot.statusText || STATUS_CODES[code];
|
| 407 |
formatted.warning ??= '';
|
| 408 |
const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
|
| 409 |
-
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
|
| 410 |
}
|
| 411 |
}
|
| 412 |
|
|
@@ -441,23 +441,23 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 441 |
formatted.warning ??= '';
|
| 442 |
if (snapshot.isIntermediate) {
|
| 443 |
const msg = 'This page maybe not yet fully loaded, consider explicitly specify a timeout.';
|
| 444 |
-
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
|
| 445 |
}
|
| 446 |
if (snapshot.childFrames?.length && !this.threadLocal.get('withIframe')) {
|
| 447 |
const msg = 'This page contains iframe that are currently hidden, consider enabling iframe processing.';
|
| 448 |
-
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
|
| 449 |
}
|
| 450 |
if (snapshot.shadowExpanded && !this.threadLocal.get('withShadowDom')) {
|
| 451 |
const msg = 'This page contains shadow DOM that are currently hidden, consider enabling shadow DOM processing.';
|
| 452 |
-
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
|
| 453 |
}
|
| 454 |
if (snapshot.html.includes('captcha') || snapshot.html.includes('cf-turnstile-response')) {
|
| 455 |
const msg = 'This page maybe requiring CAPTCHA, please make sure you are authorized to access this page.';
|
| 456 |
-
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
|
| 457 |
}
|
| 458 |
if (snapshot.isFromCache) {
|
| 459 |
const msg = 'This is a cached snapshot of the original page, consider retry with caching opt-out.';
|
| 460 |
-
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
|
| 461 |
}
|
| 462 |
}
|
| 463 |
|
|
@@ -565,7 +565,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 565 |
const text = snapshot.statusText || STATUS_CODES[code];
|
| 566 |
mixin.warning ??= '';
|
| 567 |
const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
|
| 568 |
-
mixin.warning = `${mixin.warning}${mixin.warning ? '\n': ''}${msg}`;
|
| 569 |
}
|
| 570 |
}
|
| 571 |
|
|
@@ -629,6 +629,21 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 629 |
}
|
| 630 |
}
|
| 631 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 632 |
turnDownService.addRule('improved-paragraph', {
|
| 633 |
filter: 'p',
|
| 634 |
replacement: (innerText) => {
|
|
@@ -751,27 +766,32 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 751 |
|
| 752 |
return snapshot;
|
| 753 |
}
|
| 754 |
-
|
| 755 |
-
if (
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
|
|
|
|
| 759 |
|
| 760 |
-
|
| 761 |
-
}
|
| 762 |
-
if (contentType.startsWith('text/') || contentType.startsWith('application/json')) {
|
| 763 |
-
if ((await file.size) > 1024 * 1024 * 32) {
|
| 764 |
-
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
|
| 765 |
}
|
| 766 |
-
|
| 767 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 768 |
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
| 773 |
|
| 774 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 775 |
}
|
| 776 |
|
| 777 |
throw new AssertionFailureError(`Failed to access ${url}: unexpected type ${contentType}`);
|
|
|
|
| 1 |
import { randomUUID } from 'crypto';
|
| 2 |
import { container, singleton } from 'tsyringe';
|
| 3 |
+
import { AssertionFailureError, AsyncService, DataStreamBrokenError, FancyFile, HashManager, marshalErrorLike } from 'civkit';
|
| 4 |
import TurndownService, { Filter, Rule } from 'turndown';
|
| 5 |
import { GlobalLogger } from './logger';
|
| 6 |
import { PageSnapshot } from './puppeteer';
|
|
|
|
| 406 |
const text = snapshot.statusText || STATUS_CODES[code];
|
| 407 |
formatted.warning ??= '';
|
| 408 |
const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
|
| 409 |
+
formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
|
| 410 |
}
|
| 411 |
}
|
| 412 |
|
|
|
|
| 441 |
formatted.warning ??= '';
|
| 442 |
if (snapshot.isIntermediate) {
|
| 443 |
const msg = 'This page maybe not yet fully loaded, consider explicitly specify a timeout.';
|
| 444 |
+
formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
|
| 445 |
}
|
| 446 |
if (snapshot.childFrames?.length && !this.threadLocal.get('withIframe')) {
|
| 447 |
const msg = 'This page contains iframe that are currently hidden, consider enabling iframe processing.';
|
| 448 |
+
formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
|
| 449 |
}
|
| 450 |
if (snapshot.shadowExpanded && !this.threadLocal.get('withShadowDom')) {
|
| 451 |
const msg = 'This page contains shadow DOM that are currently hidden, consider enabling shadow DOM processing.';
|
| 452 |
+
formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
|
| 453 |
}
|
| 454 |
if (snapshot.html.includes('captcha') || snapshot.html.includes('cf-turnstile-response')) {
|
| 455 |
const msg = 'This page maybe requiring CAPTCHA, please make sure you are authorized to access this page.';
|
| 456 |
+
formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
|
| 457 |
}
|
| 458 |
if (snapshot.isFromCache) {
|
| 459 |
const msg = 'This is a cached snapshot of the original page, consider retry with caching opt-out.';
|
| 460 |
+
formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
|
| 461 |
}
|
| 462 |
}
|
| 463 |
|
|
|
|
| 565 |
const text = snapshot.statusText || STATUS_CODES[code];
|
| 566 |
mixin.warning ??= '';
|
| 567 |
const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
|
| 568 |
+
mixin.warning = `${mixin.warning}${mixin.warning ? '\n' : ''}${msg}`;
|
| 569 |
}
|
| 570 |
}
|
| 571 |
|
|
|
|
| 629 |
}
|
| 630 |
}
|
| 631 |
|
| 632 |
+
turnDownService.addRule('improved-heading', {
|
| 633 |
+
filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'],
|
| 634 |
+
replacement: (content, node, options) => {
|
| 635 |
+
const hLevel = Number(node.nodeName.charAt(1));
|
| 636 |
+
if (options.headingStyle === 'setext' && hLevel < 3) {
|
| 637 |
+
const underline = _.repeat((hLevel === 1 ? '=' : '-'), Math.min(128, content.length));
|
| 638 |
+
return (
|
| 639 |
+
'\n\n' + content + '\n' + underline + '\n\n'
|
| 640 |
+
);
|
| 641 |
+
} else {
|
| 642 |
+
return '\n\n' + _.repeat('#', hLevel) + ' ' + content + '\n\n';
|
| 643 |
+
}
|
| 644 |
+
}
|
| 645 |
+
});
|
| 646 |
+
|
| 647 |
turnDownService.addRule('improved-paragraph', {
|
| 648 |
filter: 'p',
|
| 649 |
replacement: (innerText) => {
|
|
|
|
| 766 |
|
| 767 |
return snapshot;
|
| 768 |
}
|
| 769 |
+
try {
|
| 770 |
+
if (contentType.startsWith('text/html')) {
|
| 771 |
+
if ((await file.size) > 1024 * 1024 * 32) {
|
| 772 |
+
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
|
| 773 |
+
}
|
| 774 |
+
snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' });
|
| 775 |
|
| 776 |
+
return snapshot;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 777 |
}
|
| 778 |
+
if (contentType.startsWith('text/') || contentType.startsWith('application/json')) {
|
| 779 |
+
if ((await file.size) > 1024 * 1024 * 32) {
|
| 780 |
+
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
|
| 781 |
+
}
|
| 782 |
+
snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' });
|
| 783 |
+
snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
|
| 784 |
|
| 785 |
+
return snapshot;
|
| 786 |
+
}
|
| 787 |
+
if (contentType.startsWith('application/pdf')) {
|
| 788 |
+
snapshot.pdfs = [pathToFileURL(await file.filePath).href];
|
| 789 |
|
| 790 |
+
return snapshot;
|
| 791 |
+
}
|
| 792 |
+
} catch (err: any) {
|
| 793 |
+
this.logger.warn(`Failed to read from file: ${url}`, { err, url });
|
| 794 |
+
throw new DataStreamBrokenError(`Failed to access ${url}: ${err?.message}`);
|
| 795 |
}
|
| 796 |
|
| 797 |
throw new AssertionFailureError(`Failed to access ${url}: unexpected type ${contentType}`);
|