nomagick committed on
Commit
d0e20cc
·
unverified ·
1 Parent(s): 6b9e14d

fix: several crash cases

Browse files
src/api/crawler.ts CHANGED
@@ -8,6 +8,7 @@ import {
8
  AssertionFailureError, ParamValidationError,
9
  RawString,
10
  ApplicationError,
 
11
  } from 'civkit/civ-rpc';
12
  import { marshalErrorLike } from 'civkit/lang';
13
  import { Defer } from 'civkit/defer';
@@ -817,7 +818,10 @@ export class CrawlerHost extends RPCHost {
817
  }
818
  } catch (err: any) {
819
  this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
820
- if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
 
 
 
821
  throw err;
822
  }
823
  }
@@ -968,7 +972,7 @@ export class CrawlerHost extends RPCHost {
968
  crawlOpts.targetSelector = [crawlOpts.targetSelector];
969
  }
970
  for (const s of crawlOpts.targetSelector) {
971
- for (const e of s.split(',').map((x)=> x.trim())) {
972
  if (e.startsWith('*') || e.startsWith(':') || e.includes('*:')) {
973
  throw new ParamValidationError({
974
  message: `Unacceptable selector: '${e}'. We cannot accept match-all selector for performance reasons. Sorry.`,
 
8
  AssertionFailureError, ParamValidationError,
9
  RawString,
10
  ApplicationError,
11
+ DataStreamBrokenError,
12
  } from 'civkit/civ-rpc';
13
  import { marshalErrorLike } from 'civkit/lang';
14
  import { Defer } from 'civkit/defer';
 
818
  }
819
  } catch (err: any) {
820
  this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
821
+ if (err instanceof ApplicationError &&
822
+ !(err instanceof ServiceBadAttemptError) &&
823
+ !(err instanceof DataStreamBrokenError)
824
+ ) {
825
  throw err;
826
  }
827
  }
 
972
  crawlOpts.targetSelector = [crawlOpts.targetSelector];
973
  }
974
  for (const s of crawlOpts.targetSelector) {
975
+ for (const e of s.split(',').map((x) => x.trim())) {
976
  if (e.startsWith('*') || e.startsWith(':') || e.includes('*:')) {
977
  throw new ParamValidationError({
978
  message: `Unacceptable selector: '${e}'. We cannot accept match-all selector for performance reasons. Sorry.`,
src/services/pdf-extract.ts CHANGED
@@ -2,7 +2,7 @@ import 'core-js/actual/promise/with-resolvers';
2
  import { singleton } from 'tsyringe';
3
  import _ from 'lodash';
4
  import { TextItem } from 'pdfjs-dist/types/src/display/api';
5
- import { AsyncService, HashManager } from 'civkit';
6
  import { GlobalLogger } from './logger';
7
  import { PDFContent } from '../db/pdf';
8
  import dayjs from 'dayjs';
@@ -325,27 +325,27 @@ export class PDFExtractor extends AsyncService {
325
 
326
  try {
327
  extracted = await this.extract(data);
 
 
 
 
328
 
329
- if (!this.asyncLocalContext.ctx.DNT) {
330
- const theID = randomUUID();
331
- await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
332
- Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
333
- PDFContent.save(
334
- PDFContent.from({
335
- _id: theID,
336
- src: nameUrl,
337
- meta: extracted?.meta || {},
338
- urlDigest: digest,
339
- createdAt: new Date(),
340
- expireAt: new Date(Date.now() + this.cacheRetentionMs)
341
- }).degradeForFireStore()
342
- ).catch((r) => {
343
- this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
344
- });
345
- }
346
- } catch (err) {
347
- this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err });
348
- throw err;
349
  }
350
 
351
  return extracted;
 
2
  import { singleton } from 'tsyringe';
3
  import _ from 'lodash';
4
  import { TextItem } from 'pdfjs-dist/types/src/display/api';
5
+ import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
6
  import { GlobalLogger } from './logger';
7
  import { PDFContent } from '../db/pdf';
8
  import dayjs from 'dayjs';
 
325
 
326
  try {
327
  extracted = await this.extract(data);
328
+ } catch (err: any) {
329
+ this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err, url, nameUrl });
330
+ throw new AssertionFailureError(`Unable to process ${nameUrl} as pdf: ${err?.message}`);
331
+ }
332
 
333
+ if (!this.asyncLocalContext.ctx.DNT) {
334
+ const theID = randomUUID();
335
+ await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
336
+ Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
337
+ PDFContent.save(
338
+ PDFContent.from({
339
+ _id: theID,
340
+ src: nameUrl,
341
+ meta: extracted?.meta || {},
342
+ urlDigest: digest,
343
+ createdAt: new Date(),
344
+ expireAt: new Date(Date.now() + this.cacheRetentionMs)
345
+ }).degradeForFireStore()
346
+ ).catch((r) => {
347
+ this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
348
+ });
 
 
 
 
349
  }
350
 
351
  return extracted;
src/services/puppeteer.ts CHANGED
@@ -846,7 +846,7 @@ export class PuppeteerControl extends AsyncService {
846
  const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
847
  const ctx = this.lifeCycleTrack.get(page);
848
  if (proxy && ctx) {
849
- return this.asyncLocalContext.bridge(ctx, async () => {
850
  try {
851
  const curled = await this.curlControl.sideLoad(reqUrlParsed, {
852
  ...options,
@@ -890,7 +890,7 @@ export class PuppeteerControl extends AsyncService {
890
  headers: _.omit(firstReq, 'result'),
891
  }, 999);
892
  } catch (err: any) {
893
- this.logger.warn(`Failed to sideload ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err: marshalErrorLike(err) });
894
  }
895
  if (req.isInterceptResolutionHandled()) {
896
  return;
 
846
  const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
847
  const ctx = this.lifeCycleTrack.get(page);
848
  if (proxy && ctx) {
849
+ return await this.asyncLocalContext.bridge(ctx, async () => {
850
  try {
851
  const curled = await this.curlControl.sideLoad(reqUrlParsed, {
852
  ...options,
 
890
  headers: _.omit(firstReq, 'result'),
891
  }, 999);
892
  } catch (err: any) {
893
+ this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
894
  }
895
  if (req.isInterceptResolutionHandled()) {
896
  return;
src/services/snapshot-formatter.ts CHANGED
@@ -1,6 +1,6 @@
1
  import { randomUUID } from 'crypto';
2
  import { container, singleton } from 'tsyringe';
3
- import { AssertionFailureError, AsyncService, FancyFile, HashManager, marshalErrorLike } from 'civkit';
4
  import TurndownService, { Filter, Rule } from 'turndown';
5
  import { GlobalLogger } from './logger';
6
  import { PageSnapshot } from './puppeteer';
@@ -406,7 +406,7 @@ export class SnapshotFormatter extends AsyncService {
406
  const text = snapshot.statusText || STATUS_CODES[code];
407
  formatted.warning ??= '';
408
  const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
409
- formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
410
  }
411
  }
412
 
@@ -441,23 +441,23 @@ export class SnapshotFormatter extends AsyncService {
441
  formatted.warning ??= '';
442
  if (snapshot.isIntermediate) {
443
  const msg = 'This page maybe not yet fully loaded, consider explicitly specify a timeout.';
444
- formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
445
  }
446
  if (snapshot.childFrames?.length && !this.threadLocal.get('withIframe')) {
447
  const msg = 'This page contains iframe that are currently hidden, consider enabling iframe processing.';
448
- formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
449
  }
450
  if (snapshot.shadowExpanded && !this.threadLocal.get('withShadowDom')) {
451
  const msg = 'This page contains shadow DOM that are currently hidden, consider enabling shadow DOM processing.';
452
- formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
453
  }
454
  if (snapshot.html.includes('captcha') || snapshot.html.includes('cf-turnstile-response')) {
455
  const msg = 'This page maybe requiring CAPTCHA, please make sure you are authorized to access this page.';
456
- formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
457
  }
458
  if (snapshot.isFromCache) {
459
  const msg = 'This is a cached snapshot of the original page, consider retry with caching opt-out.';
460
- formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
461
  }
462
  }
463
 
@@ -565,7 +565,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
565
  const text = snapshot.statusText || STATUS_CODES[code];
566
  mixin.warning ??= '';
567
  const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
568
- mixin.warning = `${mixin.warning}${mixin.warning ? '\n': ''}${msg}`;
569
  }
570
  }
571
 
@@ -629,6 +629,21 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
629
  }
630
  }
631
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
632
  turnDownService.addRule('improved-paragraph', {
633
  filter: 'p',
634
  replacement: (innerText) => {
@@ -751,27 +766,32 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
751
 
752
  return snapshot;
753
  }
754
- if (contentType.startsWith('text/html')) {
755
- if ((await file.size) > 1024 * 1024 * 32) {
756
- throw new AssertionFailureError(`Failed to access ${url}: file too large`);
757
- }
758
- snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' });
 
759
 
760
- return snapshot;
761
- }
762
- if (contentType.startsWith('text/') || contentType.startsWith('application/json')) {
763
- if ((await file.size) > 1024 * 1024 * 32) {
764
- throw new AssertionFailureError(`Failed to access ${url}: file too large`);
765
  }
766
- snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' });
767
- snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
 
 
 
 
768
 
769
- return snapshot;
770
- }
771
- if (contentType.startsWith('application/pdf')) {
772
- snapshot.pdfs = [pathToFileURL(await file.filePath).href];
773
 
774
- return snapshot;
 
 
 
 
775
  }
776
 
777
  throw new AssertionFailureError(`Failed to access ${url}: unexpected type ${contentType}`);
 
1
  import { randomUUID } from 'crypto';
2
  import { container, singleton } from 'tsyringe';
3
+ import { AssertionFailureError, AsyncService, DataStreamBrokenError, FancyFile, HashManager, marshalErrorLike } from 'civkit';
4
  import TurndownService, { Filter, Rule } from 'turndown';
5
  import { GlobalLogger } from './logger';
6
  import { PageSnapshot } from './puppeteer';
 
406
  const text = snapshot.statusText || STATUS_CODES[code];
407
  formatted.warning ??= '';
408
  const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
409
+ formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
410
  }
411
  }
412
 
 
441
  formatted.warning ??= '';
442
  if (snapshot.isIntermediate) {
443
  const msg = 'This page maybe not yet fully loaded, consider explicitly specify a timeout.';
444
+ formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
445
  }
446
  if (snapshot.childFrames?.length && !this.threadLocal.get('withIframe')) {
447
  const msg = 'This page contains iframe that are currently hidden, consider enabling iframe processing.';
448
+ formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
449
  }
450
  if (snapshot.shadowExpanded && !this.threadLocal.get('withShadowDom')) {
451
  const msg = 'This page contains shadow DOM that are currently hidden, consider enabling shadow DOM processing.';
452
+ formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
453
  }
454
  if (snapshot.html.includes('captcha') || snapshot.html.includes('cf-turnstile-response')) {
455
  const msg = 'This page maybe requiring CAPTCHA, please make sure you are authorized to access this page.';
456
+ formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
457
  }
458
  if (snapshot.isFromCache) {
459
  const msg = 'This is a cached snapshot of the original page, consider retry with caching opt-out.';
460
+ formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
461
  }
462
  }
463
 
 
565
  const text = snapshot.statusText || STATUS_CODES[code];
566
  mixin.warning ??= '';
567
  const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
568
+ mixin.warning = `${mixin.warning}${mixin.warning ? '\n' : ''}${msg}`;
569
  }
570
  }
571
 
 
629
  }
630
  }
631
 
632
+ turnDownService.addRule('improved-heading', {
633
+ filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'],
634
+ replacement: (content, node, options) => {
635
+ const hLevel = Number(node.nodeName.charAt(1));
636
+ if (options.headingStyle === 'setext' && hLevel < 3) {
637
+ const underline = _.repeat((hLevel === 1 ? '=' : '-'), Math.min(128, content.length));
638
+ return (
639
+ '\n\n' + content + '\n' + underline + '\n\n'
640
+ );
641
+ } else {
642
+ return '\n\n' + _.repeat('#', hLevel) + ' ' + content + '\n\n';
643
+ }
644
+ }
645
+ });
646
+
647
  turnDownService.addRule('improved-paragraph', {
648
  filter: 'p',
649
  replacement: (innerText) => {
 
766
 
767
  return snapshot;
768
  }
769
+ try {
770
+ if (contentType.startsWith('text/html')) {
771
+ if ((await file.size) > 1024 * 1024 * 32) {
772
+ throw new AssertionFailureError(`Failed to access ${url}: file too large`);
773
+ }
774
+ snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' });
775
 
776
+ return snapshot;
 
 
 
 
777
  }
778
+ if (contentType.startsWith('text/') || contentType.startsWith('application/json')) {
779
+ if ((await file.size) > 1024 * 1024 * 32) {
780
+ throw new AssertionFailureError(`Failed to access ${url}: file too large`);
781
+ }
782
+ snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' });
783
+ snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
784
 
785
+ return snapshot;
786
+ }
787
+ if (contentType.startsWith('application/pdf')) {
788
+ snapshot.pdfs = [pathToFileURL(await file.filePath).href];
789
 
790
+ return snapshot;
791
+ }
792
+ } catch (err: any) {
793
+ this.logger.warn(`Failed to read from file: ${url}`, { err, url });
794
+ throw new DataStreamBrokenError(`Failed to access ${url}: ${err?.message}`);
795
  }
796
 
797
  throw new AssertionFailureError(`Failed to access ${url}: unexpected type ${contentType}`);