Spaces:
Build error
Build error
feat: pdf upload and ip check
Browse files- package-lock.json +4 -4
- package.json +1 -1
- src/api/crawler.ts +17 -60
- src/dto/crawler-options.ts +4 -3
- src/services/curl.ts +6 -5
- src/services/misc.ts +99 -0
- src/services/pdf-extract.ts +8 -8
- src/utils/ip.ts +167 -0
package-lock.json
CHANGED
|
@@ -17,7 +17,7 @@
|
|
| 17 |
"axios": "^1.3.3",
|
| 18 |
"bcrypt": "^5.1.0",
|
| 19 |
"busboy": "^1.6.0",
|
| 20 |
-
"civkit": "^0.
|
| 21 |
"core-js": "^3.37.1",
|
| 22 |
"cors": "^2.8.5",
|
| 23 |
"dayjs": "^1.11.9",
|
|
@@ -3989,9 +3989,9 @@
|
|
| 3989 |
}
|
| 3990 |
},
|
| 3991 |
"node_modules/civkit": {
|
| 3992 |
-
"version": "0.
|
| 3993 |
-
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.
|
| 3994 |
-
"integrity": "sha512-
|
| 3995 |
"license": "AGPL",
|
| 3996 |
"dependencies": {
|
| 3997 |
"lodash": "^4.17.21",
|
|
|
|
| 17 |
"axios": "^1.3.3",
|
| 18 |
"bcrypt": "^5.1.0",
|
| 19 |
"busboy": "^1.6.0",
|
| 20 |
+
"civkit": "^0.9.0-f7b0ca7",
|
| 21 |
"core-js": "^3.37.1",
|
| 22 |
"cors": "^2.8.5",
|
| 23 |
"dayjs": "^1.11.9",
|
|
|
|
| 3989 |
}
|
| 3990 |
},
|
| 3991 |
"node_modules/civkit": {
|
| 3992 |
+
"version": "0.9.0-f7b0ca7",
|
| 3993 |
+
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.9.0-f7b0ca7.tgz",
|
| 3994 |
+
"integrity": "sha512-WjF0zRY83Ewvx4fGs1O0PQD2Oyc/RlKCVGiO/LHdwEFwfldTqDE3XWdWv+brZ2GvsIsVVKVa+bEGP0SwJfrRXA==",
|
| 3995 |
"license": "AGPL",
|
| 3996 |
"dependencies": {
|
| 3997 |
"lodash": "^4.17.21",
|
package.json
CHANGED
|
@@ -26,7 +26,7 @@
|
|
| 26 |
"axios": "^1.3.3",
|
| 27 |
"bcrypt": "^5.1.0",
|
| 28 |
"busboy": "^1.6.0",
|
| 29 |
-
"civkit": "^0.
|
| 30 |
"core-js": "^3.37.1",
|
| 31 |
"cors": "^2.8.5",
|
| 32 |
"dayjs": "^1.11.9",
|
|
|
|
| 26 |
"axios": "^1.3.3",
|
| 27 |
"bcrypt": "^5.1.0",
|
| 28 |
"busboy": "^1.6.0",
|
| 29 |
+
"civkit": "^0.9.0-f7b0ca7",
|
| 30 |
"core-js": "^3.37.1",
|
| 31 |
"cors": "^2.8.5",
|
| 32 |
"dayjs": "^1.11.9",
|
src/api/crawler.ts
CHANGED
|
@@ -13,6 +13,7 @@ import {
|
|
| 13 |
import { marshalErrorLike } from 'civkit/lang';
|
| 14 |
import { Defer } from 'civkit/defer';
|
| 15 |
import { retryWith } from 'civkit/decorators';
|
|
|
|
| 16 |
|
| 17 |
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/crawler-options';
|
| 18 |
|
|
@@ -43,10 +44,8 @@ import { ProxyProvider } from '../shared/services/proxy-provider';
|
|
| 43 |
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
| 44 |
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
| 45 |
import { RobotsTxtService } from '../services/robots-text';
|
| 46 |
-
import {
|
| 47 |
-
import {
|
| 48 |
-
|
| 49 |
-
const normalizeUrl = require('@esm2cjs/normalize-url').default;
|
| 50 |
|
| 51 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 52 |
withIframe?: boolean | 'quoted';
|
|
@@ -92,6 +91,8 @@ export class CrawlerHost extends RPCHost {
|
|
| 92 |
protected rateLimitControl: RateLimitControl,
|
| 93 |
protected threadLocal: AsyncLocalContext,
|
| 94 |
protected robotsTxtService: RobotsTxtService,
|
|
|
|
|
|
|
| 95 |
) {
|
| 96 |
super(...arguments);
|
| 97 |
|
|
@@ -472,47 +473,28 @@ export class CrawlerHost extends RPCHost {
|
|
| 472 |
}
|
| 473 |
|
| 474 |
async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
|
| 475 |
-
let url: string;
|
| 476 |
|
| 477 |
const targetUrlFromGet = originPath.slice(1);
|
| 478 |
if (crawlerOptions.pdf) {
|
| 479 |
-
|
|
|
|
|
|
|
|
|
|
| 480 |
} else if (targetUrlFromGet) {
|
| 481 |
url = targetUrlFromGet.trim();
|
| 482 |
} else if (crawlerOptions.url) {
|
| 483 |
url = crawlerOptions.url.trim();
|
| 484 |
-
} else {
|
| 485 |
-
return null;
|
| 486 |
-
}
|
| 487 |
-
|
| 488 |
-
let result: URL;
|
| 489 |
-
try {
|
| 490 |
-
result = new URL(
|
| 491 |
-
normalizeUrl(
|
| 492 |
-
url,
|
| 493 |
-
{
|
| 494 |
-
stripWWW: false,
|
| 495 |
-
removeTrailingSlash: false,
|
| 496 |
-
removeSingleSlash: false,
|
| 497 |
-
sortQueryParameters: false,
|
| 498 |
-
}
|
| 499 |
-
)
|
| 500 |
-
);
|
| 501 |
-
} catch (err) {
|
| 502 |
-
throw new ParamValidationError({
|
| 503 |
-
message: `${err}`,
|
| 504 |
-
path: 'url'
|
| 505 |
-
});
|
| 506 |
}
|
| 507 |
|
| 508 |
-
if (!
|
| 509 |
throw new ParamValidationError({
|
| 510 |
-
message:
|
| 511 |
path: 'url'
|
| 512 |
});
|
| 513 |
}
|
| 514 |
|
| 515 |
-
|
| 516 |
if (this.puppeteerControl.circuitBreakerHosts.has(result.hostname.toLowerCase())) {
|
| 517 |
throw new SecurityCompromiseError({
|
| 518 |
message: `Circular hostname: ${result.protocol}`,
|
|
@@ -520,31 +502,6 @@ export class CrawlerHost extends RPCHost {
|
|
| 520 |
});
|
| 521 |
}
|
| 522 |
|
| 523 |
-
const isIp = isIP(result.hostname);
|
| 524 |
-
|
| 525 |
-
if (
|
| 526 |
-
(result.hostname === 'localhost') ||
|
| 527 |
-
(isIp && result.hostname.startsWith('127.'))
|
| 528 |
-
) {
|
| 529 |
-
throw new SecurityCompromiseError({
|
| 530 |
-
message: `Suspicious action: Request to localhost: ${result}`,
|
| 531 |
-
path: 'url'
|
| 532 |
-
});
|
| 533 |
-
}
|
| 534 |
-
|
| 535 |
-
if (!isIp && result.protocol !== 'blob:') {
|
| 536 |
-
await lookup(result.hostname).catch((err) => {
|
| 537 |
-
if (err.code === 'ENOTFOUND') {
|
| 538 |
-
return Promise.reject(new ParamValidationError({
|
| 539 |
-
message: `Domain '${result.hostname}' could not be resolved`,
|
| 540 |
-
path: 'url'
|
| 541 |
-
}));
|
| 542 |
-
}
|
| 543 |
-
|
| 544 |
-
return;
|
| 545 |
-
});
|
| 546 |
-
}
|
| 547 |
-
|
| 548 |
return result;
|
| 549 |
}
|
| 550 |
|
|
@@ -733,14 +690,14 @@ export class CrawlerHost extends RPCHost {
|
|
| 733 |
}
|
| 734 |
|
| 735 |
if (crawlerOpts?.pdf) {
|
| 736 |
-
const
|
| 737 |
-
const
|
| 738 |
const snapshot = {
|
| 739 |
href: urlToCrawl.toString(),
|
| 740 |
-
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${
|
| 741 |
title: '',
|
| 742 |
text: '',
|
| 743 |
-
pdfs: [
|
| 744 |
} as PageSnapshot;
|
| 745 |
|
| 746 |
yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
|
|
|
|
| 13 |
import { marshalErrorLike } from 'civkit/lang';
|
| 14 |
import { Defer } from 'civkit/defer';
|
| 15 |
import { retryWith } from 'civkit/decorators';
|
| 16 |
+
import { FancyFile } from 'civkit/fancy-file';
|
| 17 |
|
| 18 |
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/crawler-options';
|
| 19 |
|
|
|
|
| 44 |
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
| 45 |
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
| 46 |
import { RobotsTxtService } from '../services/robots-text';
|
| 47 |
+
import { TempFileManager } from '../services/temp-file';
|
| 48 |
+
import { MiscService } from '../services/misc';
|
|
|
|
|
|
|
| 49 |
|
| 50 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 51 |
withIframe?: boolean | 'quoted';
|
|
|
|
| 91 |
protected rateLimitControl: RateLimitControl,
|
| 92 |
protected threadLocal: AsyncLocalContext,
|
| 93 |
protected robotsTxtService: RobotsTxtService,
|
| 94 |
+
protected tempFileManager: TempFileManager,
|
| 95 |
+
protected miscService: MiscService,
|
| 96 |
) {
|
| 97 |
super(...arguments);
|
| 98 |
|
|
|
|
| 473 |
}
|
| 474 |
|
| 475 |
async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
|
| 476 |
+
let url: string = '';
|
| 477 |
|
| 478 |
const targetUrlFromGet = originPath.slice(1);
|
| 479 |
if (crawlerOptions.pdf) {
|
| 480 |
+
const pdfFile = crawlerOptions.pdf;
|
| 481 |
+
const identifier = pdfFile instanceof FancyFile ? (await pdfFile.sha256Sum) : randomUUID();
|
| 482 |
+
url = `blob://pdf/${identifier}`;
|
| 483 |
+
crawlerOptions.url ??= url;
|
| 484 |
} else if (targetUrlFromGet) {
|
| 485 |
url = targetUrlFromGet.trim();
|
| 486 |
} else if (crawlerOptions.url) {
|
| 487 |
url = crawlerOptions.url.trim();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
}
|
| 489 |
|
| 490 |
+
if (!url) {
|
| 491 |
throw new ParamValidationError({
|
| 492 |
+
message: 'No URL provided',
|
| 493 |
path: 'url'
|
| 494 |
});
|
| 495 |
}
|
| 496 |
|
| 497 |
+
const result = await this.miscService.assertNormalizedUrl(url);
|
| 498 |
if (this.puppeteerControl.circuitBreakerHosts.has(result.hostname.toLowerCase())) {
|
| 499 |
throw new SecurityCompromiseError({
|
| 500 |
message: `Circular hostname: ${result.protocol}`,
|
|
|
|
| 502 |
});
|
| 503 |
}
|
| 504 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 505 |
return result;
|
| 506 |
}
|
| 507 |
|
|
|
|
| 690 |
}
|
| 691 |
|
| 692 |
if (crawlerOpts?.pdf) {
|
| 693 |
+
const pdfFile = crawlerOpts.pdf instanceof FancyFile ? crawlerOpts.pdf : this.tempFileManager.cacheBuffer(Buffer.from(crawlerOpts.pdf, 'base64'));
|
| 694 |
+
const pdfLocalPath = pathToFileURL((await pdfFile.filePath));
|
| 695 |
const snapshot = {
|
| 696 |
href: urlToCrawl.toString(),
|
| 697 |
+
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${crawlerOpts.url}"></body></html>`,
|
| 698 |
title: '',
|
| 699 |
text: '',
|
| 700 |
+
pdfs: [pdfLocalPath.href],
|
| 701 |
} as PageSnapshot;
|
| 702 |
|
| 703 |
yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
|
src/dto/crawler-options.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
-
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit';
|
|
|
|
| 2 |
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
| 3 |
import { Context } from '../services/registry';
|
| 4 |
import { TurnDownTweakableOptions } from './turndown-tweakable-options';
|
|
@@ -277,9 +278,9 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 277 |
|
| 278 |
@Prop({
|
| 279 |
desc: 'Base64 encoded PDF.',
|
| 280 |
-
type: [
|
| 281 |
})
|
| 282 |
-
pdf?:
|
| 283 |
|
| 284 |
@Prop({
|
| 285 |
default: CONTENT_FORMAT.CONTENT,
|
|
|
|
| 1 |
+
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit/civ-rpc';
|
| 2 |
+
import { FancyFile } from 'civkit/fancy-file';
|
| 3 |
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
| 4 |
import { Context } from '../services/registry';
|
| 5 |
import { TurnDownTweakableOptions } from './turndown-tweakable-options';
|
|
|
|
| 278 |
|
| 279 |
@Prop({
|
| 280 |
desc: 'Base64 encoded PDF.',
|
| 281 |
+
type: [FancyFile, String]
|
| 282 |
})
|
| 283 |
+
pdf?: FancyFile | string;
|
| 284 |
|
| 285 |
@Prop({
|
| 286 |
default: CONTENT_FORMAT.CONTENT,
|
src/services/curl.ts
CHANGED
|
@@ -109,6 +109,8 @@ export class CurlControl extends AsyncService {
|
|
| 109 |
curl.setOpt(Curl.option.SSL_VERIFYPEER, false);
|
| 110 |
curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts?.timeoutMs || 30_000);
|
| 111 |
curl.setOpt(Curl.option.CONNECTTIMEOUT_MS, 3_000);
|
|
|
|
|
|
|
| 112 |
if (crawlOpts?.method) {
|
| 113 |
curl.setOpt(Curl.option.CUSTOMREQUEST, crawlOpts.method.toUpperCase());
|
| 114 |
}
|
|
@@ -401,12 +403,12 @@ export class CurlControl extends AsyncService {
|
|
| 401 |
digestCurlCode(code: CurlCode, msg: string) {
|
| 402 |
switch (code) {
|
| 403 |
// 400 User errors
|
| 404 |
-
case CurlCode.CURLE_COULDNT_RESOLVE_HOST:
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
}
|
| 408 |
|
| 409 |
// Maybe retry but dont retry with curl again
|
|
|
|
| 410 |
case CurlCode.CURLE_UNSUPPORTED_PROTOCOL:
|
| 411 |
case CurlCode.CURLE_PEER_FAILED_VERIFICATION: {
|
| 412 |
return new ServiceBadApproachError(msg);
|
|
@@ -417,7 +419,6 @@ export class CurlControl extends AsyncService {
|
|
| 417 |
case CurlCode.CURLE_SEND_ERROR:
|
| 418 |
case CurlCode.CURLE_RECV_ERROR:
|
| 419 |
case CurlCode.CURLE_GOT_NOTHING:
|
| 420 |
-
case CurlCode.CURLE_OPERATION_TIMEDOUT:
|
| 421 |
case CurlCode.CURLE_SSL_CONNECT_ERROR:
|
| 422 |
case CurlCode.CURLE_QUIC_CONNECT_ERROR:
|
| 423 |
case CurlCode.CURLE_COULDNT_RESOLVE_PROXY:
|
|
|
|
| 109 |
curl.setOpt(Curl.option.SSL_VERIFYPEER, false);
|
| 110 |
curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts?.timeoutMs || 30_000);
|
| 111 |
curl.setOpt(Curl.option.CONNECTTIMEOUT_MS, 3_000);
|
| 112 |
+
curl.setOpt(Curl.option.LOW_SPEED_LIMIT, 32768);
|
| 113 |
+
curl.setOpt(Curl.option.LOW_SPEED_TIME, 5_000);
|
| 114 |
if (crawlOpts?.method) {
|
| 115 |
curl.setOpt(Curl.option.CUSTOMREQUEST, crawlOpts.method.toUpperCase());
|
| 116 |
}
|
|
|
|
| 403 |
digestCurlCode(code: CurlCode, msg: string) {
|
| 404 |
switch (code) {
|
| 405 |
// 400 User errors
|
| 406 |
+
case CurlCode.CURLE_COULDNT_RESOLVE_HOST: {
|
| 407 |
+
return new AssertionFailureError(msg);
|
| 408 |
+
}
|
|
|
|
| 409 |
|
| 410 |
// Maybe retry but dont retry with curl again
|
| 411 |
+
case CurlCode.CURLE_OPERATION_TIMEDOUT:
|
| 412 |
case CurlCode.CURLE_UNSUPPORTED_PROTOCOL:
|
| 413 |
case CurlCode.CURLE_PEER_FAILED_VERIFICATION: {
|
| 414 |
return new ServiceBadApproachError(msg);
|
|
|
|
| 419 |
case CurlCode.CURLE_SEND_ERROR:
|
| 420 |
case CurlCode.CURLE_RECV_ERROR:
|
| 421 |
case CurlCode.CURLE_GOT_NOTHING:
|
|
|
|
| 422 |
case CurlCode.CURLE_SSL_CONNECT_ERROR:
|
| 423 |
case CurlCode.CURLE_QUIC_CONNECT_ERROR:
|
| 424 |
case CurlCode.CURLE_COULDNT_RESOLVE_PROXY:
|
src/services/misc.ts
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { singleton } from 'tsyringe';
|
| 2 |
+
import { AsyncService } from 'civkit/async-service';
|
| 3 |
+
import { ParamValidationError } from 'civkit/civ-rpc';
|
| 4 |
+
import { SecurityCompromiseError } from '../shared/lib/errors';
|
| 5 |
+
import { isIP } from 'node:net';
|
| 6 |
+
import { isIPInNonPublicRange } from '../utils/ip';
|
| 7 |
+
import { GlobalLogger } from './logger';
|
| 8 |
+
import { lookup } from 'node:dns/promises';
|
| 9 |
+
import { Threaded } from './threaded';
|
| 10 |
+
|
| 11 |
+
const normalizeUrl = require('@esm2cjs/normalize-url').default;
|
| 12 |
+
|
| 13 |
+
@singleton()
export class MiscService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: GlobalLogger,
    ) {
        super(...arguments);
    }

    /** Waits for injected dependencies, then announces readiness. */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Normalizes `input` into a `URL` and rejects targets that must not be fetched.
     *
     * Checks, in order:
     *  1. the input must normalize and parse as a URL;
     *  2. the protocol must be `http:`, `https:` or `blob:`;
     *  3. the host must not be `localhost` or an IP literal in a non-public range;
     *  4. for non-IP, non-blob hosts, none of the DNS-resolved addresses may be
     *     in a non-public range.
     *
     * @param input Raw URL text.
     * @returns The normalized, parsed URL.
     * @throws ParamValidationError if the URL cannot be parsed, uses another
     *         protocol, or its domain does not resolve (`ENOTFOUND`).
     * @throws SecurityCompromiseError if the host is, or resolves to, a
     *         non-public address.
     */
    @Threaded()
    async assertNormalizedUrl(input: string) {
        let parsedUrl: URL;
        try {
            const normalized = normalizeUrl(input, {
                stripWWW: false,
                removeTrailingSlash: false,
                removeSingleSlash: false,
                sortQueryParameters: false,
            });
            parsedUrl = new URL(normalized);
        } catch (err) {
            throw new ParamValidationError({
                message: `${err}`,
                path: 'url'
            });
        }

        const allowedProtocols = ['http:', 'https:', 'blob:'];
        if (!allowedProtocols.includes(parsedUrl.protocol)) {
            throw new ParamValidationError({
                message: `Invalid protocol ${parsedUrl.protocol}`,
                path: 'url'
            });
        }

        const { hostname } = parsedUrl;
        // IPv6 literals appear as "[...]" in URL hostnames; drop the brackets before IP checks.
        const bareHost = hostname.startsWith('[') ? hostname.slice(1, -1) : hostname;
        // isIP yields 0 for non-IP text, otherwise 4 or 6.
        const ipFamily = isIP(bareHost);

        if (hostname === 'localhost' || (ipFamily && isIPInNonPublicRange(bareHost))) {
            const message = `Suspicious action: Request to localhost or non-public IP: ${bareHost}`;
            this.logger.warn(message, { href: parsedUrl.href });
            throw new SecurityCompromiseError({
                message,
                path: 'url'
            });
        }

        // IP literals were vetted above, and blob: URLs have no host to resolve.
        if (ipFamily || parsedUrl.protocol === 'blob:') {
            return parsedUrl;
        }

        // Only ENOTFOUND is surfaced to the caller; any other resolver failure
        // leaves `addresses` undefined and the address check is skipped.
        const addresses = await lookup(hostname, { all: true }).catch((err) => {
            if (err.code === 'ENOTFOUND') {
                throw new ParamValidationError({
                    message: `Domain '${hostname}' could not be resolved`,
                    path: 'url'
                });
            }

            return undefined;
        });

        const offending = addresses?.find((entry) => isIPInNonPublicRange(entry.address));
        if (offending) {
            this.logger.warn(
                `Suspicious action: Domain resolved to non-public IP: ${hostname} => ${offending.address}`,
                { href: parsedUrl.href, ip: offending.address }
            );
            throw new SecurityCompromiseError({
                message: `Suspicious action: Domain resolved to non-public IP: ${offending.address}`,
                path: 'url'
            });
        }

        return parsedUrl;
    }

}
|
src/services/pdf-extract.ts
CHANGED
|
@@ -274,19 +274,19 @@ export class PDFExtractor extends AsyncService {
|
|
| 274 |
return { meta: meta.info as Record<string, any>, content: mdChunks.join(''), text: rawChunks.join('') };
|
| 275 |
}
|
| 276 |
|
| 277 |
-
async cachedExtract(url: string
|
| 278 |
if (!url) {
|
| 279 |
return undefined;
|
| 280 |
}
|
| 281 |
-
|
| 282 |
const digest = md5Hasher.hash(nameUrl);
|
| 283 |
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
url = `dataurl://digest:${digest}`;
|
| 287 |
}
|
| 288 |
|
| 289 |
-
const cache: PDFContent | undefined =
|
|
|
|
| 290 |
|
| 291 |
if (cache) {
|
| 292 |
const age = Date.now() - cache?.createdAt.valueOf();
|
|
@@ -324,13 +324,13 @@ export class PDFExtractor extends AsyncService {
|
|
| 324 |
let extracted;
|
| 325 |
|
| 326 |
try {
|
| 327 |
-
extracted = await this.extract(
|
| 328 |
} catch (err: any) {
|
| 329 |
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err, url, nameUrl });
|
| 330 |
throw new AssertionFailureError(`Unable to process ${nameUrl} as pdf: ${err?.message}`);
|
| 331 |
}
|
| 332 |
|
| 333 |
-
if (!this.asyncLocalContext.ctx.DNT) {
|
| 334 |
const theID = randomUUID();
|
| 335 |
await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
|
| 336 |
Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
|
|
|
|
| 274 |
return { meta: meta.info as Record<string, any>, content: mdChunks.join(''), text: rawChunks.join('') };
|
| 275 |
}
|
| 276 |
|
| 277 |
+
async cachedExtract(url: string, cacheTolerance: number = 1000 * 3600 * 24, alternativeUrl?: string) {
|
| 278 |
if (!url) {
|
| 279 |
return undefined;
|
| 280 |
}
|
| 281 |
+
let nameUrl = alternativeUrl || url;
|
| 282 |
const digest = md5Hasher.hash(nameUrl);
|
| 283 |
|
| 284 |
+
if (this.isDataUrl(url)) {
|
| 285 |
+
nameUrl = `blob://pdf:${digest}`;
|
|
|
|
| 286 |
}
|
| 287 |
|
| 288 |
+
const cache: PDFContent | undefined = nameUrl.startsWith('blob:') ? undefined :
|
| 289 |
+
(await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
|
| 290 |
|
| 291 |
if (cache) {
|
| 292 |
const age = Date.now() - cache?.createdAt.valueOf();
|
|
|
|
| 324 |
let extracted;
|
| 325 |
|
| 326 |
try {
|
| 327 |
+
extracted = await this.extract(url);
|
| 328 |
} catch (err: any) {
|
| 329 |
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err, url, nameUrl });
|
| 330 |
throw new AssertionFailureError(`Unable to process ${nameUrl} as pdf: ${err?.message}`);
|
| 331 |
}
|
| 332 |
|
| 333 |
+
if (!this.asyncLocalContext.ctx.DNT && !nameUrl.startsWith('blob:')) {
|
| 334 |
const theID = randomUUID();
|
| 335 |
await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
|
| 336 |
Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
|
src/utils/ip.ts
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { isIPv4, isIPv6 } from 'net';
|
| 2 |
+
|
| 3 |
+
/**
 * Converts a textual IP address into its raw network-order bytes.
 *
 * @param ip An IPv4 dotted-quad or an IPv6 address. IPv6 input may use `::`
 *           compression, an embedded dotted IPv4 tail (e.g. `::ffff:10.0.0.1`)
 *           and a trailing `%zone` identifier, which is ignored.
 * @returns A 4-byte buffer for IPv4, a 16-byte buffer for IPv6.
 * @throws Error if `ip` is neither a valid IPv4 nor a valid IPv6 address.
 */
export function parseIp(ip: string): Buffer {
    if (isIPv4(ip)) {
        return Buffer.from(ip.split('.').map(Number));
    }

    if (!isIPv6(ip)) {
        throw new Error('Invalid IP address');
    }

    // A zone identifier does not contribute to the address bytes. Strip it
    // first: it may itself contain '.' and must not be mistaken for an IPv4 tail.
    const zoneAt = ip.indexOf('%');
    const address = zoneAt === -1 ? ip : ip.slice(0, zoneAt);

    if (address.includes('.')) {
        const groups = address.split(':');
        const ipv4Part = groups.pop();
        if (!ipv4Part) throw new Error('Invalid IPv6 address');
        const ipv4Bytes = parseIp(ipv4Part);
        // The dotted tail covers the final 32 bits, i.e. TWO 16-bit groups.
        // Substituting a single placeholder group would shift the preceding
        // groups by 16 bits (so `::ffff:a.b.c.d` would lose its `ffff`) and
        // would make the fully written form fail validation.
        groups.push('0', '0');
        const ipv6Bytes = parseIp(groups.join(':'));
        ipv4Bytes.copy(ipv6Bytes, 12);

        return ipv6Bytes;
    }

    // Expand the `::` shorthand into the zero groups it stands for.
    let groups: string[];
    if (address.includes('::')) {
        const [head, tail] = address.split('::');
        const left = head ? head.split(':') : [];
        const right = tail ? tail.split(':') : [];
        const zeros = new Array<string>(8 - left.length - right.length).fill('0');
        groups = [...left, ...zeros, ...right];
    } else {
        groups = address.split(':');
    }

    const buf = Buffer.alloc(16);
    groups.forEach((group, index) => {
        buf.writeUInt16BE(parseInt(group, 16), index * 2);
    });

    return buf;
}
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
/**
 * Splits a CIDR string (`address/prefix`) into a network address and a netmask.
 *
 * Host bits of the address are cleared, so the returned network buffer is
 * already masked.
 *
 * Note: when the prefix is missing or not numeric, `parseInt` yields NaN,
 * every comparison below evaluates to false, and both returned buffers are
 * all zeroes.
 *
 * @param cidr Text such as `10.0.0.0/8` or `fc00::/7`.
 * @returns `[network, mask]`, each 4 bytes (IPv4) or 16 bytes (IPv6).
 */
export function parseCIDR(cidr: string): [Buffer, Buffer] {
    const [address, prefixText] = cidr.split('/');
    const network = parseIp(address);
    const netmask = Buffer.alloc(network.byteLength, 0xff);
    const prefixLength = parseInt(prefixText);

    for (let index = 0; index < network.byteLength; index++) {
        const bitsLeft = prefixLength - index * 8;

        // Byte lies entirely inside the prefix: keep address byte and 0xff mask.
        if (bitsLeft >= 8) {
            continue;
        }

        // Partially covered byte keeps its top `bitsLeft` bits; anything past
        // the prefix is cleared completely.
        const byteMask = bitsLeft > 0 ? (0xff << (8 - bitsLeft)) & 0xff : 0;
        netmask[index] = byteMask;
        network[index] = network[index] & byteMask;
    }

    return [network, netmask];
}
|
| 88 |
+
|
| 89 |
+
/**
 * A parsed CIDR block that can be matched against IP addresses.
 */
export class CIDR {
    /** Network address bytes with host bits cleared. */
    buff: Buffer;
    /** Netmask bytes, same length as `buff`. */
    mask: Buffer;
    /** The CIDR text this instance was built from. */
    text: string;

    /**
     * @param cidr CIDR text, e.g. `192.168.0.0/16`.
     */
    constructor(cidr: string) {
        const [network, netmask] = parseCIDR(cidr);
        this.text = cidr;
        this.buff = network;
        this.mask = netmask;
    }

    /** Returns the original CIDR text. */
    toString() {
        return this.text;
    }

    /** Address family of the block: 4 when the network is 4 bytes long, otherwise 6. */
    get family() {
        return this.buff.byteLength === 4 ? 4 : 6;
    }

    /**
     * Tells whether an address falls inside this block.
     *
     * @param ip Address as text or as already-parsed bytes.
     * @returns `true` when the masked address equals the network; `false`
     *          when it differs or when the byte lengths do not match.
     */
    test(ip: string | Buffer): boolean {
        const candidate = typeof ip === 'string' ? parseIp(ip) : ip;
        const width = this.buff.byteLength;

        if (candidate.byteLength !== width) {
            return false;
        }

        for (let index = 0; index < width; index++) {
            const maskByte = this.mask[index];

            // A zero mask byte means the prefix has ended; every byte before it matched.
            if (maskByte === 0) {
                return true;
            }

            if ((candidate[index] & maskByte) !== this.buff[index]) {
                return false;
            }
        }

        return true;
    }
}
|
| 130 |
+
|
| 131 |
+
// IPv4 ranges that must never be treated as public crawl targets.
const nonPublicNetworks4 = [
    // "This network"; 0.0.0.0 is commonly answered by the local host.
    '0.0.0.0/8',

    // Private-use networks.
    '10.0.0.0/8',
    '172.16.0.0/12',
    '192.168.0.0/16',

    // Loopback, limited broadcast, link-local, multicast.
    '127.0.0.0/8',
    '255.255.255.255/32',
    '169.254.0.0/16',
    '224.0.0.0/4',

    // Carrier-grade NAT shared address space and reserved space.
    '100.64.0.0/10',
    '240.0.0.0/4',
];
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
// IPv6 ranges that must never be treated as public crawl targets.
const nonPublicNetworks6 = [
    // Unique-local, link-local, multicast.
    'fc00::/7',
    'fe80::/10',
    'ff00::/8',

    // Loopback (::1). Not covered by the unspecified address or by the
    // IPv4-compatible loopback range below, so it needs its own entry.
    '::1/128',
    // IPv4-compatible form of 127.0.0.0/8.
    '::127.0.0.0/104',
    // Unspecified address.
    '::/128',
];
|
| 154 |
+
|
| 155 |
+
// Both families parsed once at module load; CIDR.test() rejects
// addresses whose byte length differs from the block's.
const nonPublicCIDRs = nonPublicNetworks4
    .concat(nonPublicNetworks6)
    .map((text) => new CIDR(text));
|
| 156 |
+
|
| 157 |
+
/**
 * Tells whether an IP address belongs to any of the non-public ranges.
 *
 * IPv4-mapped IPv6 addresses (`::ffff:a.b.c.d`, which URL parsing serialises
 * as e.g. `::ffff:a00:1`) are also checked through their embedded IPv4
 * address, so that wrapping a private IPv4 address in IPv6 notation does not
 * slip past the IPv4 ranges.
 *
 * @param ip IPv4 or IPv6 address text.
 * @returns `true` if the address (or its embedded IPv4 address) is in a
 *          non-public range, otherwise `false`.
 * @throws Error if `ip` is not a valid IP address (raised by `parseIp`).
 */
export function isIPInNonPublicRange(ip: string) {
    const parsed = parseIp(ip);
    const candidates = [parsed];

    // IPv4-mapped layout: 80 zero bits, 16 one bits, then the 32-bit IPv4 address.
    const isV4Mapped = parsed.byteLength === 16
        && parsed.subarray(0, 10).every((byte) => byte === 0)
        && parsed[10] === 0xff
        && parsed[11] === 0xff;
    if (isV4Mapped) {
        candidates.push(parsed.subarray(12));
    }

    return candidates.some(
        (candidate) => nonPublicCIDRs.some((cidr) => cidr.test(candidate))
    );
}
|