Spaces:
Build error
Build error
feat: extract text from pdf (#70)
Browse files
* feat: pdf
* fix
* fix
- backend/firebase.json +0 -3
- backend/functions/package-lock.json +24 -0
- backend/functions/package.json +2 -0
- backend/functions/src/cloud-functions/crawler.ts +180 -138
- backend/functions/src/cloud-functions/data-crunching.ts +289 -0
- backend/functions/src/cloud-functions/searcher.ts +8 -25
- backend/functions/src/db/pdf.ts +65 -0
- backend/functions/src/dto/scrapping-options.ts +110 -0
- backend/functions/src/services/alt-text.ts +0 -1
- backend/functions/src/services/pdf-extract.ts +298 -0
- backend/functions/src/services/puppeteer.ts +37 -21
- backend/functions/tsconfig.json +2 -1
- thinapps-shared +1 -1
backend/firebase.json
CHANGED
|
@@ -33,9 +33,6 @@
|
|
| 33 |
"functions": {
|
| 34 |
"port": 5001
|
| 35 |
},
|
| 36 |
-
"auth": {
|
| 37 |
-
"port": 9099
|
| 38 |
-
},
|
| 39 |
"firestore": {
|
| 40 |
"port": 9098
|
| 41 |
},
|
|
|
|
| 33 |
"functions": {
|
| 34 |
"port": 5001
|
| 35 |
},
|
|
|
|
|
|
|
|
|
|
| 36 |
"firestore": {
|
| 37 |
"port": 9098
|
| 38 |
},
|
backend/functions/package-lock.json
CHANGED
|
@@ -15,6 +15,7 @@
|
|
| 15 |
"axios": "^1.3.3",
|
| 16 |
"bcrypt": "^5.1.0",
|
| 17 |
"civkit": "^0.6.5-047c0d8",
|
|
|
|
| 18 |
"cors": "^2.8.5",
|
| 19 |
"dayjs": "^1.11.9",
|
| 20 |
"express": "^4.19.2",
|
|
@@ -27,6 +28,7 @@
|
|
| 27 |
"maxmind": "^4.3.18",
|
| 28 |
"minio": "^7.1.3",
|
| 29 |
"openai": "^4.20.0",
|
|
|
|
| 30 |
"puppeteer": "^22.7.1",
|
| 31 |
"puppeteer-extra": "^3.3.6",
|
| 32 |
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
|
@@ -3923,6 +3925,16 @@
|
|
| 3923 |
"integrity": "sha512-3DdaFaU/Zf1AnpLiFDeNCD4TOWe3Zl2RZaTzUvWiIk5ERzcCodOE20Vqq4fzCbNoHURFHT4/us/Lfq+S2zyY4w==",
|
| 3924 |
"optional": true
|
| 3925 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3926 |
"node_modules/core-util-is": {
|
| 3927 |
"version": "1.0.3",
|
| 3928 |
"resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz",
|
|
@@ -9208,6 +9220,18 @@
|
|
| 9208 |
"node": ">=8"
|
| 9209 |
}
|
| 9210 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9211 |
"node_modules/pend": {
|
| 9212 |
"version": "1.2.0",
|
| 9213 |
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
|
|
|
|
| 15 |
"axios": "^1.3.3",
|
| 16 |
"bcrypt": "^5.1.0",
|
| 17 |
"civkit": "^0.6.5-047c0d8",
|
| 18 |
+
"core-js": "^3.37.1",
|
| 19 |
"cors": "^2.8.5",
|
| 20 |
"dayjs": "^1.11.9",
|
| 21 |
"express": "^4.19.2",
|
|
|
|
| 28 |
"maxmind": "^4.3.18",
|
| 29 |
"minio": "^7.1.3",
|
| 30 |
"openai": "^4.20.0",
|
| 31 |
+
"pdfjs-dist": "^4.2.67",
|
| 32 |
"puppeteer": "^22.7.1",
|
| 33 |
"puppeteer-extra": "^3.3.6",
|
| 34 |
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
|
|
|
| 3925 |
"integrity": "sha512-3DdaFaU/Zf1AnpLiFDeNCD4TOWe3Zl2RZaTzUvWiIk5ERzcCodOE20Vqq4fzCbNoHURFHT4/us/Lfq+S2zyY4w==",
|
| 3926 |
"optional": true
|
| 3927 |
},
|
| 3928 |
+
"node_modules/core-js": {
|
| 3929 |
+
"version": "3.37.1",
|
| 3930 |
+
"resolved": "https://registry.npmjs.org/core-js/-/core-js-3.37.1.tgz",
|
| 3931 |
+
"integrity": "sha512-Xn6qmxrQZyB0FFY8E3bgRXei3lWDJHhvI+u0q9TKIYM49G8pAr0FgnnrFRAmsbptZL1yxRADVXn+x5AGsbBfyw==",
|
| 3932 |
+
"hasInstallScript": true,
|
| 3933 |
+
"funding": {
|
| 3934 |
+
"type": "opencollective",
|
| 3935 |
+
"url": "https://opencollective.com/core-js"
|
| 3936 |
+
}
|
| 3937 |
+
},
|
| 3938 |
"node_modules/core-util-is": {
|
| 3939 |
"version": "1.0.3",
|
| 3940 |
"resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz",
|
|
|
|
| 9220 |
"node": ">=8"
|
| 9221 |
}
|
| 9222 |
},
|
| 9223 |
+
"node_modules/pdfjs-dist": {
|
| 9224 |
+
"version": "4.2.67",
|
| 9225 |
+
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-4.2.67.tgz",
|
| 9226 |
+
"integrity": "sha512-rJmuBDFpD7cqC8WIkQUEClyB4UAH05K4AsyewToMTp2gSy3Rrx8c1ydAVqlJlGv3yZSOrhEERQU/4ScQQFlLHA==",
|
| 9227 |
+
"engines": {
|
| 9228 |
+
"node": ">=18"
|
| 9229 |
+
},
|
| 9230 |
+
"optionalDependencies": {
|
| 9231 |
+
"canvas": "^2.11.2",
|
| 9232 |
+
"path2d": "^0.2.0"
|
| 9233 |
+
}
|
| 9234 |
+
},
|
| 9235 |
"node_modules/pend": {
|
| 9236 |
"version": "1.2.0",
|
| 9237 |
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
|
backend/functions/package.json
CHANGED
|
@@ -35,6 +35,7 @@
|
|
| 35 |
"axios": "^1.3.3",
|
| 36 |
"bcrypt": "^5.1.0",
|
| 37 |
"civkit": "^0.6.5-047c0d8",
|
|
|
|
| 38 |
"cors": "^2.8.5",
|
| 39 |
"dayjs": "^1.11.9",
|
| 40 |
"express": "^4.19.2",
|
|
@@ -47,6 +48,7 @@
|
|
| 47 |
"maxmind": "^4.3.18",
|
| 48 |
"minio": "^7.1.3",
|
| 49 |
"openai": "^4.20.0",
|
|
|
|
| 50 |
"puppeteer": "^22.7.1",
|
| 51 |
"puppeteer-extra": "^3.3.6",
|
| 52 |
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
|
|
|
| 35 |
"axios": "^1.3.3",
|
| 36 |
"bcrypt": "^5.1.0",
|
| 37 |
"civkit": "^0.6.5-047c0d8",
|
| 38 |
+
"core-js": "^3.37.1",
|
| 39 |
"cors": "^2.8.5",
|
| 40 |
"dayjs": "^1.11.9",
|
| 41 |
"express": "^4.19.2",
|
|
|
|
| 48 |
"maxmind": "^4.3.18",
|
| 49 |
"minio": "^7.1.3",
|
| 50 |
"openai": "^4.20.0",
|
| 51 |
+
"pdfjs-dist": "^4.2.67",
|
| 52 |
"puppeteer": "^22.7.1",
|
| 53 |
"puppeteer-extra": "^3.3.6",
|
| 54 |
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -10,18 +10,18 @@ import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
|
| 10 |
import _ from 'lodash';
|
| 11 |
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
| 12 |
import { Request, Response } from 'express';
|
| 13 |
-
|
| 14 |
import { AltTextService } from '../services/alt-text';
|
| 15 |
import TurndownService from 'turndown';
|
| 16 |
-
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
| 17 |
-
import type { CookieParam } from 'puppeteer';
|
| 18 |
import { Crawled } from '../db/crawled';
|
| 19 |
import { cleanAttribute } from '../utils/misc';
|
| 20 |
import { randomUUID } from 'crypto';
|
| 21 |
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
| 22 |
|
| 23 |
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
|
|
|
| 24 |
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
|
|
|
| 25 |
|
| 26 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 27 |
|
|
@@ -69,6 +69,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 69 |
protected globalLogger: Logger,
|
| 70 |
protected puppeteerControl: PuppeteerControl,
|
| 71 |
protected altTextService: AltTextService,
|
|
|
|
| 72 |
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
| 73 |
protected rateLimitControl: RateLimitControl,
|
| 74 |
protected threadLocal: AsyncContext,
|
|
@@ -76,7 +77,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 76 |
super(...arguments);
|
| 77 |
|
| 78 |
puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ScrappingOptions & { url: URL; }) => {
|
| 79 |
-
if (!snapshot.title?.trim()) {
|
| 80 |
return;
|
| 81 |
}
|
| 82 |
if (options.cookies?.length) {
|
|
@@ -138,14 +139,14 @@ export class CrawlerHost extends RPCHost {
|
|
| 138 |
});
|
| 139 |
turnDownService.addRule('improved-inline-link', {
|
| 140 |
filter: function (node, options) {
|
| 141 |
-
return (
|
| 142 |
options.linkStyle === 'inlined' &&
|
| 143 |
node.nodeName === 'A' &&
|
| 144 |
node.getAttribute('href')
|
| 145 |
);
|
| 146 |
},
|
| 147 |
|
| 148 |
-
replacement: function (content, node) {
|
| 149 |
let href = node.getAttribute('href');
|
| 150 |
if (href) href = href.replace(/([()])/g, '\\$1');
|
| 151 |
let title = cleanAttribute(node.getAttribute('title'));
|
|
@@ -226,6 +227,26 @@ export class CrawlerHost extends RPCHost {
|
|
| 226 |
}
|
| 227 |
} as FormattedPage;
|
| 228 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
if (mode === 'text') {
|
| 230 |
return {
|
| 231 |
...this.getGeneralSnapshotMixins(snapshot),
|
|
@@ -236,101 +257,108 @@ export class CrawlerHost extends RPCHost {
|
|
| 236 |
} as FormattedPage;
|
| 237 |
}
|
| 238 |
|
| 239 |
-
|
| 240 |
-
let turnDownService = mode === 'markdown' ? this.getTurndown() : this.getTurndown('without any rule');
|
| 241 |
-
for (const plugin of this.turnDownPlugins) {
|
| 242 |
-
turnDownService = turnDownService.use(plugin);
|
| 243 |
-
}
|
| 244 |
-
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
| 245 |
-
if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
|
| 246 |
-
const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
|
| 247 |
-
const r = await this.altTextService.getAltText(x).catch((err: any) => {
|
| 248 |
-
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
| 249 |
-
return undefined;
|
| 250 |
-
});
|
| 251 |
-
if (r && x.src) {
|
| 252 |
-
urlToAltMap[x.src.trim()] = r;
|
| 253 |
-
}
|
| 254 |
-
});
|
| 255 |
-
|
| 256 |
-
await Promise.all(tasks);
|
| 257 |
-
}
|
| 258 |
-
let imgIdx = 0;
|
| 259 |
const imageSummary = {} as { [k: string]: string; };
|
| 260 |
const imageIdxTrack = new Map<string, number[]>();
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
}
|
| 270 |
-
}
|
| 271 |
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
idxArr.push(imgSerial);
|
| 286 |
-
imageIdxTrack.set(src, idxArr);
|
| 287 |
|
| 288 |
-
|
| 289 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
|
| 291 |
-
|
| 292 |
-
|
| 293 |
|
| 294 |
-
|
|
|
|
| 295 |
|
| 296 |
-
|
| 297 |
-
}
|
| 298 |
-
});
|
| 299 |
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 306 |
-
const vanillaTurnDownService = this.getTurndown();
|
| 307 |
try {
|
| 308 |
-
contentText =
|
| 309 |
-
} catch (
|
| 310 |
-
this.logger.warn(`Turndown failed to run,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
}
|
| 312 |
}
|
| 313 |
-
}
|
| 314 |
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
try {
|
| 320 |
-
contentText = turnDownService.turndown(snapshot.html);
|
| 321 |
-
} catch (err) {
|
| 322 |
-
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 323 |
-
const vanillaTurnDownService = this.getTurndown();
|
| 324 |
try {
|
| 325 |
-
contentText =
|
| 326 |
-
} catch (
|
| 327 |
-
this.logger.warn(`Turndown failed to run,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
}
|
| 329 |
}
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
}
|
| 334 |
|
| 335 |
const cleanText = (contentText || '').trim();
|
| 336 |
|
|
@@ -514,7 +542,8 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 514 |
req: Request,
|
| 515 |
res: Response,
|
| 516 |
},
|
| 517 |
-
auth: JinaEmbeddingsAuthDTO
|
|
|
|
| 518 |
) {
|
| 519 |
const uid = await auth.solveUID();
|
| 520 |
let chargeAmount = 0;
|
|
@@ -571,6 +600,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 571 |
}
|
| 572 |
|
| 573 |
let urlToCrawl;
|
|
|
|
| 574 |
try {
|
| 575 |
urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false }));
|
| 576 |
} catch (err) {
|
|
@@ -586,58 +616,19 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 586 |
});
|
| 587 |
}
|
| 588 |
|
| 589 |
-
const
|
| 590 |
-
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
|
| 591 |
-
const withLinksSummary = Boolean(ctx.req.get('x-with-links-summary'));
|
| 592 |
-
const withImagesSummary = Boolean(ctx.req.get('x-with-images-summary'));
|
| 593 |
-
const noCache = Boolean(ctx.req.get('x-no-cache'));
|
| 594 |
-
let cacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000;
|
| 595 |
-
if (isNaN(cacheTolerance)) {
|
| 596 |
-
cacheTolerance = this.cacheValidMs;
|
| 597 |
-
if (noCache) {
|
| 598 |
-
cacheTolerance = 0;
|
| 599 |
-
}
|
| 600 |
-
}
|
| 601 |
-
const targetSelector = ctx.req.get('x-target-selector') || undefined;
|
| 602 |
-
const waitForSelector = ctx.req.get('x-wait-for-selector') || targetSelector;
|
| 603 |
-
const cookies: CookieParam[] = [];
|
| 604 |
-
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
| 605 |
-
if (Array.isArray(setCookieHeaders)) {
|
| 606 |
-
for (const setCookie of setCookieHeaders) {
|
| 607 |
-
cookies.push({
|
| 608 |
-
...parseSetCookieString(setCookie, { decodeValues: false }) as CookieParam,
|
| 609 |
-
domain: urlToCrawl.hostname,
|
| 610 |
-
});
|
| 611 |
-
}
|
| 612 |
-
} else if (setCookieHeaders) {
|
| 613 |
-
cookies.push({
|
| 614 |
-
...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
|
| 615 |
-
domain: urlToCrawl.hostname,
|
| 616 |
-
});
|
| 617 |
-
}
|
| 618 |
-
this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
|
| 619 |
-
this.threadLocal.set('withLinksSummary', withLinksSummary);
|
| 620 |
-
this.threadLocal.set('withImagesSummary', withImagesSummary);
|
| 621 |
-
|
| 622 |
-
const crawlOpts: ExtraScrappingOptions = {
|
| 623 |
-
proxyUrl: ctx.req.get('x-proxy-url'),
|
| 624 |
-
cookies,
|
| 625 |
-
favorScreenshot: customMode === 'screenshot',
|
| 626 |
-
waitForSelector,
|
| 627 |
-
targetSelector,
|
| 628 |
-
};
|
| 629 |
|
| 630 |
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
| 631 |
const sseStream = new OutputServerEventStream();
|
| 632 |
rpcReflect.return(sseStream);
|
| 633 |
|
| 634 |
try {
|
| 635 |
-
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
|
| 636 |
if (!scrapped) {
|
| 637 |
continue;
|
| 638 |
}
|
| 639 |
|
| 640 |
-
const formatted = await this.formatSnapshot(
|
| 641 |
chargeAmount = this.getChargeAmount(formatted);
|
| 642 |
sseStream.write({
|
| 643 |
event: 'data',
|
|
@@ -659,13 +650,13 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 659 |
|
| 660 |
let lastScrapped;
|
| 661 |
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
| 662 |
-
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
|
| 663 |
lastScrapped = scrapped;
|
| 664 |
-
if (waitForSelector || !scrapped?.parsed?.content || !
|
| 665 |
continue;
|
| 666 |
}
|
| 667 |
|
| 668 |
-
const formatted = await this.formatSnapshot(
|
| 669 |
chargeAmount = this.getChargeAmount(formatted);
|
| 670 |
|
| 671 |
return formatted;
|
|
@@ -675,21 +666,21 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 675 |
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
| 676 |
}
|
| 677 |
|
| 678 |
-
const formatted = await this.formatSnapshot(
|
| 679 |
chargeAmount = this.getChargeAmount(formatted);
|
| 680 |
|
| 681 |
return formatted;
|
| 682 |
}
|
| 683 |
|
| 684 |
-
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
|
| 685 |
lastScrapped = scrapped;
|
| 686 |
-
if (waitForSelector || !scrapped?.parsed?.content || !
|
| 687 |
continue;
|
| 688 |
}
|
| 689 |
|
| 690 |
-
const formatted = await this.formatSnapshot(
|
| 691 |
chargeAmount = this.getChargeAmount(formatted);
|
| 692 |
-
if (
|
| 693 |
|
| 694 |
return assignTransferProtocolMeta(`${formatted}`,
|
| 695 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
|
@@ -703,9 +694,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 703 |
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
| 704 |
}
|
| 705 |
|
| 706 |
-
const formatted = await this.formatSnapshot(
|
| 707 |
chargeAmount = this.getChargeAmount(formatted);
|
| 708 |
-
if (
|
| 709 |
|
| 710 |
return assignTransferProtocolMeta(`${formatted}`,
|
| 711 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
|
@@ -915,4 +906,55 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 915 |
}
|
| 916 |
}
|
| 917 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 918 |
}
|
|
|
|
| 10 |
import _ from 'lodash';
|
| 11 |
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
| 12 |
import { Request, Response } from 'express';
|
| 13 |
+
const pNormalizeUrl = import("@esm2cjs/normalize-url");
|
| 14 |
import { AltTextService } from '../services/alt-text';
|
| 15 |
import TurndownService from 'turndown';
|
|
|
|
|
|
|
| 16 |
import { Crawled } from '../db/crawled';
|
| 17 |
import { cleanAttribute } from '../utils/misc';
|
| 18 |
import { randomUUID } from 'crypto';
|
| 19 |
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
| 20 |
|
| 21 |
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
| 22 |
+
import { CrawlerOptions } from '../dto/scrapping-options';
|
| 23 |
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
| 24 |
+
import { PDFExtractor } from '../services/pdf-extract';
|
| 25 |
|
| 26 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 27 |
|
|
|
|
| 69 |
protected globalLogger: Logger,
|
| 70 |
protected puppeteerControl: PuppeteerControl,
|
| 71 |
protected altTextService: AltTextService,
|
| 72 |
+
protected pdfExtractor: PDFExtractor,
|
| 73 |
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
| 74 |
protected rateLimitControl: RateLimitControl,
|
| 75 |
protected threadLocal: AsyncContext,
|
|
|
|
| 77 |
super(...arguments);
|
| 78 |
|
| 79 |
puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ScrappingOptions & { url: URL; }) => {
|
| 80 |
+
if (!snapshot.title?.trim() && !snapshot.pdfs?.length) {
|
| 81 |
return;
|
| 82 |
}
|
| 83 |
if (options.cookies?.length) {
|
|
|
|
| 139 |
});
|
| 140 |
turnDownService.addRule('improved-inline-link', {
|
| 141 |
filter: function (node, options) {
|
| 142 |
+
return Boolean(
|
| 143 |
options.linkStyle === 'inlined' &&
|
| 144 |
node.nodeName === 'A' &&
|
| 145 |
node.getAttribute('href')
|
| 146 |
);
|
| 147 |
},
|
| 148 |
|
| 149 |
+
replacement: function (content, node: any) {
|
| 150 |
let href = node.getAttribute('href');
|
| 151 |
if (href) href = href.replace(/([()])/g, '\\$1');
|
| 152 |
let title = cleanAttribute(node.getAttribute('title'));
|
|
|
|
| 227 |
}
|
| 228 |
} as FormattedPage;
|
| 229 |
}
|
| 230 |
+
|
| 231 |
+
let pdfMode = false;
|
| 232 |
+
if (snapshot.pdfs?.length && !snapshot.title) {
|
| 233 |
+
const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0]);
|
| 234 |
+
if (pdf) {
|
| 235 |
+
pdfMode = true;
|
| 236 |
+
snapshot.title = pdf.meta?.Title;
|
| 237 |
+
snapshot.text = pdf.text || snapshot.text;
|
| 238 |
+
snapshot.parsed = {
|
| 239 |
+
content: pdf.content,
|
| 240 |
+
textContent: pdf.content,
|
| 241 |
+
length: pdf.content?.length,
|
| 242 |
+
byline: pdf.meta?.Author,
|
| 243 |
+
lang: pdf.meta?.Language || undefined,
|
| 244 |
+
title: pdf.meta?.Title,
|
| 245 |
+
publishedTime: this.pdfExtractor.parsePdfDate(pdf.meta?.ModDate || pdf.meta?.CreationDate)?.toISOString(),
|
| 246 |
+
};
|
| 247 |
+
}
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
if (mode === 'text') {
|
| 251 |
return {
|
| 252 |
...this.getGeneralSnapshotMixins(snapshot),
|
|
|
|
| 257 |
} as FormattedPage;
|
| 258 |
}
|
| 259 |
|
| 260 |
+
let contentText = '';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
const imageSummary = {} as { [k: string]: string; };
|
| 262 |
const imageIdxTrack = new Map<string, number[]>();
|
| 263 |
+
do {
|
| 264 |
+
if (pdfMode) {
|
| 265 |
+
contentText = snapshot.parsed?.content || snapshot.text;
|
| 266 |
+
break;
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content;
|
| 270 |
+
let turnDownService = mode === 'markdown' ? this.getTurndown() : this.getTurndown('without any rule');
|
| 271 |
+
for (const plugin of this.turnDownPlugins) {
|
| 272 |
+
turnDownService = turnDownService.use(plugin);
|
| 273 |
+
}
|
| 274 |
+
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
| 275 |
+
if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
|
| 276 |
+
const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
|
| 277 |
+
const r = await this.altTextService.getAltText(x).catch((err: any) => {
|
| 278 |
+
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
| 279 |
+
return undefined;
|
| 280 |
+
});
|
| 281 |
+
if (r && x.src) {
|
| 282 |
+
urlToAltMap[x.src.trim()] = r;
|
| 283 |
}
|
| 284 |
+
});
|
| 285 |
|
| 286 |
+
await Promise.all(tasks);
|
| 287 |
+
}
|
| 288 |
+
let imgIdx = 0;
|
| 289 |
+
turnDownService.addRule('img-generated-alt', {
|
| 290 |
+
filter: 'img',
|
| 291 |
+
replacement: (_content, node: any) => {
|
| 292 |
+
let linkPreferredSrc = (node.getAttribute('src') || '').trim();
|
| 293 |
+
if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
|
| 294 |
+
const dataSrc = (node.getAttribute('data-src') || '').trim();
|
| 295 |
+
if (dataSrc && !dataSrc.startsWith('data:')) {
|
| 296 |
+
linkPreferredSrc = dataSrc;
|
| 297 |
+
}
|
| 298 |
+
}
|
|
|
|
|
|
|
| 299 |
|
| 300 |
+
let src;
|
| 301 |
+
try {
|
| 302 |
+
src = new URL(linkPreferredSrc, nominalUrl).toString();
|
| 303 |
+
} catch (_err) {
|
| 304 |
+
void 0;
|
| 305 |
+
}
|
| 306 |
+
const alt = cleanAttribute(node.getAttribute('alt'));
|
| 307 |
+
if (!src) {
|
| 308 |
+
return '';
|
| 309 |
+
}
|
| 310 |
+
const mapped = urlToAltMap[src];
|
| 311 |
+
const imgSerial = ++imgIdx;
|
| 312 |
+
const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
|
| 313 |
+
idxArr.push(imgSerial);
|
| 314 |
+
imageIdxTrack.set(src, idxArr);
|
| 315 |
|
| 316 |
+
if (mapped) {
|
| 317 |
+
imageSummary[src] = mapped || alt;
|
| 318 |
|
| 319 |
+
return ``;
|
| 320 |
+
}
|
| 321 |
|
| 322 |
+
imageSummary[src] = alt || '';
|
|
|
|
|
|
|
| 323 |
|
| 324 |
+
return alt ? `` : ``;
|
| 325 |
+
}
|
| 326 |
+
});
|
| 327 |
+
|
| 328 |
+
if (toBeTurnedToMd) {
|
|
|
|
|
|
|
| 329 |
try {
|
| 330 |
+
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
| 331 |
+
} catch (err) {
|
| 332 |
+
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 333 |
+
const vanillaTurnDownService = this.getTurndown();
|
| 334 |
+
try {
|
| 335 |
+
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
|
| 336 |
+
} catch (err2) {
|
| 337 |
+
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
| 338 |
+
}
|
| 339 |
}
|
| 340 |
}
|
|
|
|
| 341 |
|
| 342 |
+
if (
|
| 343 |
+
!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
|
| 344 |
+
&& toBeTurnedToMd !== snapshot.html
|
| 345 |
+
) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
try {
|
| 347 |
+
contentText = turnDownService.turndown(snapshot.html);
|
| 348 |
+
} catch (err) {
|
| 349 |
+
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 350 |
+
const vanillaTurnDownService = this.getTurndown();
|
| 351 |
+
try {
|
| 352 |
+
contentText = vanillaTurnDownService.turndown(snapshot.html);
|
| 353 |
+
} catch (err2) {
|
| 354 |
+
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
| 355 |
+
}
|
| 356 |
}
|
| 357 |
}
|
| 358 |
+
if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) {
|
| 359 |
+
contentText = snapshot.text;
|
| 360 |
+
}
|
| 361 |
+
} while (false);
|
| 362 |
|
| 363 |
const cleanText = (contentText || '').trim();
|
| 364 |
|
|
|
|
| 542 |
req: Request,
|
| 543 |
res: Response,
|
| 544 |
},
|
| 545 |
+
auth: JinaEmbeddingsAuthDTO,
|
| 546 |
+
crawlerOptions: CrawlerOptions,
|
| 547 |
) {
|
| 548 |
const uid = await auth.solveUID();
|
| 549 |
let chargeAmount = 0;
|
|
|
|
| 600 |
}
|
| 601 |
|
| 602 |
let urlToCrawl;
|
| 603 |
+
const normalizeUrl = (await pNormalizeUrl).default;
|
| 604 |
try {
|
| 605 |
urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false }));
|
| 606 |
} catch (err) {
|
|
|
|
| 616 |
});
|
| 617 |
}
|
| 618 |
|
| 619 |
+
const crawlOpts = this.configure(crawlerOptions);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 620 |
|
| 621 |
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
| 622 |
const sseStream = new OutputServerEventStream();
|
| 623 |
rpcReflect.return(sseStream);
|
| 624 |
|
| 625 |
try {
|
| 626 |
+
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
|
| 627 |
if (!scrapped) {
|
| 628 |
continue;
|
| 629 |
}
|
| 630 |
|
| 631 |
+
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, urlToCrawl);
|
| 632 |
chargeAmount = this.getChargeAmount(formatted);
|
| 633 |
sseStream.write({
|
| 634 |
event: 'data',
|
|
|
|
| 650 |
|
| 651 |
let lastScrapped;
|
| 652 |
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
| 653 |
+
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
|
| 654 |
lastScrapped = scrapped;
|
| 655 |
+
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
| 656 |
continue;
|
| 657 |
}
|
| 658 |
|
| 659 |
+
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, urlToCrawl);
|
| 660 |
chargeAmount = this.getChargeAmount(formatted);
|
| 661 |
|
| 662 |
return formatted;
|
|
|
|
| 666 |
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
| 667 |
}
|
| 668 |
|
| 669 |
+
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, urlToCrawl);
|
| 670 |
chargeAmount = this.getChargeAmount(formatted);
|
| 671 |
|
| 672 |
return formatted;
|
| 673 |
}
|
| 674 |
|
| 675 |
+
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
|
| 676 |
lastScrapped = scrapped;
|
| 677 |
+
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
| 678 |
continue;
|
| 679 |
}
|
| 680 |
|
| 681 |
+
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, urlToCrawl);
|
| 682 |
chargeAmount = this.getChargeAmount(formatted);
|
| 683 |
+
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 684 |
|
| 685 |
return assignTransferProtocolMeta(`${formatted}`,
|
| 686 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
|
|
|
| 694 |
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
| 695 |
}
|
| 696 |
|
| 697 |
+
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, urlToCrawl);
|
| 698 |
chargeAmount = this.getChargeAmount(formatted);
|
| 699 |
+
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 700 |
|
| 701 |
return assignTransferProtocolMeta(`${formatted}`,
|
| 702 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
|
|
|
| 906 |
}
|
| 907 |
}
|
| 908 |
|
| 909 |
+
/**
 * Derive the scrapping options for one request from its crawler options.
 *
 * Side effect: copies the `withGeneratedAlt`, `withLinksSummary` and
 * `withImagesSummary` flags into `this.threadLocal` before building the result.
 *
 * @param opts crawler options for the current request
 * @returns the `ExtraScrappingOptions` built from `opts`
 */
configure(opts: CrawlerOptions) {
    // Publish the formatting flags to the async context, in the same order as before.
    for (const flag of ['withGeneratedAlt', 'withLinksSummary', 'withImagesSummary'] as const) {
        this.threadLocal.set(flag, opts[flag]);
    }

    const { proxyUrl, setCookies, respondWith, waitForSelector, targetSelector } = opts;
    const scrappingOptions: ExtraScrappingOptions = {
        proxyUrl,
        cookies: setCookies,
        // A screenshot response is the only mode that asks the scraper to favor screenshots.
        favorScreenshot: respondWith === 'screenshot',
        waitForSelector,
        targetSelector,
    };

    return scrappingOptions;
}
|
| 925 |
+
|
| 926 |
+
/**
 * Crawl `url` and format the most recent snapshot obtained.
 *
 * Snapshots are consumed from `this.cachedScrap` (called with
 * `minIntervalMs: 500`). Once a snapshot carries `parsed.content`, exactly one
 * further snapshot is awaited and then iteration stops. If iteration throws
 * after at least one snapshot was received, that snapshot is formatted and
 * returned instead of propagating the error.
 *
 * @param mode formatting mode forwarded to `formatSnapshot`
 * @param url the URL to crawl
 * @param opts optional scrapping options, spread into the `cachedScrap` call
 * @returns the result of `formatSnapshot` for the last snapshot seen
 * @throws AssertionFailureError when iteration ends without any snapshot
 * @throws the iteration error when it fails before any snapshot was received
 */
async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
    const snapshots = this.cachedScrap(url, { ...opts, minIntervalMs: 500 });

    let latest;
    let contentSeen = false;
    try {
        for await (const snapshot of snapshots) {
            latest = snapshot;

            // The previous snapshot already had content; this one is the follow-up we waited for.
            if (contentSeen) {
                break;
            }

            contentSeen = Boolean(snapshot?.parsed?.content);
        }
    } catch (err) {
        // Nothing to fall back on: surface the failure as-is.
        if (!latest) {
            throw err;
        }

        return this.formatSnapshot(mode, latest, url);
    }

    if (latest) {
        return this.formatSnapshot(mode, latest, url);
    }

    throw new AssertionFailureError(`No content available`);
}
|
| 959 |
+
|
| 960 |
}
|
backend/functions/src/cloud-functions/data-crunching.ts
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import {
|
| 2 |
+
Defer,
|
| 3 |
+
PromiseThrottle,
|
| 4 |
+
RPCHost,
|
| 5 |
+
RPCReflection,
|
| 6 |
+
} from 'civkit';
|
| 7 |
+
import { singleton } from 'tsyringe';
|
| 8 |
+
import { CloudScheduleV2, CloudTaskV2, FirebaseStorageBucketControl, Logger, OutputServerEventStream, Param, RPCReflect, TempFileManager } from '../shared';
|
| 9 |
+
import _ from 'lodash';
|
| 10 |
+
import { CrawlerHost } from './crawler';
|
| 11 |
+
|
| 12 |
+
import { Crawled } from '../db/crawled';
|
| 13 |
+
import dayjs from 'dayjs';
|
| 14 |
+
import { createReadStream } from 'fs';
|
| 15 |
+
import { appendFile } from 'fs/promises';
|
| 16 |
+
import { createGzip } from 'zlib';
|
| 17 |
+
import { getFunctions } from 'firebase-admin/functions';
|
| 18 |
+
import { GoogleAuth } from 'google-auth-library';
|
| 19 |
+
|
| 20 |
+
dayjs.extend(require('dayjs/plugin/utc'));
|
| 21 |
+
|
| 22 |
+
/**
 * Get the URL of a given v2 cloud function.
 *
 * Asks the Cloud Functions v2beta REST API for the function resource and
 * returns the `serviceConfig.uri` field of the response.
 *
 * @param name the function's name
 * @param location the function's location
 * @param projectId the Google Cloud project owning the function
 * @returns The URL of the function
 * @throws Error when the response carries no `serviceConfig.uri`
 */
async function getFunctionUrl(name: string, location = "us-central1", projectId = 'reader-6b7dc') {
    const url = "https://cloudfunctions.googleapis.com/v2beta/" +
        `projects/${projectId}/locations/${location}/functions/${name}`;
    const auth = new GoogleAuth({
        scopes: 'https://www.googleapis.com/auth/cloud-platform',
    });
    const client = await auth.getClient();
    const res = await client.request<any>({ url });
    const uri = res.data?.serviceConfig?.uri;
    if (!uri) {
        throw new Error(`Unable to retrieve uri for function at ${url}`);
    }
    return uri;
}
|
| 44 |
+
|
| 45 |
+
/**
 * RPC host that exports crawled page snapshots as gzipped JSONL files into
 * the Firebase storage bucket.
 *
 * Work is split per UTC day into chunks of `pageCacheCrunchingBatchSize`
 * records. A scheduled dispatcher enqueues one worker task per chunk whose
 * output file does not exist yet.
 */
@singleton()
export class DataCrunchingHost extends RPCHost {
    logger = this.globalLogger.child({ service: this.constructor.name });

    /** Bucket path prefix of every produced file. */
    pageCacheCrunchingPrefix = 'crunched-pages';
    /** Number of records per output file / per query page. */
    pageCacheCrunchingBatchSize = 5000;
    /** How far back from now (in ms) the default iteration starts: 6 days. */
    pageCacheCrunchingTMinus = 6 * 24 * 60 * 60 * 1000;
    /** Revision number embedded in the output path (`r<rev>`). */
    rev = 7;

    constructor(
        protected globalLogger: Logger,

        protected crawler: CrawlerHost,
        protected tempFileManager: TempFileManager,
        protected firebaseObjectStorage: FirebaseStorageBucketControl,
    ) {
        // Every constructor argument except `crawler` is forwarded to the base class.
        super(..._.without(arguments, crawler));
    }

    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Cloud task: crunch the chunks selected by `date` / `offset` and upload
     * each one gzipped to the bucket under the chunk's file name.
     *
     * @param date day to crunch (parsed by dayjs, truncated to start of UTC day)
     * @param offset record offset within that day
     * @returns `true` once all selected chunks are uploaded
     */
    @CloudTaskV2({
        runtime: {
            cpu: 2,
            memory: '4GiB',
            timeoutSeconds: 3600,
            concurrency: 2,
            maxInstances: 200,
            retryConfig: {
                maxAttempts: 3,
                minBackoffSeconds: 60,
            },
            rateLimits: {
                maxConcurrentDispatches: 150,
                maxDispatchesPerSecond: 2,
            },
        },
        tags: ['DataCrunching'],
    })
    async crunchPageCacheWorker(
        @Param('date') date: string,
        @Param('offset', { default: 0 }) offset: number
    ) {
        this.logger.info(`Crunching page cache @${date}+${offset}...`);
        for await (const { fileName, records } of this.iterPageCacheRecords(date, offset)) {
            this.logger.info(`Crunching ${fileName}...`);
            const fileOnDrive = await this.crunchCacheRecords(records);
            // Stream the local JSONL file through gzip straight into the bucket.
            const fstream = createReadStream(fileOnDrive.path);
            const gzipStream = createGzip();
            fstream.pipe(gzipStream, { end: true });
            await this.firebaseObjectStorage.bucket.file(fileName).save(gzipStream, {
                contentType: 'application/jsonl+gzip',
            });
        }

        this.logger.info(`Crunching page cache @${date}+${offset} done.`);

        return true;
    }

    /**
     * Scheduled entry point (daily at 00:02 UTC): enqueue one
     * `crunchPageCacheWorker` task per chunk that still needs crunching, and
     * report progress on a server-sent event stream.
     */
    @CloudScheduleV2('2 0 * * *', {
        name: 'crunchPageCacheEveryday',
        runtime: {
            cpu: 2,
            memory: '4GiB',
            timeoutSeconds: 1800,
            timeZone: 'UTC',
            retryCount: 3,
            minBackoffSeconds: 60,
        },
        tags: ['DataCrunching'],
    })
    // @CloudHTTPv2({
    //     runtime: {
    //         cpu: 2,
    //         memory: '4GiB',
    //         timeoutSeconds: 3600,
    //         concurrency: 2,
    //         maxInstances: 200,
    //     },
    //     tags: ['DataCrunching'],
    // })
    async dispatchPageCacheCrunching(
        @RPCReflect() rpcReflect: RPCReflection,
    ) {
        const sse = new OutputServerEventStream({ highWaterMark: 4096 });
        rpcReflect.return(sse);
        rpcReflect.catch((err) => {
            sse.end({ data: `Error: ${err.message}` });
        });

        // The worker URL is the same for every task: resolve it once, and only
        // when there is at least one chunk to dispatch.
        let workerUri: string | undefined;
        for await (const { fileName, date, offset } of this.iterPageCacheChunks()) {
            this.logger.info(`Dispatching ${fileName}...`);
            sse.write({ data: `Dispatching ${fileName}...` });

            workerUri ??= await getFunctionUrl('crunchPageCacheWorker');
            await getFunctions().taskQueue('crunchPageCacheWorker').enqueue({ date, offset }, {
                dispatchDeadlineSeconds: 1800,
                uri: workerUri,
            });
        }

        sse.end({ data: 'done' });
        sse.resume();

        return true;
    }

    /**
     * Iterate over not-yet-crunched chunks and load their records.
     *
     * Starts at `date` (or `pageCacheCrunchingTMinus` ago when omitted) and at
     * record offset `inputOffset` (or 0), and never reaches into the current
     * UTC day. Chunks whose output file already exists are skipped.
     *
     * @param date optional day to start from
     * @param inputOffset optional starting record offset within the day
     * @yields `{ fileName, records }` for each chunk that has records
     */
    async* iterPageCacheRecords(date?: string, inputOffset?: number | string) {
        const startOfToday = dayjs().utc().startOf('day');
        const startingPoint = dayjs().utc().subtract(this.pageCacheCrunchingTMinus, 'ms').startOf('day');
        let theDay = startingPoint;

        if (date) {
            theDay = dayjs(date).utc().startOf('day');
        }

        let counter = 0;
        if (inputOffset) {
            counter = parseInt(inputOffset as string, 10);
        }

        while (theDay.isBefore(startOfToday)) {
            const fileName = `${this.pageCacheCrunchingPrefix}/r${this.rev}/${theDay.format('YYYY-MM-DD')}/${counter}.jsonl.gz`;
            const offset = counter;
            counter += this.pageCacheCrunchingBatchSize;
            const fileExists = (await this.firebaseObjectStorage.bucket.file(fileName).exists())[0];
            if (fileExists) {
                continue;
            }

            const records = await Crawled.fromFirestoreQuery(Crawled.COLLECTION
                .where('createdAt', '>=', theDay.toDate())
                .where('createdAt', '<', theDay.add(1, 'day').toDate())
                .orderBy('createdAt', 'asc')
                .offset(offset)
                .limit(this.pageCacheCrunchingBatchSize)
            );

            this.logger.info(`Found ${records.length} records for ${theDay.format('YYYY-MM-DD')} at offset ${offset}`, { fileName, counter });

            if (!records.length) {
                // An explicit date limits the iteration to that single day.
                if (date) {
                    break;
                }
                theDay = theDay.add(1, 'day');
                counter = 0;
                continue;
            }

            yield { fileName, records };

            // Stop after yielding any chunk that sits at a non-zero offset.
            // NOTE(review): `offset` is the current chunk's offset, not `inputOffset`,
            // so a run started at offset 0 also stops after its second chunk — confirm intent.
            if (offset) {
                break;
            }
        }
    }

    /**
     * Iterate over the chunks that still need crunching, without loading the
     * records (only a count query is issued per chunk).
     *
     * @yields `{ fileName, date, offset }` where `date` is the ISO timestamp
     *         of the start of the UTC day the chunk belongs to
     */
    async* iterPageCacheChunks() {
        const startOfToday = dayjs().utc().startOf('day');
        const startingPoint = dayjs().utc().subtract(this.pageCacheCrunchingTMinus, 'ms').startOf('day');
        let theDay = startingPoint;

        let counter = 0;

        while (theDay.isBefore(startOfToday)) {
            const fileName = `${this.pageCacheCrunchingPrefix}/r${this.rev}/${theDay.format('YYYY-MM-DD')}/${counter}.jsonl.gz`;
            const offset = counter;
            // Remember the day this chunk belongs to before the cursor may move on.
            const chunkDate = theDay.toISOString();
            counter += this.pageCacheCrunchingBatchSize;
            const fileExists = (await this.firebaseObjectStorage.bucket.file(fileName).exists())[0];
            if (fileExists) {
                continue;
            }

            const nRecords = (await Crawled.COLLECTION
                .where('createdAt', '>=', theDay.toDate())
                .where('createdAt', '<', theDay.add(1, 'day').toDate())
                .orderBy('createdAt', 'asc')
                .offset(offset)
                .limit(this.pageCacheCrunchingBatchSize)
                .count().get()).data().count;

            this.logger.info(`Found ${nRecords} records for ${theDay.format('YYYY-MM-DD')} at offset ${offset}`, { fileName, counter });
            if (nRecords < this.pageCacheCrunchingBatchSize) {
                // A short (or empty) chunk is the last one of the day.
                theDay = theDay.add(1, 'day');
                counter = 0;
            }
            if (nRecords) {
                yield { fileName, date: chunkDate, offset };
            }

            continue;
        }
    }

    /**
     * Download the stored snapshot of every record, format it through the
     * crawler and append one JSON line per snapshot to a temporary local file.
     *
     * At most 30 records are processed concurrently. Records whose snapshot
     * cannot be downloaded or processed are logged and skipped.
     *
     * @param records crawl records whose snapshots live at `snapshots/<_id>`
     * @returns an object holding the `path` of the local JSONL file
     */
    async crunchCacheRecords(records: Crawled[]) {
        const throttle = new PromiseThrottle(30);
        const localFilePath = this.tempFileManager.alloc();
        let nextDrainDeferred = Defer();
        nextDrainDeferred.resolve();

        for (const record of records) {
            await throttle.acquire();
            this.firebaseObjectStorage.downloadFile(`snapshots/${record._id}`)
                .then(async (snapshotTxt) => {
                    try {
                        const snapshot = JSON.parse(snapshotTxt.toString('utf-8'));

                        let formatted = await this.crawler.formatSnapshot('default', snapshot);
                        if (!formatted.content) {
                            formatted = await this.crawler.formatSnapshot('markdown', snapshot);
                        }

                        await nextDrainDeferred.promise;
                        await appendFile(localFilePath, JSON.stringify({
                            url: snapshot.href,
                            title: snapshot.title || '',
                            html: snapshot.html || '',
                            text: snapshot.text || '',
                            content: formatted.content || '',
                        }) + '\n', { encoding: 'utf-8' });

                    } catch (err) {
                        this.logger.warn(`Failed to parse snapshot for ${record._id}`, { err });
                    }
                })
                .catch((err) => {
                    // A failed download must not surface as an unhandled rejection.
                    this.logger.warn(`Failed to download snapshot for ${record._id}`, { err });
                })
                .finally(() => {
                    throttle.release();
                });
        }

        await throttle.nextDrain();


        const ro = {
            path: localFilePath
        };

        this.tempFileManager.bindPathTo(ro, localFilePath);

        return ro;
    }
}
|
backend/functions/src/cloud-functions/searcher.ts
CHANGED
|
@@ -19,6 +19,7 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
|
| 19 |
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
| 20 |
import { SearchResult } from '../db/searched';
|
| 21 |
import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
@singleton()
|
|
@@ -145,7 +146,8 @@ export class SearcherHost extends RPCHost {
|
|
| 145 |
req: Request,
|
| 146 |
res: Response,
|
| 147 |
},
|
| 148 |
-
auth: JinaEmbeddingsAuthDTO
|
|
|
|
| 149 |
) {
|
| 150 |
const uid = await auth.solveUID();
|
| 151 |
let chargeAmount = 0;
|
|
@@ -201,18 +203,7 @@ export class SearcherHost extends RPCHost {
|
|
| 201 |
});
|
| 202 |
}
|
| 203 |
|
| 204 |
-
const
|
| 205 |
-
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
|
| 206 |
-
const withLinksSummary = Boolean(ctx.req.get('x-with-links-summary'));
|
| 207 |
-
const withImagesSummary = Boolean(ctx.req.get('x-with-images-summary'));
|
| 208 |
-
const noCache = Boolean(ctx.req.get('x-no-cache'));
|
| 209 |
-
let pageCacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000;
|
| 210 |
-
if (isNaN(pageCacheTolerance)) {
|
| 211 |
-
pageCacheTolerance = this.pageCacheToleranceMs;
|
| 212 |
-
if (noCache) {
|
| 213 |
-
pageCacheTolerance = 0;
|
| 214 |
-
}
|
| 215 |
-
}
|
| 216 |
const cookies: CookieParam[] = [];
|
| 217 |
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
| 218 |
if (Array.isArray(setCookieHeaders)) {
|
|
@@ -226,27 +217,19 @@ export class SearcherHost extends RPCHost {
|
|
| 226 |
...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
|
| 227 |
});
|
| 228 |
}
|
| 229 |
-
this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
|
| 230 |
-
this.threadLocal.set('withLinksSummary', withLinksSummary);
|
| 231 |
-
this.threadLocal.set('withImagesSummary', withImagesSummary);
|
| 232 |
-
|
| 233 |
-
const crawlOpts: ScrappingOptions = {
|
| 234 |
-
proxyUrl: ctx.req.get('x-proxy-url'),
|
| 235 |
-
cookies,
|
| 236 |
-
favorScreenshot: customMode === 'screenshot'
|
| 237 |
-
};
|
| 238 |
-
|
| 239 |
const searchQuery = noSlashPath;
|
| 240 |
const r = await this.cachedWebSearch({
|
| 241 |
q: searchQuery,
|
| 242 |
count: 10
|
| 243 |
-
}, noCache);
|
| 244 |
|
| 245 |
if (!r.web?.results.length) {
|
| 246 |
throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
|
| 247 |
}
|
| 248 |
|
| 249 |
-
const it = this.fetchSearchResults(
|
|
|
|
|
|
|
| 250 |
|
| 251 |
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
| 252 |
const sseStream = new OutputServerEventStream();
|
|
|
|
| 19 |
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
| 20 |
import { SearchResult } from '../db/searched';
|
| 21 |
import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
|
| 22 |
+
import { CrawlerOptions } from '../dto/scrapping-options';
|
| 23 |
|
| 24 |
|
| 25 |
@singleton()
|
|
|
|
| 146 |
req: Request,
|
| 147 |
res: Response,
|
| 148 |
},
|
| 149 |
+
auth: JinaEmbeddingsAuthDTO,
|
| 150 |
+
crawlerOptions: CrawlerOptions,
|
| 151 |
) {
|
| 152 |
const uid = await auth.solveUID();
|
| 153 |
let chargeAmount = 0;
|
|
|
|
| 203 |
});
|
| 204 |
}
|
| 205 |
|
| 206 |
+
const crawlOpts = this.crawler.configure(crawlerOptions);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
const cookies: CookieParam[] = [];
|
| 208 |
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
| 209 |
if (Array.isArray(setCookieHeaders)) {
|
|
|
|
| 217 |
...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
|
| 218 |
});
|
| 219 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
const searchQuery = noSlashPath;
|
| 221 |
const r = await this.cachedWebSearch({
|
| 222 |
q: searchQuery,
|
| 223 |
count: 10
|
| 224 |
+
}, crawlerOptions.noCache);
|
| 225 |
|
| 226 |
if (!r.web?.results.length) {
|
| 227 |
throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
|
| 228 |
}
|
| 229 |
|
| 230 |
+
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
|
| 231 |
+
crawlerOptions.cacheTolerance || this.pageCacheToleranceMs
|
| 232 |
+
);
|
| 233 |
|
| 234 |
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
| 235 |
const sseStream = new OutputServerEventStream();
|
backend/functions/src/db/pdf.ts
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { Also, Prop, parseJSONText } from 'civkit';
|
| 2 |
+
import { FirestoreRecord } from '../shared/lib/firestore';
|
| 3 |
+
import _ from 'lodash';
|
| 4 |
+
|
| 5 |
+
/**
 * Firestore record (collection `pdfs`) holding the extraction result of one PDF.
 *
 * Fields listed in `patchedFields` are stored as JSON text: they are
 * stringified by `degradeForFireStore` and parsed back by `from`.
 */
@Also({
    dictOf: Object
})
export class PDFContent extends FirestoreRecord {
    static override collectionName = 'pdfs';

    override _id!: string;

    @Prop({ required: true })
    src!: string;

    @Prop({ required: true })
    urlDigest!: string;

    @Prop()
    meta?: { [k: string]: any; };

    @Prop()
    text?: string;

    @Prop()
    content?: string;

    @Prop()
    createdAt!: Date;

    @Prop()
    expireAt?: Date;

    /** Names of the fields persisted as JSON strings. */
    static patchedFields = ['meta'];

    /**
     * Build a record from raw input, first parsing any patched field that
     * arrives as a string. The parsed value is written back onto `input`.
     */
    static override from(input: any) {
        this.patchedFields.forEach((name) => {
            if (typeof input[name] !== 'string') {
                return;
            }
            input[name] = parseJSONText(input[name]);
        });

        return super.from(input) as PDFContent;
    }

    /**
     * Produce a shallow copy of this record in which every patched field of
     * type `object` is replaced by its JSON string.
     */
    override degradeForFireStore() {
        const plain: any = { ...this };
        const { patchedFields } = this.constructor as typeof PDFContent;

        for (const name of patchedFields) {
            if (typeof plain[name] !== 'object') {
                continue;
            }
            plain[name] = JSON.stringify(plain[name]) as any;
        }

        return plain;
    }

    [k: string]: any;
}
|
backend/functions/src/dto/scrapping-options.ts
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
|
| 2 |
+
import type { Request, Response } from 'express';
|
| 3 |
+
import type { CookieParam } from 'puppeteer';
|
| 4 |
+
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
| 5 |
+
|
| 6 |
+
/**
 * Options controlling a crawl, populated from the RPC input and then
 * adjusted by the `x-*` request headers when a request context is attached
 * to the input under `RPC_CALL_ENVIRONMENT`.
 */
export class CrawlerOptions extends AutoCastable {

    @Prop({
        default: 'default',
    })
    respondWith!: string;

    @Prop({
        default: false,
    })
    withGeneratedAlt!: boolean;

    @Prop({
        default: false,
    })
    withLinksSummary!: boolean;

    @Prop({
        default: false,
    })
    withImagesSummary!: boolean;

    @Prop({
        default: false,
    })
    noCache!: boolean;

    @Prop()
    cacheTolerance?: number;

    @Prop()
    targetSelector?: string;

    @Prop()
    waitForSelector?: string;

    @Prop({
        arrayOf: String,
    })
    setCookies?: CookieParam[];

    @Prop()
    proxyUrl?: string;

    /**
     * Cast `input` into a `CrawlerOptions` and apply request headers.
     *
     * Header handling, when a request context is present:
     * - `x-respond-with` / `x-return-format` override `respondWith`;
     * - `x-with-generated-alt`, `x-with-links-summary`, `x-with-images-summary`
     *   and `x-no-cache` override the matching flag with `Boolean(headerValue)`,
     *   so any non-empty value (including the text "false") enables the flag;
     * - `x-cache-tolerance` overrides `cacheTolerance` when it parses as an integer;
     * - `x-target-selector`, `x-wait-for-selector` and `x-proxy-url` only fill
     *   in values the input did not already provide;
     * - `x-set-cookie` takes precedence over `setCookies` from the input.
     *
     * Raw cookie strings (from header or input) are parsed and stored back on
     * `setCookies` as cookie objects. Without a request context only the
     * input-provided values are used.
     */
    static override from(input: any) {
        const instance = super.from(input) as CrawlerOptions;
        // The call environment is optional: internal callers may pass plain options.
        const ctx = (typeof input === 'object' && input !== null
            ? Reflect.get(input, RPC_CALL_ENVIRONMENT)
            : undefined) as {
                req: Request,
                res: Response,
            } | undefined;
        const header = (name: string) => ctx?.req?.get(name);

        const customMode = header('x-respond-with') || header('x-return-format');
        if (customMode !== undefined) {
            instance.respondWith = customMode;
        }

        const withGeneratedAlt = header('x-with-generated-alt');
        if (withGeneratedAlt !== undefined) {
            instance.withGeneratedAlt = Boolean(withGeneratedAlt);
        }
        const withLinksSummary = header('x-with-links-summary');
        if (withLinksSummary !== undefined) {
            instance.withLinksSummary = Boolean(withLinksSummary);
        }
        const withImagesSummary = header('x-with-images-summary');
        if (withImagesSummary !== undefined) {
            instance.withImagesSummary = Boolean(withImagesSummary);
        }
        const noCache = header('x-no-cache');
        if (noCache !== undefined) {
            instance.noCache = Boolean(noCache);
            if (instance.noCache && instance.cacheTolerance === undefined) {
                instance.cacheTolerance = 0;
            }
        }
        const cacheTolerance = parseInt(header('x-cache-tolerance') || '', 10);
        if (!isNaN(cacheTolerance)) {
            instance.cacheTolerance = cacheTolerance;
        }

        const targetSelector = header('x-target-selector');
        instance.targetSelector ??= targetSelector;
        const waitForSelector = header('x-wait-for-selector');
        instance.waitForSelector ??= waitForSelector || instance.targetSelector;

        const cookies: CookieParam[] = [];
        const setCookieHeaders = ctx?.req?.headers['x-set-cookie'] || (instance.setCookies as any as string[]);
        if (Array.isArray(setCookieHeaders)) {
            for (const setCookie of setCookieHeaders) {
                cookies.push({
                    ...parseSetCookieString(setCookie, { decodeValues: false }) as CookieParam,
                });
            }
        } else if (setCookieHeaders && typeof setCookieHeaders === 'string') {
            cookies.push({
                ...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
            });
        }
        if (setCookieHeaders) {
            // Expose the parsed cookies; otherwise the raw strings would be left in place.
            instance.setCookies = cookies;
        }

        const proxyUrl = header('x-proxy-url');
        instance.proxyUrl ??= proxyUrl;

        return instance;
    }
}
|
backend/functions/src/services/alt-text.ts
CHANGED
|
@@ -6,7 +6,6 @@ import { ImageInterrogationManager } from '../shared/services/common-iminterroga
|
|
| 6 |
import { ImgBrief } from './puppeteer';
|
| 7 |
import { ImgAlt } from '../db/img-alt';
|
| 8 |
|
| 9 |
-
|
| 10 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 11 |
|
| 12 |
@singleton()
|
|
|
|
| 6 |
import { ImgBrief } from './puppeteer';
|
| 7 |
import { ImgAlt } from '../db/img-alt';
|
| 8 |
|
|
|
|
| 9 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 10 |
|
| 11 |
@singleton()
|
backend/functions/src/services/pdf-extract.ts
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import 'core-js/actual/promise/with-resolvers';
|
| 2 |
+
import { singleton } from 'tsyringe';
|
| 3 |
+
import _ from 'lodash';
|
| 4 |
+
import { TextItem } from 'pdfjs-dist/types/src/display/api';
|
| 5 |
+
import { AsyncService, HashManager } from 'civkit';
|
| 6 |
+
import { Logger } from '../shared/services/logger';
|
| 7 |
+
import { PDFContent } from '../db/pdf';
|
| 8 |
+
import dayjs from 'dayjs';
|
| 9 |
+
const utc = require('dayjs/plugin/utc'); // Import the UTC plugin
|
| 10 |
+
dayjs.extend(utc); // Extend dayjs with the UTC plugin
|
| 11 |
+
const timezone = require('dayjs/plugin/timezone');
|
| 12 |
+
dayjs.extend(timezone);
|
| 13 |
+
|
| 14 |
+
const pPdfjs = import('pdfjs-dist');
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
const md5Hasher = new HashManager('md5', 'hex');
|
| 18 |
+
|
| 19 |
+
/**
 * Population standard deviation: the square root of the mean squared
 * distance of each value from the mean of all values.
 */
function stdDev(numbers: number[]) {
    const center = _.mean(numbers);
    const variance = _.mean(numbers.map((value) => Math.pow(value - center, 2)));

    return Math.sqrt(variance);
}
|
| 25 |
+
|
| 26 |
+
/**
 * Tell whether a 2D affine transform `[a, b, c, d, e, f]` is rotated by 35
 * degrees or more.
 *
 * Two angles are derived, one from `(a, b)` and one from `(c, d)`; the
 * transform counts as rotated when the magnitude of either reaches 35
 * degrees. The translation components `e` and `f` are ignored.
 */
function isRotatedByAtLeast35Degrees(transform: [number, number, number, number, number, number]): boolean {
    const [a, b, c, d] = transform;
    const toDegrees = 180 / Math.PI;

    const magnitudes = [
        Math.atan2(b, a),   // angle from a, b
        Math.atan2(-c, d),  // angle from c, d
    ].map((radians) => Math.abs(radians * toDegrees));

    return magnitudes.some((degrees) => degrees >= 35);
}
|
| 40 |
+
|
| 41 |
+
@singleton()
|
| 42 |
+
export class PDFExtractor extends AsyncService {
|
| 43 |
+
|
| 44 |
+
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 45 |
+
pdfjs!: Awaited<typeof pPdfjs>;
|
| 46 |
+
|
| 47 |
+
/**
 * @param globalLogger logger from which this service derives its child logger
 */
constructor(protected globalLogger: Logger) {
    // Forward every constructor argument to the base service.
    super(...arguments);
}
|
| 52 |
+
|
| 53 |
+
/**
 * Wait for dependencies, resolve the dynamically imported pdf.js module and
 * then signal readiness.
 */
override async init() {
    await this.dependencyReady();

    const pdfjsModule = await pPdfjs;
    this.pdfjs = pdfjsModule;

    this.emit('ready');
}
|
| 59 |
+
|
| 60 |
+
/**
 * Load a PDF and turn its text layer into markdown plus raw text.
 *
 * Text items are classified by character height:
 * - more than 3 standard deviations above the document average: `h1`;
 * - more than 2 standard deviations above it: `h2`;
 * - more than one standard deviation below the page average: `appendix`;
 * - rotated by 35 degrees or more: `appendix`;
 * - zero height: `space`;
 * - everything else: `p`.
 *
 * Appendix text is buffered and emitted as a blockquote in front of the next
 * non-appendix start, or at the very end of the document.
 *
 * @param url location of the PDF, handed to pdf.js `getDocument`
 * @returns `meta` (the document info dictionary), `content` (markdown) and
 *          `text` (raw text items joined, with newlines at item line ends)
 */
async extract(url: string | URL) {
    const loadingTask = this.pdfjs.getDocument({
        url,
        disableFontFace: true,
        verbosity: 0
    });

    let metaInfo: Record<string, any>;
    const textItems: TextItem[][] = [];

    try {
        const doc = await loadingTask.promise;
        metaInfo = (await doc.getMetadata()).info as Record<string, any>;

        // pdf.js pages are 1-based.
        for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
            const page = await doc.getPage(pageNo);
            const textContent = await page.getTextContent();
            textItems.push((textContent.items as TextItem[]));
        }
    } finally {
        // Release the document and its worker; everything needed has been copied out.
        await loadingTask.destroy().catch((err: unknown) => {
            this.logger.warn(`Unable to release pdf document ${url}`, { err });
        });
    }

    // Append `value` once per character, so statistics are weighted by text length.
    // A loop is used instead of spreading a filled array, which would pass one
    // call argument per character.
    const pushRepeated = (target: number[], value: number, times: number) => {
        for (let i = 0; i < times; i++) {
            target.push(value);
        }
    };

    // Render buffered appendix chunks as markdown blockquote lines.
    const toBlockquote = (chunks: string[]) => chunks.join('')
        .split(/\r?\n/)
        .map((line) => line.trim())
        .filter(Boolean)
        .map((line) => `> ${line}`)
        .join('\n');

    const articleCharHeights: number[] = [];
    for (const textItem of textItems.flat()) {
        if (textItem.height) {
            pushRepeated(articleCharHeights, textItem.height, textItem.str.length);
        }
    }
    const articleAvgHeight = _.mean(articleCharHeights);
    const articleStdDevHeight = stdDev(articleCharHeights);
    // const articleMedianHeight = articleCharHeights.sort()[Math.floor(articleCharHeights.length / 2)];
    const mdOps: Array<{
        text: string;
        op?: 'new' | 'append';
        mode: 'h1' | 'h2' | 'p' | 'appendix' | 'space';
    }> = [];

    const rawChunks: string[] = [];

    let op: 'append' | 'new' = 'new';
    let mode: 'h1' | 'h2' | 'p' | 'space' | 'appendix' = 'p';
    for (const pageTextItems of textItems) {
        const charHeights: number[] = [];
        for (const textItem of pageTextItems as TextItem[]) {
            if (textItem.height) {
                pushRepeated(charHeights, textItem.height, textItem.str.length);
            }
            rawChunks.push(`${textItem.str}${textItem.hasEOL ? '\n' : ''}`);
        }

        const avgHeight = _.mean(charHeights);
        const stdDevHeight = stdDev(charHeights);
        // const medianHeight = charHeights.sort()[Math.floor(charHeights.length / 2)];

        for (const textItem of pageTextItems) {
            if (textItem.height > articleAvgHeight + 3 * articleStdDevHeight) {
                mode = 'h1';
            } else if (textItem.height > articleAvgHeight + 2 * articleStdDevHeight) {
                mode = 'h2';
            } else if (textItem.height && textItem.height < avgHeight - stdDevHeight) {
                mode = 'appendix';
            } else if (textItem.height) {
                mode = 'p';
            } else {
                mode = 'space';
            }

            if (isRotatedByAtLeast35Degrees(textItem.transform as any)) {
                mode = 'appendix';
            }

            mdOps.push({
                op,
                mode,
                text: textItem.str
            });

            // An empty item that ends a line makes the next item start a new block.
            if (textItem.hasEOL && !textItem.str) {
                op = 'new';
            } else {
                op = 'append';
            }
        }
    }

    const mdChunks: string[] = [];
    const appendixChunks: string[] = [];
    mode = 'space';
    for (const x of mdOps) {
        const previousMode: string = mode;
        const changeToMdChunks: string[] = [];

        const isNewStart = x.mode !== 'space' && (x.op === 'new' || (previousMode === 'appendix' && x.mode !== previousMode));

        if (isNewStart) {
            switch (x.mode) {
                case 'h1': {
                    changeToMdChunks.push(`\n\n# `);
                    mode = x.mode;
                    break;
                }

                case 'h2': {
                    changeToMdChunks.push(`\n\n## `);
                    mode = x.mode;
                    break;
                }

                case 'p': {
                    changeToMdChunks.push(`\n\n`);
                    mode = x.mode;
                    break;
                }

                case 'appendix': {
                    mode = x.mode;
                    appendixChunks.push(`\n\n`);
                    break;
                }

                default: {
                    break;
                }
            }
        } else {
            // Continuation: separate from the previous chunk with a single space
            // unless it already ends in whitespace or a hyphen, or is one character long.
            if (x.mode === 'appendix' && appendixChunks.length) {
                const lastChunk = appendixChunks[appendixChunks.length - 1];
                if (!lastChunk.match(/(\s+|-)$/) && lastChunk.length !== 1) {
                    appendixChunks.push(' ');
                }
            } else if (mdChunks.length) {
                const lastChunk = mdChunks[mdChunks.length - 1];
                if (!lastChunk.match(/(\s+|-)$/) && lastChunk.length !== 1) {
                    changeToMdChunks.push(' ');
                }
            }
        }

        if (x.text) {
            if (x.mode === 'appendix') {
                if (appendixChunks.length || isNewStart) {
                    appendixChunks.push(x.text);
                } else {
                    changeToMdChunks.push(x.text);
                }
            } else {
                changeToMdChunks.push(x.text);
            }
        }

        // Flush the buffered appendix in front of the block that starts now.
        if (isNewStart && x.mode !== 'appendix' && appendixChunks.length) {
            const appendix = toBlockquote(appendixChunks);
            changeToMdChunks.unshift(appendix);
            changeToMdChunks.unshift(`\n\n`);
            appendixChunks.length = 0;
        }

        if (x.mode === 'space' && changeToMdChunks.length) {
            changeToMdChunks.length = 1;
        }
        if (changeToMdChunks.length) {
            mdChunks.push(...changeToMdChunks);
        }
    }

    // Appendix text still buffered at the end of the document has no following
    // block to trigger its flush; emit it here so it is not lost.
    if (appendixChunks.length) {
        const appendix = toBlockquote(appendixChunks);
        if (appendix) {
            mdChunks.push(`\n\n`, appendix);
        }
        appendixChunks.length = 0;
    }

    if (mdChunks.length) {
        mdChunks[0] = mdChunks[0].trimStart();
    }

    return { meta: metaInfo, content: mdChunks.join(''), text: rawChunks.join('') };
}
|
| 227 |
+
|
| 228 |
+
/**
 * Extracts a PDF through a Firestore-backed cache keyed by the URL's MD5 digest.
 *
 * Flow:
 *  1. A falsy `url` short-circuits to `undefined`.
 *  2. The document id is the MD5 digest of the URL string, re-encoded from hex to base64url.
 *  3. If a `PDFContent` record exists under that id, its `meta`, `content` and `text` are returned.
 *  4. Otherwise `this.extract(url)` runs. A failure is logged as a warning and not rethrown.
 *  5. A record is always written afterwards (with `merge: true`). When extraction failed the
 *     record holds empty values plus an `expireAt` 24 hours ahead, so the failed URL is not
 *     retried until the next day.
 *     NOTE(review): presumably a Firestore TTL policy on `expireAt` removes that record — confirm.
 *
 * @param url Location of the PDF.
 * @returns The fresh extraction result, the cached `{ meta, content, text }`, or `undefined`
 *          when `url` is falsy or the extraction failed on this call.
 */
async cachedExtract(url: string | URL) {
    if (!url) {
        return undefined;
    }

    const src = url.toString();
    const urlDigest = md5Hasher.hash(src);
    // base64url keeps the document id shorter than the hex digest.
    const docId = Buffer.from(urlDigest, 'hex').toString('base64url');

    const cached = await PDFContent.fromFirestore(docId);
    if (cached) {
        const { meta, content, text } = cached;

        return { meta, content, text };
    }

    let extracted;
    try {
        extracted = await this.extract(url);
    } catch (err) {
        // Best effort: a failed extraction is still recorded below as a negative cache entry.
        this.logger.warn(`Unable to extract from pdf ${url}`, { err });
    }

    // Failed extractions get an expiry so they are not attempted again until the next day.
    let expiry: { expireAt?: Date } = {};
    if (!extracted) {
        expiry = { expireAt: new Date(Date.now() + 1000 * 3600 * 24) };
    }

    const record = {
        _id: docId,
        src,
        meta: extracted?.meta || {},
        content: extracted?.content || '',
        text: extracted?.text || '',
        urlDigest,
        createdAt: new Date(),
        ...expiry,
    };
    await PDFContent.COLLECTION.doc(docId).set(record, { merge: true });

    return extracted;
}
|
| 272 |
+
|
| 273 |
+
/**
 * Converts a PDF date string (`D:YYYYMMDDHHmmSSOHH'mm'`) into a `Date`.
 *
 * Compared with blindly slicing the string, this tolerates what the PDF date
 * format allows and what real-world producers emit:
 *  - the `D:` prefix may be absent (it is no longer stripped unconditionally);
 *  - every field after the year is optional and defaults to `01` (month, day)
 *    or `00` (hour, minute, second, offset minutes);
 *  - the offset may be `Z`, or `+`/`-` followed by `HH`, with or without the
 *    apostrophe separators.
 *
 * The value is normalised to ISO 8601 before being handed to dayjs, so parsing
 * does not depend on a custom-format plugin being loaded.
 *
 * @param pdfDate Raw date value from the PDF metadata, if any.
 * @returns The parsed date, or `undefined` when the input is empty, does not
 *          start with a four-digit year, or yields an invalid date.
 */
parsePdfDate(pdfDate: string | undefined) {
    if (!pdfDate) {
        return undefined;
    }

    const matched = /^\s*(?:D:)?(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?\s*(?:(Z)|([+-])(\d{2})'?(\d{2})?'?)?/.exec(pdfDate);
    if (!matched) {
        return undefined;
    }

    const [
        ,
        year,
        month = '01',
        day = '01',
        hour = '00',
        minute = '00',
        second = '00',
        utc,
        sign,
        offsetHour,
        offsetMinute = '00',
    ] = matched;

    // With no offset the ISO string carries no zone and is read as local time.
    // NOTE(review): confirm local time is the desired reading for zone-less PDF dates.
    let offset = '';
    if (utc) {
        offset = 'Z';
    } else if (sign && offsetHour) {
        offset = `${sign}${offsetHour}:${offsetMinute}`;
    }

    const date = dayjs(`${year}-${month}-${day}T${hour}:${minute}:${second}${offset}`).toDate();

    // Only reject invalid dates; the epoch instant (value 0) is a legitimate result.
    if (Number.isNaN(date.getTime())) {
        return undefined;
    }

    return date;
}
|
| 298 |
+
}
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -50,6 +50,7 @@ export interface PageSnapshot {
|
|
| 50 |
parsed?: Partial<ReadabilityParsed> | null;
|
| 51 |
screenshot?: Buffer;
|
| 52 |
imgs?: ImgBrief[];
|
|
|
|
| 53 |
}
|
| 54 |
|
| 55 |
export interface ExtendedSnapshot extends PageSnapshot {
|
|
@@ -62,6 +63,7 @@ export interface ScrappingOptions {
|
|
| 62 |
cookies?: CookieParam[];
|
| 63 |
favorScreenshot?: boolean;
|
| 64 |
waitForSelector?: string;
|
|
|
|
| 65 |
}
|
| 66 |
|
| 67 |
|
|
@@ -97,7 +99,9 @@ export class PuppeteerControl extends AsyncService {
|
|
| 97 |
livePages = new Set<Page>();
|
| 98 |
lastPageCratedAt: number = 0;
|
| 99 |
|
| 100 |
-
constructor(
|
|
|
|
|
|
|
| 101 |
super(...arguments);
|
| 102 |
this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95;
|
| 103 |
|
|
@@ -219,7 +223,17 @@ function briefImgs(elem) {
|
|
| 219 |
};
|
| 220 |
});
|
| 221 |
}
|
| 222 |
-
function
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
let parsed;
|
| 224 |
try {
|
| 225 |
parsed = new Readability(document.cloneNode(true)).parse();
|
|
@@ -234,6 +248,7 @@ function giveSnapshot() {
|
|
| 234 |
text: document.body?.innerText,
|
| 235 |
parsed: parsed,
|
| 236 |
imgs: [],
|
|
|
|
| 237 |
};
|
| 238 |
if (parsed && parsed.content) {
|
| 239 |
const elem = document.createElement('div');
|
|
@@ -277,7 +292,7 @@ function giveSnapshot() {
|
|
| 277 |
});
|
| 278 |
|
| 279 |
await page.evaluateOnNewDocument(`
|
| 280 |
-
let
|
| 281 |
const handlePageLoad = () => {
|
| 282 |
if (window.haltSnapshot) {
|
| 283 |
return;
|
|
@@ -285,26 +300,23 @@ const handlePageLoad = () => {
|
|
| 285 |
if (document.readyState !== 'complete') {
|
| 286 |
return;
|
| 287 |
}
|
| 288 |
-
const
|
| 289 |
-
|
| 290 |
-
if (
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
}
|
| 294 |
-
aftershot = setTimeout(() => {
|
| 295 |
-
const r = giveSnapshot();
|
| 296 |
-
if (r && r.text) {
|
| 297 |
-
window.reportSnapshot(r);
|
| 298 |
-
}
|
| 299 |
-
}, 500);
|
| 300 |
}
|
|
|
|
|
|
|
|
|
|
| 301 |
};
|
|
|
|
| 302 |
document.addEventListener('readystatechange', handlePageLoad);
|
| 303 |
document.addEventListener('load', handlePageLoad);
|
| 304 |
`);
|
| 305 |
|
| 306 |
this.snMap.set(page, sn);
|
| 307 |
-
this.logger.
|
| 308 |
this.lastPageCratedAt = Date.now();
|
| 309 |
this.livePages.add(page);
|
| 310 |
|
|
@@ -409,12 +421,12 @@ document.addEventListener('load', handlePageLoad);
|
|
| 409 |
finalized = true;
|
| 410 |
return;
|
| 411 |
}
|
| 412 |
-
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
| 413 |
screenshot = await page.screenshot();
|
| 414 |
-
if (!snapshot.title || !snapshot.parsed?.content) {
|
| 415 |
const salvaged = await this.salvage(url, page);
|
| 416 |
if (salvaged) {
|
| 417 |
-
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
| 418 |
screenshot = await page.screenshot();
|
| 419 |
}
|
| 420 |
}
|
|
@@ -429,7 +441,7 @@ document.addEventListener('load', handlePageLoad);
|
|
| 429 |
if (options?.waitForSelector) {
|
| 430 |
page.waitForSelector(options.waitForSelector)
|
| 431 |
.then(async () => {
|
| 432 |
-
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
| 433 |
screenshot = await page.screenshot();
|
| 434 |
finalized = true;
|
| 435 |
nextSnapshotDeferred.resolve(snapshot);
|
|
@@ -442,7 +454,11 @@ document.addEventListener('load', handlePageLoad);
|
|
| 442 |
try {
|
| 443 |
let lastHTML = snapshot?.html;
|
| 444 |
while (true) {
|
| 445 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 446 |
if (finalized) {
|
| 447 |
yield { ...snapshot, screenshot } as PageSnapshot;
|
| 448 |
break;
|
|
|
|
| 50 |
parsed?: Partial<ReadabilityParsed> | null;
|
| 51 |
screenshot?: Buffer;
|
| 52 |
imgs?: ImgBrief[];
|
| 53 |
+
pdfs?: string[];
|
| 54 |
}
|
| 55 |
|
| 56 |
export interface ExtendedSnapshot extends PageSnapshot {
|
|
|
|
| 63 |
cookies?: CookieParam[];
|
| 64 |
favorScreenshot?: boolean;
|
| 65 |
waitForSelector?: string;
|
| 66 |
+
minIntervalMs?: number;
|
| 67 |
}
|
| 68 |
|
| 69 |
|
|
|
|
| 99 |
livePages = new Set<Page>();
|
| 100 |
lastPageCratedAt: number = 0;
|
| 101 |
|
| 102 |
+
constructor(
|
| 103 |
+
protected globalLogger: Logger,
|
| 104 |
+
) {
|
| 105 |
super(...arguments);
|
| 106 |
this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95;
|
| 107 |
|
|
|
|
| 223 |
};
|
| 224 |
});
|
| 225 |
}
|
| 226 |
+
function briefPDFs() {
|
| 227 |
+
const pdfTags = Array.from(document.querySelectorAll('embed[type="application/pdf"]'));
|
| 228 |
+
|
| 229 |
+
return pdfTags.map((x)=> {
|
| 230 |
+
return x.src === 'about:blank' ? document.location.href : x.src;
|
| 231 |
+
});
|
| 232 |
+
}
|
| 233 |
+
function giveSnapshot(stopActiveSnapshot) {
|
| 234 |
+
if (stopActiveSnapshot) {
|
| 235 |
+
window.haltSnapshot = true;
|
| 236 |
+
}
|
| 237 |
let parsed;
|
| 238 |
try {
|
| 239 |
parsed = new Readability(document.cloneNode(true)).parse();
|
|
|
|
| 248 |
text: document.body?.innerText,
|
| 249 |
parsed: parsed,
|
| 250 |
imgs: [],
|
| 251 |
+
pdfs: briefPDFs(),
|
| 252 |
};
|
| 253 |
if (parsed && parsed.content) {
|
| 254 |
const elem = document.createElement('div');
|
|
|
|
| 292 |
});
|
| 293 |
|
| 294 |
await page.evaluateOnNewDocument(`
|
| 295 |
+
let lastTextLength = 0;
|
| 296 |
const handlePageLoad = () => {
|
| 297 |
if (window.haltSnapshot) {
|
| 298 |
return;
|
|
|
|
| 300 |
if (document.readyState !== 'complete') {
|
| 301 |
return;
|
| 302 |
}
|
| 303 |
+
const thisTextLength = (document.body.innerText || '').length;
|
| 304 |
+
const deltaLength = Math.abs(thisTextLength - lastTextLength);
|
| 305 |
+
if (10 * deltaLength < lastTextLength) {
|
| 306 |
+
// Change is not significant
|
| 307 |
+
return;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
}
|
| 309 |
+
const r = giveSnapshot();
|
| 310 |
+
window.reportSnapshot(r);
|
| 311 |
+
lastTextLength = thisTextLength;
|
| 312 |
};
|
| 313 |
+
setInterval(handlePageLoad, 500);
|
| 314 |
document.addEventListener('readystatechange', handlePageLoad);
|
| 315 |
document.addEventListener('load', handlePageLoad);
|
| 316 |
`);
|
| 317 |
|
| 318 |
this.snMap.set(page, sn);
|
| 319 |
+
this.logger.info(`Page ${sn} created.`);
|
| 320 |
this.lastPageCratedAt = Date.now();
|
| 321 |
this.livePages.add(page);
|
| 322 |
|
|
|
|
| 421 |
finalized = true;
|
| 422 |
return;
|
| 423 |
}
|
| 424 |
+
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 425 |
screenshot = await page.screenshot();
|
| 426 |
+
if ((!snapshot.title || !snapshot.parsed?.content) && !(snapshot.pdfs?.length)) {
|
| 427 |
const salvaged = await this.salvage(url, page);
|
| 428 |
if (salvaged) {
|
| 429 |
+
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 430 |
screenshot = await page.screenshot();
|
| 431 |
}
|
| 432 |
}
|
|
|
|
| 441 |
if (options?.waitForSelector) {
|
| 442 |
page.waitForSelector(options.waitForSelector)
|
| 443 |
.then(async () => {
|
| 444 |
+
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 445 |
screenshot = await page.screenshot();
|
| 446 |
finalized = true;
|
| 447 |
nextSnapshotDeferred.resolve(snapshot);
|
|
|
|
| 454 |
try {
|
| 455 |
let lastHTML = snapshot?.html;
|
| 456 |
while (true) {
|
| 457 |
+
const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
|
| 458 |
+
if (options?.minIntervalMs) {
|
| 459 |
+
ckpt.push(delay(options.minIntervalMs));
|
| 460 |
+
}
|
| 461 |
+
await Promise.race(ckpt);
|
| 462 |
if (finalized) {
|
| 463 |
yield { ...snapshot, screenshot } as PageSnapshot;
|
| 464 |
break;
|
backend/functions/tsconfig.json
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"compilerOptions": {
|
| 3 |
-
"module": "
|
|
|
|
| 4 |
"noImplicitReturns": true,
|
| 5 |
"noUnusedLocals": true,
|
| 6 |
"outDir": "build",
|
|
|
|
| 1 |
{
|
| 2 |
"compilerOptions": {
|
| 3 |
+
"module": "node16",
|
| 4 |
+
|
| 5 |
"noImplicitReturns": true,
|
| 6 |
"noUnusedLocals": true,
|
| 7 |
"outDir": "build",
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit b0b597800a36e2aa8ee3d52715aa7c998b388f47
|