nomagick committed on
Commit
33e14e5
·
unverified ·
1 Parent(s): 7c57123

feat: extract text from pdf (#70)

Browse files

* feat: pdf

* fix

* fix

backend/firebase.json CHANGED
@@ -33,9 +33,6 @@
33
  "functions": {
34
  "port": 5001
35
  },
36
- "auth": {
37
- "port": 9099
38
- },
39
  "firestore": {
40
  "port": 9098
41
  },
 
33
  "functions": {
34
  "port": 5001
35
  },
 
 
 
36
  "firestore": {
37
  "port": 9098
38
  },
backend/functions/package-lock.json CHANGED
@@ -15,6 +15,7 @@
15
  "axios": "^1.3.3",
16
  "bcrypt": "^5.1.0",
17
  "civkit": "^0.6.5-047c0d8",
 
18
  "cors": "^2.8.5",
19
  "dayjs": "^1.11.9",
20
  "express": "^4.19.2",
@@ -27,6 +28,7 @@
27
  "maxmind": "^4.3.18",
28
  "minio": "^7.1.3",
29
  "openai": "^4.20.0",
 
30
  "puppeteer": "^22.7.1",
31
  "puppeteer-extra": "^3.3.6",
32
  "puppeteer-extra-plugin-block-resources": "^2.4.3",
@@ -3923,6 +3925,16 @@
3923
  "integrity": "sha512-3DdaFaU/Zf1AnpLiFDeNCD4TOWe3Zl2RZaTzUvWiIk5ERzcCodOE20Vqq4fzCbNoHURFHT4/us/Lfq+S2zyY4w==",
3924
  "optional": true
3925
  },
 
 
 
 
 
 
 
 
 
 
3926
  "node_modules/core-util-is": {
3927
  "version": "1.0.3",
3928
  "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz",
@@ -9208,6 +9220,18 @@
9208
  "node": ">=8"
9209
  }
9210
  },
 
 
 
 
 
 
 
 
 
 
 
 
9211
  "node_modules/pend": {
9212
  "version": "1.2.0",
9213
  "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
 
15
  "axios": "^1.3.3",
16
  "bcrypt": "^5.1.0",
17
  "civkit": "^0.6.5-047c0d8",
18
+ "core-js": "^3.37.1",
19
  "cors": "^2.8.5",
20
  "dayjs": "^1.11.9",
21
  "express": "^4.19.2",
 
28
  "maxmind": "^4.3.18",
29
  "minio": "^7.1.3",
30
  "openai": "^4.20.0",
31
+ "pdfjs-dist": "^4.2.67",
32
  "puppeteer": "^22.7.1",
33
  "puppeteer-extra": "^3.3.6",
34
  "puppeteer-extra-plugin-block-resources": "^2.4.3",
 
3925
  "integrity": "sha512-3DdaFaU/Zf1AnpLiFDeNCD4TOWe3Zl2RZaTzUvWiIk5ERzcCodOE20Vqq4fzCbNoHURFHT4/us/Lfq+S2zyY4w==",
3926
  "optional": true
3927
  },
3928
+ "node_modules/core-js": {
3929
+ "version": "3.37.1",
3930
+ "resolved": "https://registry.npmjs.org/core-js/-/core-js-3.37.1.tgz",
3931
+ "integrity": "sha512-Xn6qmxrQZyB0FFY8E3bgRXei3lWDJHhvI+u0q9TKIYM49G8pAr0FgnnrFRAmsbptZL1yxRADVXn+x5AGsbBfyw==",
3932
+ "hasInstallScript": true,
3933
+ "funding": {
3934
+ "type": "opencollective",
3935
+ "url": "https://opencollective.com/core-js"
3936
+ }
3937
+ },
3938
  "node_modules/core-util-is": {
3939
  "version": "1.0.3",
3940
  "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz",
 
9220
  "node": ">=8"
9221
  }
9222
  },
9223
+ "node_modules/pdfjs-dist": {
9224
+ "version": "4.2.67",
9225
+ "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-4.2.67.tgz",
9226
+ "integrity": "sha512-rJmuBDFpD7cqC8WIkQUEClyB4UAH05K4AsyewToMTp2gSy3Rrx8c1ydAVqlJlGv3yZSOrhEERQU/4ScQQFlLHA==",
9227
+ "engines": {
9228
+ "node": ">=18"
9229
+ },
9230
+ "optionalDependencies": {
9231
+ "canvas": "^2.11.2",
9232
+ "path2d": "^0.2.0"
9233
+ }
9234
+ },
9235
  "node_modules/pend": {
9236
  "version": "1.2.0",
9237
  "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
backend/functions/package.json CHANGED
@@ -35,6 +35,7 @@
35
  "axios": "^1.3.3",
36
  "bcrypt": "^5.1.0",
37
  "civkit": "^0.6.5-047c0d8",
 
38
  "cors": "^2.8.5",
39
  "dayjs": "^1.11.9",
40
  "express": "^4.19.2",
@@ -47,6 +48,7 @@
47
  "maxmind": "^4.3.18",
48
  "minio": "^7.1.3",
49
  "openai": "^4.20.0",
 
50
  "puppeteer": "^22.7.1",
51
  "puppeteer-extra": "^3.3.6",
52
  "puppeteer-extra-plugin-block-resources": "^2.4.3",
 
35
  "axios": "^1.3.3",
36
  "bcrypt": "^5.1.0",
37
  "civkit": "^0.6.5-047c0d8",
38
+ "core-js": "^3.37.1",
39
  "cors": "^2.8.5",
40
  "dayjs": "^1.11.9",
41
  "express": "^4.19.2",
 
48
  "maxmind": "^4.3.18",
49
  "minio": "^7.1.3",
50
  "openai": "^4.20.0",
51
+ "pdfjs-dist": "^4.2.67",
52
  "puppeteer": "^22.7.1",
53
  "puppeteer-extra": "^3.3.6",
54
  "puppeteer-extra-plugin-block-resources": "^2.4.3",
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -10,18 +10,18 @@ import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
10
  import _ from 'lodash';
11
  import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
12
  import { Request, Response } from 'express';
13
- import normalizeUrl from "@esm2cjs/normalize-url";
14
  import { AltTextService } from '../services/alt-text';
15
  import TurndownService from 'turndown';
16
- import { parseString as parseSetCookieString } from 'set-cookie-parser';
17
- import type { CookieParam } from 'puppeteer';
18
  import { Crawled } from '../db/crawled';
19
  import { cleanAttribute } from '../utils/misc';
20
  import { randomUUID } from 'crypto';
21
  import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
22
 
23
  import { countGPTToken as estimateToken } from '../shared/utils/openai';
 
24
  import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
 
25
 
26
  const md5Hasher = new HashManager('md5', 'hex');
27
 
@@ -69,6 +69,7 @@ export class CrawlerHost extends RPCHost {
69
  protected globalLogger: Logger,
70
  protected puppeteerControl: PuppeteerControl,
71
  protected altTextService: AltTextService,
 
72
  protected firebaseObjectStorage: FirebaseStorageBucketControl,
73
  protected rateLimitControl: RateLimitControl,
74
  protected threadLocal: AsyncContext,
@@ -76,7 +77,7 @@ export class CrawlerHost extends RPCHost {
76
  super(...arguments);
77
 
78
  puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ScrappingOptions & { url: URL; }) => {
79
- if (!snapshot.title?.trim()) {
80
  return;
81
  }
82
  if (options.cookies?.length) {
@@ -138,14 +139,14 @@ export class CrawlerHost extends RPCHost {
138
  });
139
  turnDownService.addRule('improved-inline-link', {
140
  filter: function (node, options) {
141
- return (
142
  options.linkStyle === 'inlined' &&
143
  node.nodeName === 'A' &&
144
  node.getAttribute('href')
145
  );
146
  },
147
 
148
- replacement: function (content, node) {
149
  let href = node.getAttribute('href');
150
  if (href) href = href.replace(/([()])/g, '\\$1');
151
  let title = cleanAttribute(node.getAttribute('title'));
@@ -226,6 +227,26 @@ export class CrawlerHost extends RPCHost {
226
  }
227
  } as FormattedPage;
228
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  if (mode === 'text') {
230
  return {
231
  ...this.getGeneralSnapshotMixins(snapshot),
@@ -236,101 +257,108 @@ export class CrawlerHost extends RPCHost {
236
  } as FormattedPage;
237
  }
238
 
239
- const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content;
240
- let turnDownService = mode === 'markdown' ? this.getTurndown() : this.getTurndown('without any rule');
241
- for (const plugin of this.turnDownPlugins) {
242
- turnDownService = turnDownService.use(plugin);
243
- }
244
- const urlToAltMap: { [k: string]: string | undefined; } = {};
245
- if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
246
- const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
247
- const r = await this.altTextService.getAltText(x).catch((err: any) => {
248
- this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
249
- return undefined;
250
- });
251
- if (r && x.src) {
252
- urlToAltMap[x.src.trim()] = r;
253
- }
254
- });
255
-
256
- await Promise.all(tasks);
257
- }
258
- let imgIdx = 0;
259
  const imageSummary = {} as { [k: string]: string; };
260
  const imageIdxTrack = new Map<string, number[]>();
261
- turnDownService.addRule('img-generated-alt', {
262
- filter: 'img',
263
- replacement: (_content, node) => {
264
- let linkPreferredSrc = (node.getAttribute('src') || '').trim();
265
- if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
266
- const dataSrc = (node.getAttribute('data-src') || '').trim();
267
- if (dataSrc && !dataSrc.startsWith('data:')) {
268
- linkPreferredSrc = dataSrc;
 
 
 
 
 
 
 
 
 
 
 
 
269
  }
270
- }
271
 
272
- let src;
273
- try {
274
- src = new URL(linkPreferredSrc, nominalUrl).toString();
275
- } catch (_err) {
276
- void 0;
277
- }
278
- const alt = cleanAttribute(node.getAttribute('alt'));
279
- if (!src) {
280
- return '';
281
- }
282
- const mapped = urlToAltMap[src];
283
- const imgSerial = ++imgIdx;
284
- const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
285
- idxArr.push(imgSerial);
286
- imageIdxTrack.set(src, idxArr);
287
 
288
- if (mapped) {
289
- imageSummary[src] = mapped || alt;
 
 
 
 
 
 
 
 
 
 
 
 
 
290
 
291
- return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
292
- }
293
 
294
- imageSummary[src] = alt || '';
 
295
 
296
- return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`;
297
- }
298
- });
299
 
300
- let contentText = '';
301
- if (toBeTurnedToMd) {
302
- try {
303
- contentText = turnDownService.turndown(toBeTurnedToMd).trim();
304
- } catch (err) {
305
- this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
306
- const vanillaTurnDownService = this.getTurndown();
307
  try {
308
- contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
309
- } catch (err2) {
310
- this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
 
 
 
 
 
 
311
  }
312
  }
313
- }
314
 
315
- if (
316
- !contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
317
- && toBeTurnedToMd !== snapshot.html
318
- ) {
319
- try {
320
- contentText = turnDownService.turndown(snapshot.html);
321
- } catch (err) {
322
- this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
323
- const vanillaTurnDownService = this.getTurndown();
324
  try {
325
- contentText = vanillaTurnDownService.turndown(snapshot.html);
326
- } catch (err2) {
327
- this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
 
 
 
 
 
 
328
  }
329
  }
330
- }
331
- if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) {
332
- contentText = snapshot.text;
333
- }
334
 
335
  const cleanText = (contentText || '').trim();
336
 
@@ -514,7 +542,8 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
514
  req: Request,
515
  res: Response,
516
  },
517
- auth: JinaEmbeddingsAuthDTO
 
518
  ) {
519
  const uid = await auth.solveUID();
520
  let chargeAmount = 0;
@@ -571,6 +600,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
571
  }
572
 
573
  let urlToCrawl;
 
574
  try {
575
  urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false }));
576
  } catch (err) {
@@ -586,58 +616,19 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
586
  });
587
  }
588
 
589
- const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
590
- const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
591
- const withLinksSummary = Boolean(ctx.req.get('x-with-links-summary'));
592
- const withImagesSummary = Boolean(ctx.req.get('x-with-images-summary'));
593
- const noCache = Boolean(ctx.req.get('x-no-cache'));
594
- let cacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000;
595
- if (isNaN(cacheTolerance)) {
596
- cacheTolerance = this.cacheValidMs;
597
- if (noCache) {
598
- cacheTolerance = 0;
599
- }
600
- }
601
- const targetSelector = ctx.req.get('x-target-selector') || undefined;
602
- const waitForSelector = ctx.req.get('x-wait-for-selector') || targetSelector;
603
- const cookies: CookieParam[] = [];
604
- const setCookieHeaders = ctx.req.headers['x-set-cookie'];
605
- if (Array.isArray(setCookieHeaders)) {
606
- for (const setCookie of setCookieHeaders) {
607
- cookies.push({
608
- ...parseSetCookieString(setCookie, { decodeValues: false }) as CookieParam,
609
- domain: urlToCrawl.hostname,
610
- });
611
- }
612
- } else if (setCookieHeaders) {
613
- cookies.push({
614
- ...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
615
- domain: urlToCrawl.hostname,
616
- });
617
- }
618
- this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
619
- this.threadLocal.set('withLinksSummary', withLinksSummary);
620
- this.threadLocal.set('withImagesSummary', withImagesSummary);
621
-
622
- const crawlOpts: ExtraScrappingOptions = {
623
- proxyUrl: ctx.req.get('x-proxy-url'),
624
- cookies,
625
- favorScreenshot: customMode === 'screenshot',
626
- waitForSelector,
627
- targetSelector,
628
- };
629
 
630
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
631
  const sseStream = new OutputServerEventStream();
632
  rpcReflect.return(sseStream);
633
 
634
  try {
635
- for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
636
  if (!scrapped) {
637
  continue;
638
  }
639
 
640
- const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
641
  chargeAmount = this.getChargeAmount(formatted);
642
  sseStream.write({
643
  event: 'data',
@@ -659,13 +650,13 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
659
 
660
  let lastScrapped;
661
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
662
- for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
663
  lastScrapped = scrapped;
664
- if (waitForSelector || !scrapped?.parsed?.content || !(scrapped.title?.trim())) {
665
  continue;
666
  }
667
 
668
- const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
669
  chargeAmount = this.getChargeAmount(formatted);
670
 
671
  return formatted;
@@ -675,21 +666,21 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
675
  throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
676
  }
677
 
678
- const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
679
  chargeAmount = this.getChargeAmount(formatted);
680
 
681
  return formatted;
682
  }
683
 
684
- for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
685
  lastScrapped = scrapped;
686
- if (waitForSelector || !scrapped?.parsed?.content || !(scrapped.title?.trim())) {
687
  continue;
688
  }
689
 
690
- const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
691
  chargeAmount = this.getChargeAmount(formatted);
692
- if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
693
 
694
  return assignTransferProtocolMeta(`${formatted}`,
695
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
@@ -703,9 +694,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
703
  throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
704
  }
705
 
706
- const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
707
  chargeAmount = this.getChargeAmount(formatted);
708
- if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
709
 
710
  return assignTransferProtocolMeta(`${formatted}`,
711
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
@@ -915,4 +906,55 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
915
  }
916
  }
917
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
918
  }
 
10
  import _ from 'lodash';
11
  import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
12
  import { Request, Response } from 'express';
13
+ const pNormalizeUrl = import("@esm2cjs/normalize-url");
14
  import { AltTextService } from '../services/alt-text';
15
  import TurndownService from 'turndown';
 
 
16
  import { Crawled } from '../db/crawled';
17
  import { cleanAttribute } from '../utils/misc';
18
  import { randomUUID } from 'crypto';
19
  import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
20
 
21
  import { countGPTToken as estimateToken } from '../shared/utils/openai';
22
+ import { CrawlerOptions } from '../dto/scrapping-options';
23
  import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
24
+ import { PDFExtractor } from '../services/pdf-extract';
25
 
26
  const md5Hasher = new HashManager('md5', 'hex');
27
 
 
69
  protected globalLogger: Logger,
70
  protected puppeteerControl: PuppeteerControl,
71
  protected altTextService: AltTextService,
72
+ protected pdfExtractor: PDFExtractor,
73
  protected firebaseObjectStorage: FirebaseStorageBucketControl,
74
  protected rateLimitControl: RateLimitControl,
75
  protected threadLocal: AsyncContext,
 
77
  super(...arguments);
78
 
79
  puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ScrappingOptions & { url: URL; }) => {
80
+ if (!snapshot.title?.trim() && !snapshot.pdfs?.length) {
81
  return;
82
  }
83
  if (options.cookies?.length) {
 
139
  });
140
  turnDownService.addRule('improved-inline-link', {
141
  filter: function (node, options) {
142
+ return Boolean(
143
  options.linkStyle === 'inlined' &&
144
  node.nodeName === 'A' &&
145
  node.getAttribute('href')
146
  );
147
  },
148
 
149
+ replacement: function (content, node: any) {
150
  let href = node.getAttribute('href');
151
  if (href) href = href.replace(/([()])/g, '\\$1');
152
  let title = cleanAttribute(node.getAttribute('title'));
 
227
  }
228
  } as FormattedPage;
229
  }
230
+
231
+ let pdfMode = false;
232
+ if (snapshot.pdfs?.length && !snapshot.title) {
233
+ const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0]);
234
+ if (pdf) {
235
+ pdfMode = true;
236
+ snapshot.title = pdf.meta?.Title;
237
+ snapshot.text = pdf.text || snapshot.text;
238
+ snapshot.parsed = {
239
+ content: pdf.content,
240
+ textContent: pdf.content,
241
+ length: pdf.content?.length,
242
+ byline: pdf.meta?.Author,
243
+ lang: pdf.meta?.Language || undefined,
244
+ title: pdf.meta?.Title,
245
+ publishedTime: this.pdfExtractor.parsePdfDate(pdf.meta?.ModDate || pdf.meta?.CreationDate)?.toISOString(),
246
+ };
247
+ }
248
+ }
249
+
250
  if (mode === 'text') {
251
  return {
252
  ...this.getGeneralSnapshotMixins(snapshot),
 
257
  } as FormattedPage;
258
  }
259
 
260
+ let contentText = '';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  const imageSummary = {} as { [k: string]: string; };
262
  const imageIdxTrack = new Map<string, number[]>();
263
+ do {
264
+ if (pdfMode) {
265
+ contentText = snapshot.parsed?.content || snapshot.text;
266
+ break;
267
+ }
268
+
269
+ const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content;
270
+ let turnDownService = mode === 'markdown' ? this.getTurndown() : this.getTurndown('without any rule');
271
+ for (const plugin of this.turnDownPlugins) {
272
+ turnDownService = turnDownService.use(plugin);
273
+ }
274
+ const urlToAltMap: { [k: string]: string | undefined; } = {};
275
+ if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
276
+ const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
277
+ const r = await this.altTextService.getAltText(x).catch((err: any) => {
278
+ this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
279
+ return undefined;
280
+ });
281
+ if (r && x.src) {
282
+ urlToAltMap[x.src.trim()] = r;
283
  }
284
+ });
285
 
286
+ await Promise.all(tasks);
287
+ }
288
+ let imgIdx = 0;
289
+ turnDownService.addRule('img-generated-alt', {
290
+ filter: 'img',
291
+ replacement: (_content, node: any) => {
292
+ let linkPreferredSrc = (node.getAttribute('src') || '').trim();
293
+ if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
294
+ const dataSrc = (node.getAttribute('data-src') || '').trim();
295
+ if (dataSrc && !dataSrc.startsWith('data:')) {
296
+ linkPreferredSrc = dataSrc;
297
+ }
298
+ }
 
 
299
 
300
+ let src;
301
+ try {
302
+ src = new URL(linkPreferredSrc, nominalUrl).toString();
303
+ } catch (_err) {
304
+ void 0;
305
+ }
306
+ const alt = cleanAttribute(node.getAttribute('alt'));
307
+ if (!src) {
308
+ return '';
309
+ }
310
+ const mapped = urlToAltMap[src];
311
+ const imgSerial = ++imgIdx;
312
+ const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
313
+ idxArr.push(imgSerial);
314
+ imageIdxTrack.set(src, idxArr);
315
 
316
+ if (mapped) {
317
+ imageSummary[src] = mapped || alt;
318
 
319
+ return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
320
+ }
321
 
322
+ imageSummary[src] = alt || '';
 
 
323
 
324
+ return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`;
325
+ }
326
+ });
327
+
328
+ if (toBeTurnedToMd) {
 
 
329
  try {
330
+ contentText = turnDownService.turndown(toBeTurnedToMd).trim();
331
+ } catch (err) {
332
+ this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
333
+ const vanillaTurnDownService = this.getTurndown();
334
+ try {
335
+ contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
336
+ } catch (err2) {
337
+ this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
338
+ }
339
  }
340
  }
 
341
 
342
+ if (
343
+ !contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
344
+ && toBeTurnedToMd !== snapshot.html
345
+ ) {
 
 
 
 
 
346
  try {
347
+ contentText = turnDownService.turndown(snapshot.html);
348
+ } catch (err) {
349
+ this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
350
+ const vanillaTurnDownService = this.getTurndown();
351
+ try {
352
+ contentText = vanillaTurnDownService.turndown(snapshot.html);
353
+ } catch (err2) {
354
+ this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
355
+ }
356
  }
357
  }
358
+ if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) {
359
+ contentText = snapshot.text;
360
+ }
361
+ } while (false);
362
 
363
  const cleanText = (contentText || '').trim();
364
 
 
542
  req: Request,
543
  res: Response,
544
  },
545
+ auth: JinaEmbeddingsAuthDTO,
546
+ crawlerOptions: CrawlerOptions,
547
  ) {
548
  const uid = await auth.solveUID();
549
  let chargeAmount = 0;
 
600
  }
601
 
602
  let urlToCrawl;
603
+ const normalizeUrl = (await pNormalizeUrl).default;
604
  try {
605
  urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false }));
606
  } catch (err) {
 
616
  });
617
  }
618
 
619
+ const crawlOpts = this.configure(crawlerOptions);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620
 
621
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
622
  const sseStream = new OutputServerEventStream();
623
  rpcReflect.return(sseStream);
624
 
625
  try {
626
+ for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
627
  if (!scrapped) {
628
  continue;
629
  }
630
 
631
+ const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, urlToCrawl);
632
  chargeAmount = this.getChargeAmount(formatted);
633
  sseStream.write({
634
  event: 'data',
 
650
 
651
  let lastScrapped;
652
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
653
+ for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
654
  lastScrapped = scrapped;
655
+ if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
656
  continue;
657
  }
658
 
659
+ const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, urlToCrawl);
660
  chargeAmount = this.getChargeAmount(formatted);
661
 
662
  return formatted;
 
666
  throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
667
  }
668
 
669
+ const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, urlToCrawl);
670
  chargeAmount = this.getChargeAmount(formatted);
671
 
672
  return formatted;
673
  }
674
 
675
+ for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
676
  lastScrapped = scrapped;
677
+ if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
678
  continue;
679
  }
680
 
681
+ const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, urlToCrawl);
682
  chargeAmount = this.getChargeAmount(formatted);
683
+ if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
684
 
685
  return assignTransferProtocolMeta(`${formatted}`,
686
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
 
694
  throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
695
  }
696
 
697
+ const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, urlToCrawl);
698
  chargeAmount = this.getChargeAmount(formatted);
699
+ if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
700
 
701
  return assignTransferProtocolMeta(`${formatted}`,
702
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
 
906
  }
907
  }
908
 
909
/**
 * Apply per-request crawler options.
 *
 * Stores the three summary/alt-text flags on the thread-local context and
 * derives the scrapping options handed to the scraper from `opts`.
 *
 * @param opts - crawler options for the current request
 * @returns the scrapping options built from `opts`
 */
configure(opts: CrawlerOptions): ExtraScrappingOptions {
    // Flags are written in the same order as they are declared here.
    const threadLocalFlags = ['withGeneratedAlt', 'withLinksSummary', 'withImagesSummary'] as const;
    for (const flag of threadLocalFlags) {
        this.threadLocal.set(flag, opts[flag]);
    }

    return {
        proxyUrl: opts.proxyUrl,
        cookies: opts.setCookies,
        // Screenshots are only favored when the caller asked for one.
        favorScreenshot: opts.respondWith === 'screenshot',
        waitForSelector: opts.waitForSelector,
        targetSelector: opts.targetSelector,
    };
}
925
+
926
/**
 * Crawl `url` and format the most recent snapshot in the given `mode`.
 *
 * Snapshots are consumed from `cachedScrap` (with `minIntervalMs: 500`).
 * Once a snapshot carrying parsed content has been seen, exactly one more
 * snapshot is awaited and iteration stops; the last snapshot received is
 * the one that gets formatted.
 *
 * @param mode - output mode forwarded to `formatSnapshot`
 * @param url - the URL to crawl
 * @param opts - optional scrapping options, spread into the `cachedScrap` call
 * @returns the formatted snapshot
 * @throws AssertionFailureError when iteration ends without any snapshot
 * @throws the scraping error when it fails before any snapshot was received
 */
async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
    const snapshots = this.cachedScrap(url, { ...opts, minIntervalMs: 500 });

    let latest;
    let sawContent = false;
    try {
        for await (const snapshot of snapshots) {
            latest = snapshot;

            // A previous snapshot already had content; this is the follow-up one.
            if (sawContent) {
                break;
            }

            sawContent = Boolean(snapshot?.parsed?.content);
        }
    } catch (err) {
        // Best effort: a failure is only fatal when nothing was captured at all.
        if (!latest) {
            throw err;
        }
    }

    if (!latest) {
        throw new AssertionFailureError(`No content available`);
    }

    return this.formatSnapshot(mode, latest, url);
}
959
+
960
  }
backend/functions/src/cloud-functions/data-crunching.ts ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import {
2
+ Defer,
3
+ PromiseThrottle,
4
+ RPCHost,
5
+ RPCReflection,
6
+ } from 'civkit';
7
+ import { singleton } from 'tsyringe';
8
+ import { CloudScheduleV2, CloudTaskV2, FirebaseStorageBucketControl, Logger, OutputServerEventStream, Param, RPCReflect, TempFileManager } from '../shared';
9
+ import _ from 'lodash';
10
+ import { CrawlerHost } from './crawler';
11
+
12
+ import { Crawled } from '../db/crawled';
13
+ import dayjs from 'dayjs';
14
+ import { createReadStream } from 'fs';
15
+ import { appendFile } from 'fs/promises';
16
+ import { createGzip } from 'zlib';
17
+ import { getFunctions } from 'firebase-admin/functions';
18
+ import { GoogleAuth } from 'google-auth-library';
19
+
20
+ dayjs.extend(require('dayjs/plugin/utc'));
21
+
22
/**
 * Get the URL of a given v2 cloud function.
 *
 * Queries the Cloud Functions `v2beta` REST API for the function resource
 * and returns its `serviceConfig.uri`. The request is issued through a
 * `GoogleAuth` client created with the `cloud-platform` scope.
 *
 * @param name - the function's name
 * @param location - the function's location
 * @param projectId - the project that owns the function
 * @returns the URL of the function
 * @throws Error when the API response carries no `serviceConfig.uri`
 */
async function getFunctionUrl(
    name: string,
    location = 'us-central1',
    projectId = 'reader-6b7dc',
): Promise<string> {
    const url = 'https://cloudfunctions.googleapis.com/v2beta/' +
        `projects/${projectId}/locations/${location}/functions/${name}`;
    const auth = new GoogleAuth({
        scopes: 'https://www.googleapis.com/auth/cloud-platform',
    });
    const client = await auth.getClient();
    // Only the field that is read is described; the rest of the payload is ignored.
    const res = await client.request<{ serviceConfig?: { uri?: string; }; }>({ url });
    const uri = res.data?.serviceConfig?.uri;
    if (!uri) {
        throw new Error(`Unable to retrieve uri for function at ${url}`);
    }

    return uri;
}
44
+
45
+ @singleton()
46
+ export class DataCrunchingHost extends RPCHost {
47
+ logger = this.globalLogger.child({ service: this.constructor.name });
48
+
49
+ pageCacheCrunchingPrefix = 'crunched-pages';
50
+ pageCacheCrunchingBatchSize = 5000;
51
+ pageCacheCrunchingTMinus = 6 * 24 * 60 * 60 * 1000;
52
+ rev = 7;
53
+
54
/**
 * Parameter properties expose the injected services as protected members.
 *
 * Every constructor argument except `crawler` is forwarded to the base
 * class constructor.
 * NOTE(review): presumably `crawler` is excluded so the base class does not
 * treat it as a dependency of this host — confirm against RPCHost.
 */
constructor(
    protected globalLogger: Logger,

    protected crawler: CrawlerHost,
    protected tempFileManager: TempFileManager,
    protected firebaseObjectStorage: FirebaseStorageBucketControl,
) {
    super(..._.without(arguments, crawler));
}
63
+
64
/**
 * Initialize the host: wait for `dependencyReady()` to settle, then emit
 * the `'ready'` event.
 */
override async init() {
    await this.dependencyReady();

    this.emit('ready');
}
69
+
70
/**
 * Cloud task worker: crunch page-cache records and upload each batch to
 * object storage as a gzip-compressed file.
 *
 * For every batch yielded by `iterPageCacheRecords(date, offset)` the
 * records are crunched into a file on disk, which is then streamed through
 * gzip into the bucket under the batch's `fileName`.
 *
 * @param date - the day to crunch, forwarded to `iterPageCacheRecords`
 * @param offset - record offset within that day (defaults to 0)
 * @returns `true` once all yielded batches have been uploaded
 */
@CloudTaskV2({
    runtime: {
        cpu: 2,
        memory: '4GiB',
        timeoutSeconds: 3600,
        concurrency: 2,
        maxInstances: 200,
        retryConfig: {
            maxAttempts: 3,
            minBackoffSeconds: 60,
        },
        rateLimits: {
            maxConcurrentDispatches: 150,
            maxDispatchesPerSecond: 2,
        },
    },
    tags: ['DataCrunching'],
})
async crunchPageCacheWorker(
    @Param('date') date: string,
    @Param('offset', { default: 0 }) offset: number
) {
    this.logger.info(`Crunching page cache @${date}+${offset}...`);
    for await (const { fileName, records } of this.iterPageCacheRecords(date, offset)) {
        this.logger.info(`Crunching ${fileName}...`);
        const fileOnDrive = await this.crunchCacheRecords(records);
        const fstream = createReadStream(fileOnDrive.path);
        const gzipStream = createGzip();
        // pipe() does not propagate source errors to its destination; without
        // this a failed read would leave the gzip stream open and the upload
        // below waiting indefinitely.
        fstream.once('error', (err) => gzipStream.destroy(err));
        fstream.pipe(gzipStream, { end: true });
        await this.firebaseObjectStorage.bucket.file(fileName).save(gzipStream, {
            contentType: 'application/jsonl+gzip',
        });
    }

    this.logger.info(`Crunching page cache @${date}+${offset} done.`);

    return true;
}
108
+
109
/**
 * Scheduled dispatcher (cron `2 0 * * *`, time zone UTC): enqueue one
 * `crunchPageCacheWorker` task per chunk yielded by `iterPageCacheChunks()`.
 *
 * Progress is reported on a server-sent event stream that is handed to
 * `rpcReflect` as the return value; on failure the stream is ended with
 * the error message.
 *
 * @param rpcReflect - RPC reflection used to return the stream and hook errors
 * @returns `true` once every chunk has been enqueued
 */
@CloudScheduleV2('2 0 * * *', {
    name: 'crunchPageCacheEveryday',
    runtime: {
        cpu: 2,
        memory: '4GiB',
        timeoutSeconds: 1800,
        timeZone: 'UTC',
        retryCount: 3,
        minBackoffSeconds: 60,
    },
    tags: ['DataCrunching'],
})
// @CloudHTTPv2({
//     runtime: {
//         cpu: 2,
//         memory: '4GiB',
//         timeoutSeconds: 3600,
//         concurrency: 2,
//         maxInstances: 200,
//     },
//     tags: ['DataCrunching'],
// })
async dispatchPageCacheCrunching(
    @RPCReflect() rpcReflect: RPCReflection,
) {
    const sse = new OutputServerEventStream({ highWaterMark: 4096 });
    rpcReflect.return(sse);
    rpcReflect.catch((err) => {
        sse.end({ data: `Error: ${err.message}` });
    });

    // The worker URL is the same for every chunk, so it is looked up at most
    // once (and not at all when there is nothing to dispatch).
    let workerUri: string | undefined;
    for await (const { fileName, date, offset } of this.iterPageCacheChunks()) {
        this.logger.info(`Dispatching ${fileName}...`);
        sse.write({ data: `Dispatching ${fileName}...` });

        if (!workerUri) {
            workerUri = await getFunctionUrl('crunchPageCacheWorker');
        }

        await getFunctions().taskQueue('crunchPageCacheWorker').enqueue({ date, offset }, {
            dispatchDeadlineSeconds: 1800,
            uri: workerUri,
        });
    }

    sse.end({ data: 'done' });
    sse.resume();

    return true;
}
154
+
155
/**
 * Yields `{ fileName, records }` batches of crawled records that have not been
 * crunched yet (no file at the target path).
 *
 * Without `date`, iteration starts `pageCacheCrunchingTMinus` ms in the past and moves
 * forward day by day until the start of the current UTC day. With `date`, only that
 * day is visited and iteration stops at its first empty batch.
 *
 * After a batch is yielded, iteration stops if that batch's offset was non-zero.
 * NOTE(review): a call with `date` and offset 0 therefore keeps going through the
 * remaining batches of that day — confirm this asymmetry is intended.
 *
 * @param date        Optional day to process (anything `dayjs()` accepts).
 * @param inputOffset Optional starting record offset; parsed as a base-10 integer.
 */
async* iterPageCacheRecords(date?: string, inputOffset?: number | string) {
    const todayStart = dayjs().utc().startOf('day');
    let cursorDay = date
        ? dayjs(date).utc().startOf('day')
        : dayjs().utc().subtract(this.pageCacheCrunchingTMinus, 'ms').startOf('day');
    let nextOffset = inputOffset ? parseInt(inputOffset as string, 10) : 0;

    while (cursorDay.isBefore(todayStart)) {
        const offset = nextOffset;
        const fileName = `${this.pageCacheCrunchingPrefix}/r${this.rev}/${cursorDay.format('YYYY-MM-DD')}/${offset}.jsonl.gz`;
        nextOffset += this.pageCacheCrunchingBatchSize;

        // A file at the target path means this batch was crunched already.
        const [alreadyCrunched] = await this.firebaseObjectStorage.bucket.file(fileName).exists();
        if (alreadyCrunched) {
            continue;
        }

        const batchQuery = Crawled.COLLECTION
            .where('createdAt', '>=', cursorDay.toDate())
            .where('createdAt', '<', cursorDay.add(1, 'day').toDate())
            .orderBy('createdAt', 'asc')
            .offset(offset)
            .limit(this.pageCacheCrunchingBatchSize);
        const records = await Crawled.fromFirestoreQuery(batchQuery);

        this.logger.info(`Found ${records.length} records for ${cursorDay.format('YYYY-MM-DD')} at offset ${offset}`, { fileName, counter: nextOffset });

        if (records.length) {
            yield { fileName, records };

            if (offset) {
                break;
            }
            continue;
        }

        // Empty batch: the day is exhausted.
        if (date) {
            break;
        }
        cursorDay = cursorDay.add(1, 'day');
        nextOffset = 0;
    }
}
204
+
205
/**
 * Yields one `{ fileName, date, offset }` descriptor per page-cache chunk that still
 * has to be crunched, from `pageCacheCrunchingTMinus` ms ago up to (excluding) the
 * current UTC day.
 *
 * Chunks whose target file already exists are skipped. A chunk holding fewer records
 * than `pageCacheCrunchingBatchSize` is the last one of its day.
 */
async* iterPageCacheChunks() {
    const startOfToday = dayjs().utc().startOf('day');
    let theDay = dayjs().utc().subtract(this.pageCacheCrunchingTMinus, 'ms').startOf('day');

    let counter = 0;

    while (theDay.isBefore(startOfToday)) {
        const fileName = `${this.pageCacheCrunchingPrefix}/r${this.rev}/${theDay.format('YYYY-MM-DD')}/${counter}.jsonl.gz`;
        const offset = counter;
        counter += this.pageCacheCrunchingBatchSize;
        const fileExists = (await this.firebaseObjectStorage.bucket.file(fileName).exists())[0];
        if (fileExists) {
            continue;
        }

        const nRecords = (await Crawled.COLLECTION
            .where('createdAt', '>=', theDay.toDate())
            .where('createdAt', '<', theDay.add(1, 'day').toDate())
            .orderBy('createdAt', 'asc')
            .offset(offset)
            .limit(this.pageCacheCrunchingBatchSize)
            .count().get()).data().count;

        this.logger.info(`Found ${nRecords} records for ${theDay.format('YYYY-MM-DD')} at offset ${offset}`, { fileName, counter });

        // Capture the chunk's own day BEFORE the cursor moves on. Reading `theDay` after
        // the advance below would pair the next day's date with this day's offset and
        // file name for the last chunk of every day.
        const chunkDate = theDay.toISOString();

        if (nRecords < this.pageCacheCrunchingBatchSize) {
            // Short (or empty) chunk: this day is exhausted.
            theDay = theDay.add(1, 'day');
            counter = 0;
        }
        if (nRecords) {
            yield { fileName, date: chunkDate, offset };
        }
    }
}
241
+
242
/**
 * Downloads the stored snapshot of every record, formats it and appends it as one
 * JSON line to a freshly allocated temp file.
 *
 * Downloads are gated by a `PromiseThrottle(30)`; appends to the temp file are
 * serialised so lines from different records cannot interleave. A record whose
 * snapshot cannot be downloaded, parsed, formatted or written is logged and skipped.
 *
 * @param records Crawled records whose snapshots live under `snapshots/<_id>`.
 * @returns An object carrying the temp file `path`; the path is bound to that object
 *          through `tempFileManager.bindPathTo`.
 */
async crunchCacheRecords(records: Crawled[]) {
    const throttle = new PromiseThrottle(30);
    const localFilePath = this.tempFileManager.alloc();
    // Turnstile for file appends: each writer waits for the previous writer's deferred
    // and resolves its own when done.
    let lastWriteDone = Defer();
    lastWriteDone.resolve();

    for (const record of records) {
        await throttle.acquire();
        this.firebaseObjectStorage.downloadFile(`snapshots/${record._id}`)
            .then(async (snapshotTxt) => {
                try {
                    const snapshot = JSON.parse(snapshotTxt.toString('utf-8'));

                    let formatted = await this.crawler.formatSnapshot('default', snapshot);
                    if (!formatted.content) {
                        formatted = await this.crawler.formatSnapshot('markdown', snapshot);
                    }

                    const line = JSON.stringify({
                        url: snapshot.href,
                        title: snapshot.title || '',
                        html: snapshot.html || '',
                        text: snapshot.text || '',
                        content: formatted.content || '',
                    }) + '\n';

                    // Take a place in the write queue synchronously (no await between
                    // reading and replacing `lastWriteDone`).
                    const previousWriteDone = lastWriteDone;
                    const thisWriteDone = Defer();
                    lastWriteDone = thisWriteDone;

                    await previousWriteDone.promise;
                    try {
                        await appendFile(localFilePath, line, { encoding: 'utf-8' });
                    } finally {
                        // Always let the next writer through, even if this append failed.
                        thisWriteDone.resolve();
                    }

                } catch (err) {
                    this.logger.warn(`Failed to parse snapshot for ${record._id}`, { err });
                }
            })
            .catch((err) => {
                // Only download failures reach this point; they used to surface as
                // unhandled promise rejections.
                this.logger.warn(`Failed to download snapshot for ${record._id}`, { err });
            })
            .finally(() => {
                throttle.release();
            });
    }

    await throttle.nextDrain();

    const ro = {
        path: localFilePath
    };

    this.tempFileManager.bindPathTo(ro, localFilePath);

    return ro;
}
289
+ }
backend/functions/src/cloud-functions/searcher.ts CHANGED
@@ -19,6 +19,7 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
19
  import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
20
  import { SearchResult } from '../db/searched';
21
  import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
 
22
 
23
 
24
  @singleton()
@@ -145,7 +146,8 @@ export class SearcherHost extends RPCHost {
145
  req: Request,
146
  res: Response,
147
  },
148
- auth: JinaEmbeddingsAuthDTO
 
149
  ) {
150
  const uid = await auth.solveUID();
151
  let chargeAmount = 0;
@@ -201,18 +203,7 @@ export class SearcherHost extends RPCHost {
201
  });
202
  }
203
 
204
- const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
205
- const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
206
- const withLinksSummary = Boolean(ctx.req.get('x-with-links-summary'));
207
- const withImagesSummary = Boolean(ctx.req.get('x-with-images-summary'));
208
- const noCache = Boolean(ctx.req.get('x-no-cache'));
209
- let pageCacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000;
210
- if (isNaN(pageCacheTolerance)) {
211
- pageCacheTolerance = this.pageCacheToleranceMs;
212
- if (noCache) {
213
- pageCacheTolerance = 0;
214
- }
215
- }
216
  const cookies: CookieParam[] = [];
217
  const setCookieHeaders = ctx.req.headers['x-set-cookie'];
218
  if (Array.isArray(setCookieHeaders)) {
@@ -226,27 +217,19 @@ export class SearcherHost extends RPCHost {
226
  ...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
227
  });
228
  }
229
- this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
230
- this.threadLocal.set('withLinksSummary', withLinksSummary);
231
- this.threadLocal.set('withImagesSummary', withImagesSummary);
232
-
233
- const crawlOpts: ScrappingOptions = {
234
- proxyUrl: ctx.req.get('x-proxy-url'),
235
- cookies,
236
- favorScreenshot: customMode === 'screenshot'
237
- };
238
-
239
  const searchQuery = noSlashPath;
240
  const r = await this.cachedWebSearch({
241
  q: searchQuery,
242
  count: 10
243
- }, noCache);
244
 
245
  if (!r.web?.results.length) {
246
  throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
247
  }
248
 
249
- const it = this.fetchSearchResults(customMode, r.web?.results, crawlOpts, pageCacheTolerance);
 
 
250
 
251
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
252
  const sseStream = new OutputServerEventStream();
 
19
  import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
20
  import { SearchResult } from '../db/searched';
21
  import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
22
+ import { CrawlerOptions } from '../dto/scrapping-options';
23
 
24
 
25
  @singleton()
 
146
  req: Request,
147
  res: Response,
148
  },
149
+ auth: JinaEmbeddingsAuthDTO,
150
+ crawlerOptions: CrawlerOptions,
151
  ) {
152
  const uid = await auth.solveUID();
153
  let chargeAmount = 0;
 
203
  });
204
  }
205
 
206
+ const crawlOpts = this.crawler.configure(crawlerOptions);
 
 
 
 
 
 
 
 
 
 
 
207
  const cookies: CookieParam[] = [];
208
  const setCookieHeaders = ctx.req.headers['x-set-cookie'];
209
  if (Array.isArray(setCookieHeaders)) {
 
217
  ...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
218
  });
219
  }
 
 
 
 
 
 
 
 
 
 
220
  const searchQuery = noSlashPath;
221
  const r = await this.cachedWebSearch({
222
  q: searchQuery,
223
  count: 10
224
+ }, crawlerOptions.noCache);
225
 
226
  if (!r.web?.results.length) {
227
  throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
228
  }
229
 
230
+ const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
231
+ crawlerOptions.cacheTolerance || this.pageCacheToleranceMs
232
+ );
233
 
234
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
235
  const sseStream = new OutputServerEventStream();
backend/functions/src/db/pdf.ts ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Also, Prop, parseJSONText } from 'civkit';
2
+ import { FirestoreRecord } from '../shared/lib/firestore';
3
+ import _ from 'lodash';
4
+
5
/**
 * Firestore record (collection `pdfs`) holding what was extracted from one PDF.
 *
 * Fields listed in `patchedFields` are written to Firestore as JSON text and parsed
 * back into objects when a record is built through `from()`.
 */
@Also({
    dictOf: Object
})
export class PDFContent extends FirestoreRecord {
    static override collectionName = 'pdfs';

    override _id!: string;

    @Prop({ required: true })
    src!: string;

    @Prop({ required: true })
    urlDigest!: string;

    @Prop()
    meta?: { [k: string]: any; };

    @Prop()
    text?: string;

    @Prop()
    content?: string;

    @Prop()
    createdAt!: Date;

    @Prop()
    expireAt?: Date;

    /** Names of the fields that are stored as JSON strings. */
    static patchedFields = [
        'meta'
    ];

    /**
     * Builds a record from raw input. Patched fields that arrive as strings are
     * JSON-parsed first; note that this is done in place on `input`.
     */
    static override from(input: any) {
        this.patchedFields.forEach((key) => {
            const raw = input[key];
            if (typeof raw === 'string') {
                input[key] = parseJSONText(raw);
            }
        });

        return super.from(input) as PDFContent;
    }

    /**
     * Returns a shallow copy of this record in which every patched field of type
     * `object` has been replaced by its JSON text.
     */
    override degradeForFireStore() {
        const { patchedFields } = this.constructor as typeof PDFContent;
        const degraded: any = { ...this };

        for (const key of patchedFields) {
            const value = degraded[key];
            if (typeof value !== 'object') {
                continue;
            }
            degraded[key] = JSON.stringify(value) as any;
        }

        return degraded;
    }

    [k: string]: any;
}
backend/functions/src/dto/scrapping-options.ts ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
2
+ import type { Request, Response } from 'express';
3
+ import type { CookieParam } from 'puppeteer';
4
+ import { parseString as parseSetCookieString } from 'set-cookie-parser';
5
+
6
/**
 * Crawl options for one request, assembled from the RPC input and from the `x-*`
 * request headers when a request is attached to the input.
 */
export class CrawlerOptions extends AutoCastable {

    @Prop({
        default: 'default',
    })
    respondWith!: string;

    @Prop({
        default: false,
    })
    withGeneratedAlt!: boolean;

    @Prop({
        default: false,
    })
    withLinksSummary!: boolean;

    @Prop({
        default: false,
    })
    withImagesSummary!: boolean;

    @Prop({
        default: false,
    })
    noCache!: boolean;

    @Prop()
    cacheTolerance?: number;

    @Prop()
    targetSelector?: string;

    @Prop()
    waitForSelector?: string;

    // Supplied as `Set-Cookie`-style strings; `from()` replaces them with parsed cookies.
    @Prop({
        arrayOf: String,
    })
    setCookies?: CookieParam[];

    @Prop()
    proxyUrl?: string;

    /**
     * Casts `input` and then applies the request headers found in the RPC call
     * environment attached to it, if any.
     *
     * - `x-respond-with` / `x-return-format`, the three `x-with-*` flags, `x-no-cache`
     *   and a numeric `x-cache-tolerance` override the corresponding input value.
     *   Flags go through `Boolean()`, so any non-empty header value (including
     *   `"false"`) turns them on.
     * - `x-target-selector`, `x-wait-for-selector` and `x-proxy-url` only fill values
     *   the input left unset; `waitForSelector` falls back to `targetSelector`.
     * - Cookies come from `x-set-cookie`, or else from the input's `setCookies`
     *   strings, and are parsed into cookie objects.
     */
    static override from(input: any) {
        const instance = super.from(input) as CrawlerOptions;
        const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
            req: Request,
            res: Response,
        } | undefined;
        // No request attached (e.g. built outside an RPC call): every header reads as
        // undefined and only the input values apply.
        const req = ctx?.req;

        const customMode = req?.get('x-respond-with') || req?.get('x-return-format');
        if (customMode !== undefined) {
            instance.respondWith = customMode;
        }

        const withGeneratedAlt = req?.get('x-with-generated-alt');
        if (withGeneratedAlt !== undefined) {
            instance.withGeneratedAlt = Boolean(withGeneratedAlt);
        }
        const withLinksSummary = req?.get('x-with-links-summary');
        if (withLinksSummary !== undefined) {
            instance.withLinksSummary = Boolean(withLinksSummary);
        }
        const withImagesSummary = req?.get('x-with-images-summary');
        if (withImagesSummary !== undefined) {
            instance.withImagesSummary = Boolean(withImagesSummary);
        }
        const noCache = req?.get('x-no-cache');
        if (noCache !== undefined) {
            instance.noCache = Boolean(noCache);
            if (instance.noCache && instance.cacheTolerance === undefined) {
                instance.cacheTolerance = 0;
            }
        }
        const cacheTolerance = parseInt(req?.get('x-cache-tolerance') || '');
        if (!isNaN(cacheTolerance)) {
            instance.cacheTolerance = cacheTolerance;
        }

        const targetSelector = req?.get('x-target-selector');
        instance.targetSelector ??= targetSelector;
        const waitForSelector = req?.get('x-wait-for-selector');
        instance.waitForSelector ??= waitForSelector || instance.targetSelector;

        const cookies: CookieParam[] = [];
        const setCookieHeaders = req?.headers['x-set-cookie'] || (instance.setCookies as any as string[]);
        if (Array.isArray(setCookieHeaders)) {
            for (const setCookie of setCookieHeaders) {
                cookies.push({
                    ...parseSetCookieString(setCookie, { decodeValues: false }) as CookieParam,
                });
            }
        } else if (setCookieHeaders && typeof setCookieHeaders === 'string') {
            cookies.push({
                ...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
            });
        }
        // Store the parsed cookies. They used to be computed and then dropped, which
        // left raw strings in a field typed as CookieParam[].
        if (cookies.length) {
            instance.setCookies = cookies;
        }

        const proxyUrl = req?.get('x-proxy-url');
        instance.proxyUrl ??= proxyUrl;

        return instance;
    }
}
backend/functions/src/services/alt-text.ts CHANGED
@@ -6,7 +6,6 @@ import { ImageInterrogationManager } from '../shared/services/common-iminterroga
6
  import { ImgBrief } from './puppeteer';
7
  import { ImgAlt } from '../db/img-alt';
8
 
9
-
10
  const md5Hasher = new HashManager('md5', 'hex');
11
 
12
  @singleton()
 
6
  import { ImgBrief } from './puppeteer';
7
  import { ImgAlt } from '../db/img-alt';
8
 
 
9
  const md5Hasher = new HashManager('md5', 'hex');
10
 
11
  @singleton()
backend/functions/src/services/pdf-extract.ts ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import 'core-js/actual/promise/with-resolvers';
2
+ import { singleton } from 'tsyringe';
3
+ import _ from 'lodash';
4
+ import { TextItem } from 'pdfjs-dist/types/src/display/api';
5
+ import { AsyncService, HashManager } from 'civkit';
6
+ import { Logger } from '../shared/services/logger';
7
+ import { PDFContent } from '../db/pdf';
8
+ import dayjs from 'dayjs';
9
+ const utc = require('dayjs/plugin/utc'); // Import the UTC plugin
10
+ dayjs.extend(utc); // Extend dayjs with the UTC plugin
11
+ const timezone = require('dayjs/plugin/timezone');
12
+ dayjs.extend(timezone);
13
+
14
+ const pPdfjs = import('pdfjs-dist');
15
+
16
+
17
+ const md5Hasher = new HashManager('md5', 'hex');
18
+
19
/**
 * Population standard deviation (divides by `n`, not `n - 1`).
 *
 * @param numbers Samples.
 * @returns The standard deviation, or 0 for an empty list (an empty list used to
 *          produce NaN).
 */
function stdDev(numbers: number[]) {
    const n = numbers.length;
    if (!n) {
        return 0;
    }

    let total = 0;
    for (const num of numbers) {
        total += num;
    }
    const mean = total / n;

    let squaredDiffTotal = 0;
    for (const num of numbers) {
        squaredDiffTotal += Math.pow(num - mean, 2);
    }

    return Math.sqrt(squaredDiffTotal / n);
}
25
+
26
/**
 * Tells whether a 2D affine transform `[a, b, c, d, e, f]` turns its content by 35
 * degrees or more.
 *
 * Two angles are measured, `atan2(b, a)` and `atan2(-c, d)`; the transform counts as
 * rotated when the absolute value of either reaches 35 degrees. The translation part
 * (`e`, `f`) is ignored.
 */
function isRotatedByAtLeast35Degrees(transform: [number, number, number, number, number, number]): boolean {
    const toDegrees = (radians: number) => radians * (180 / Math.PI);

    const angleFromAB = Math.abs(toDegrees(Math.atan2(transform[1], transform[0])));
    if (angleFromAB >= 35) {
        return true;
    }

    const angleFromCD = Math.abs(toDegrees(Math.atan2(-transform[2], transform[3])));

    return angleFromCD >= 35;
}
40
+
41
/**
 * Extracts text and a Markdown approximation from PDF documents with pdf.js, and
 * caches the result in the `pdfs` Firestore collection.
 */
@singleton()
export class PDFExtractor extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });
    pdfjs!: Awaited<typeof pPdfjs>;

    constructor(
        protected globalLogger: Logger,
    ) {
        super(...arguments);
    }

    /** Waits for dependencies and for the dynamic `pdfjs-dist` import, then emits `ready`. */
    override async init() {
        await this.dependencyReady();
        this.pdfjs = await pPdfjs;

        this.emit('ready');
    }

    /**
     * Loads the PDF at `url` and turns its text items into Markdown.
     *
     * Each text item is classified by its height: well above the document average
     * (more than 3 / 2 standard deviations) becomes `h1` / `h2`; more than one
     * standard deviation below the page average, or rotated by 35 degrees or more,
     * becomes `appendix` (rendered as a block quote); items without height are `space`.
     *
     * @param url Location of the PDF, passed to pdf.js `getDocument`.
     * @returns `meta` (the document info dictionary), `content` (Markdown) and `text`
     *          (raw text, with a newline after items flagged `hasEOL`).
     */
    async extract(url: string | URL) {
        const loadingTask = this.pdfjs.getDocument({
            url,
            disableFontFace: true,
            verbosity: 0
        });

        let metaInfo: Record<string, any> = {};
        const textItems: TextItem[][] = [];

        try {
            const doc = await loadingTask.promise;
            const meta = await doc.getMetadata();
            metaInfo = meta.info as Record<string, any>;

            for (const pg of _.range(0, doc.numPages)) {
                // pdf.js page numbers are 1-based.
                const page = await doc.getPage(pg + 1);
                const textContent = await page.getTextContent();
                textItems.push((textContent.items as TextItem[]));
            }
        } finally {
            // Release the document and its worker resources whether or not loading
            // succeeded; everything needed below has been copied out already.
            await loadingTask.destroy().catch((err: unknown) => {
                this.logger.warn(`Failed to destroy pdf loading task for ${url}`, { err });
            });
        }

        // Height statistics are weighted per character, not per text item.
        const articleCharHeights: number[] = [];
        for (const textItem of textItems.flat()) {
            if (textItem.height) {
                articleCharHeights.push(...Array(textItem.str.length).fill(textItem.height));
            }
        }
        const articleAvgHeight = _.mean(articleCharHeights);
        const articleStdDevHeight = stdDev(articleCharHeights);
        // const articleMedianHeight = articleCharHeights.sort()[Math.floor(articleCharHeights.length / 2)];
        const mdOps: Array<{
            text: string;
            op?: 'new' | 'append';
            mode: 'h1' | 'h2' | 'p' | 'appendix' | 'space';
        }> = [];

        const rawChunks: string[] = [];

        // Pass 1: classify every text item and record whether it starts a new block.
        let op: 'append' | 'new' = 'new';
        let mode: 'h1' | 'h2' | 'p' | 'space' | 'appendix' = 'p';
        for (const pageTextItems of textItems) {
            const charHeights = [];
            for (const textItem of pageTextItems as TextItem[]) {
                if (textItem.height) {
                    charHeights.push(...Array(textItem.str.length).fill(textItem.height));
                }
                rawChunks.push(`${textItem.str}${textItem.hasEOL ? '\n' : ''}`);
            }

            const avgHeight = _.mean(charHeights);
            const stdDevHeight = stdDev(charHeights);
            // const medianHeight = charHeights.sort()[Math.floor(charHeights.length / 2)];

            for (const textItem of pageTextItems) {
                if (textItem.height > articleAvgHeight + 3 * articleStdDevHeight) {
                    mode = 'h1';
                } else if (textItem.height > articleAvgHeight + 2 * articleStdDevHeight) {
                    mode = 'h2';
                } else if (textItem.height && textItem.height < avgHeight - stdDevHeight) {
                    mode = 'appendix';
                } else if (textItem.height) {
                    mode = 'p';
                } else {
                    mode = 'space';
                }

                if (isRotatedByAtLeast35Degrees(textItem.transform as any)) {
                    mode = 'appendix';
                }

                mdOps.push({
                    op,
                    mode,
                    text: textItem.str
                });

                // An empty item flagged as end-of-line makes the NEXT item start a new block.
                if (textItem.hasEOL && !textItem.str) {
                    op = 'new';
                } else {
                    op = 'append';
                }
            }
        }

        // Pass 2: render the classified items. Appendix text is buffered separately and
        // emitted as a block quote right before the next non-appendix block starts.
        // NOTE(review): appendix text still buffered when the loop ends is not emitted —
        // confirm whether trailing small-print should be dropped.
        const mdChunks: string[] = [];
        const appendixChunks: string[] = [];
        mode = 'space';
        for (const x of mdOps) {
            const previousMode: string = mode;
            const changeToMdChunks: string[] = [];

            const isNewStart = x.mode !== 'space' && (x.op === 'new' || (previousMode === 'appendix' && x.mode !== previousMode));

            if (isNewStart) {
                switch (x.mode) {
                    case 'h1': {
                        changeToMdChunks.push(`\n\n# `);
                        mode = x.mode;
                        break;
                    }

                    case 'h2': {
                        changeToMdChunks.push(`\n\n## `);
                        mode = x.mode;
                        break;
                    }

                    case 'p': {
                        changeToMdChunks.push(`\n\n`);
                        mode = x.mode;
                        break;
                    }

                    case 'appendix': {
                        mode = x.mode;
                        appendixChunks.push(`\n\n`);
                        break;
                    }

                    default: {
                        break;
                    }
                }
            } else {
                // Continuation: add a separating space unless the previous chunk already
                // ends in whitespace or a hyphen, or is a single character.
                if (x.mode === 'appendix' && appendixChunks.length) {
                    const lastChunk = appendixChunks[appendixChunks.length - 1];
                    if (!lastChunk.match(/(\s+|-)$/) && lastChunk.length !== 1) {
                        appendixChunks.push(' ');
                    }
                } else if (mdChunks.length) {
                    const lastChunk = mdChunks[mdChunks.length - 1];
                    if (!lastChunk.match(/(\s+|-)$/) && lastChunk.length !== 1) {
                        changeToMdChunks.push(' ');
                    }
                }
            }

            if (x.text) {
                if (x.mode == 'appendix') {
                    if (appendixChunks.length || isNewStart) {
                        appendixChunks.push(x.text);
                    } else {
                        changeToMdChunks.push(x.text);
                    }
                } else {
                    changeToMdChunks.push(x.text);
                }
            }

            if (isNewStart && x.mode !== 'appendix' && appendixChunks.length) {
                // Flush the buffered appendix as a block quote ahead of the new block.
                const appendix = appendixChunks.join('').split(/\r?\n/).map((x) => x.trim()).filter(Boolean).map((x) => `> ${x}`).join('\n');
                changeToMdChunks.unshift(appendix);
                changeToMdChunks.unshift(`\n\n`);
                appendixChunks.length = 0;
            }

            if (x.mode === 'space' && changeToMdChunks.length) {
                // A spacing item contributes at most one chunk.
                changeToMdChunks.length = 1;
            }
            if (changeToMdChunks.length) {
                mdChunks.push(...changeToMdChunks);
            }
        }

        if (mdChunks.length) {
            mdChunks[0] = mdChunks[0].trimStart();
        }

        return { meta: metaInfo, content: mdChunks.join(''), text: rawChunks.join('') };
    }

    /**
     * Like `extract`, but backed by Firestore: the document id is the base64url form
     * of the MD5 digest of the URL.
     *
     * A failed extraction is logged and stored as an empty record with `expireAt` set
     * 24 hours ahead. This method does not check `expireAt` when reading; whatever
     * removes expired records is outside this file.
     *
     * @returns The cached or freshly extracted result; `undefined` when `url` is empty
     *          or a fresh extraction failed.
     */
    async cachedExtract(url: string | URL) {
        if (!url) {
            return undefined;
        }

        const digest = md5Hasher.hash(url.toString());
        const shortDigest = Buffer.from(digest, 'hex').toString('base64url');

        const existing = await PDFContent.fromFirestore(shortDigest);

        if (existing) {
            return {
                meta: existing.meta,
                content: existing.content,
                text: existing.text
            };
        }

        let extracted;

        try {
            extracted = await this.extract(url);
        } catch (err) {
            this.logger.warn(`Unable to extract from pdf ${url}`, { err });
        }

        // Don't try again until the next day
        const expireMixin = extracted ? {} : { expireAt: new Date(Date.now() + 1000 * 3600 * 24) };

        await PDFContent.COLLECTION.doc(shortDigest).set(
            {
                _id: shortDigest,
                src: url.toString(),
                meta: extracted?.meta || {},
                content: extracted?.content || '',
                text: extracted?.text || '',
                urlDigest: digest,
                createdAt: new Date(),
                ...expireMixin
            }, { merge: true }
        );

        return extracted;
    }

    /**
     * Parses a PDF date string, `D:YYYYMMDDHHmmSSOHH'mm'`, where everything after the
     * year is optional and `O` is `+`, `-` or `Z`.
     *
     * The string is parsed by hand: dayjs ignores a format argument unless its
     * CustomParseFormat plugin is loaded, which this file does not do, so dates
     * carrying a time-zone suffix used to come out invalid. Missing fields default to
     * their lowest value and a missing time zone is read as UTC.
     *
     * @param pdfDate Raw value, e.g. `D:20240516093412+08'00'`; the `D:` prefix is optional.
     * @returns The corresponding `Date`, or `undefined` when the value is empty,
     *          malformed or out of range.
     */
    parsePdfDate(pdfDate: string | undefined) {
        if (!pdfDate) {
            return undefined;
        }

        const matched = /^(?:D:)?(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?(?:(Z)|([+-])(\d{2})'?(\d{2})?'?)?/i.exec(pdfDate.trim());
        if (!matched) {
            return undefined;
        }

        const [
            , year, month = '01', day = '01', hour = '00', minute = '00', second = '00',
            , tzSign, tzHour = '00', tzMinute = '00',
        ] = matched;

        const [y, mo, d, h, mi, s] = [year, month, day, hour, minute, second].map((part) => parseInt(part, 10));
        if (mo < 1 || mo > 12 || d < 1 || d > 31 || h > 23 || mi > 59 || s > 59) {
            return undefined;
        }

        // Local time = UTC + offset, hence UTC = local time - offset.
        const offsetMinutes = tzSign
            ? (tzSign === '-' ? -1 : 1) * (parseInt(tzHour, 10) * 60 + parseInt(tzMinute, 10))
            : 0;

        const date = new Date(Date.UTC(y, mo - 1, d, h, mi, s) - offsetMinutes * 60_000);

        if (isNaN(date.valueOf())) {
            return undefined;
        }

        return date;
    }
}
backend/functions/src/services/puppeteer.ts CHANGED
@@ -50,6 +50,7 @@ export interface PageSnapshot {
50
  parsed?: Partial<ReadabilityParsed> | null;
51
  screenshot?: Buffer;
52
  imgs?: ImgBrief[];
 
53
  }
54
 
55
  export interface ExtendedSnapshot extends PageSnapshot {
@@ -62,6 +63,7 @@ export interface ScrappingOptions {
62
  cookies?: CookieParam[];
63
  favorScreenshot?: boolean;
64
  waitForSelector?: string;
 
65
  }
66
 
67
 
@@ -97,7 +99,9 @@ export class PuppeteerControl extends AsyncService {
97
  livePages = new Set<Page>();
98
  lastPageCratedAt: number = 0;
99
 
100
- constructor(protected globalLogger: Logger) {
 
 
101
  super(...arguments);
102
  this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95;
103
 
@@ -219,7 +223,17 @@ function briefImgs(elem) {
219
  };
220
  });
221
  }
222
- function giveSnapshot() {
 
 
 
 
 
 
 
 
 
 
223
  let parsed;
224
  try {
225
  parsed = new Readability(document.cloneNode(true)).parse();
@@ -234,6 +248,7 @@ function giveSnapshot() {
234
  text: document.body?.innerText,
235
  parsed: parsed,
236
  imgs: [],
 
237
  };
238
  if (parsed && parsed.content) {
239
  const elem = document.createElement('div');
@@ -277,7 +292,7 @@ function giveSnapshot() {
277
  });
278
 
279
  await page.evaluateOnNewDocument(`
280
- let aftershot = undefined;
281
  const handlePageLoad = () => {
282
  if (window.haltSnapshot) {
283
  return;
@@ -285,26 +300,23 @@ const handlePageLoad = () => {
285
  if (document.readyState !== 'complete') {
286
  return;
287
  }
288
- const parsed = giveSnapshot();
289
- window.reportSnapshot(parsed);
290
- if (!parsed.text) {
291
- if (aftershot) {
292
- clearTimeout(aftershot);
293
- }
294
- aftershot = setTimeout(() => {
295
- const r = giveSnapshot();
296
- if (r && r.text) {
297
- window.reportSnapshot(r);
298
- }
299
- }, 500);
300
  }
 
 
 
301
  };
 
302
  document.addEventListener('readystatechange', handlePageLoad);
303
  document.addEventListener('load', handlePageLoad);
304
  `);
305
 
306
  this.snMap.set(page, sn);
307
- this.logger.warn(`Page ${sn} created.`);
308
  this.lastPageCratedAt = Date.now();
309
  this.livePages.add(page);
310
 
@@ -409,12 +421,12 @@ document.addEventListener('load', handlePageLoad);
409
  finalized = true;
410
  return;
411
  }
412
- snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
413
  screenshot = await page.screenshot();
414
- if (!snapshot.title || !snapshot.parsed?.content) {
415
  const salvaged = await this.salvage(url, page);
416
  if (salvaged) {
417
- snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
418
  screenshot = await page.screenshot();
419
  }
420
  }
@@ -429,7 +441,7 @@ document.addEventListener('load', handlePageLoad);
429
  if (options?.waitForSelector) {
430
  page.waitForSelector(options.waitForSelector)
431
  .then(async () => {
432
- snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
433
  screenshot = await page.screenshot();
434
  finalized = true;
435
  nextSnapshotDeferred.resolve(snapshot);
@@ -442,7 +454,11 @@ document.addEventListener('load', handlePageLoad);
442
  try {
443
  let lastHTML = snapshot?.html;
444
  while (true) {
445
- await Promise.race([nextSnapshotDeferred.promise, gotoPromise]);
 
 
 
 
446
  if (finalized) {
447
  yield { ...snapshot, screenshot } as PageSnapshot;
448
  break;
 
50
  parsed?: Partial<ReadabilityParsed> | null;
51
  screenshot?: Buffer;
52
  imgs?: ImgBrief[];
53
+ pdfs?: string[];
54
  }
55
 
56
  export interface ExtendedSnapshot extends PageSnapshot {
 
63
  cookies?: CookieParam[];
64
  favorScreenshot?: boolean;
65
  waitForSelector?: string;
66
+ minIntervalMs?: number;
67
  }
68
 
69
 
 
99
  livePages = new Set<Page>();
100
  lastPageCratedAt: number = 0;
101
 
102
+ constructor(
103
+ protected globalLogger: Logger,
104
+ ) {
105
  super(...arguments);
106
  this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95;
107
 
 
223
  };
224
  });
225
  }
226
+ function briefPDFs() {
227
+ const pdfTags = Array.from(document.querySelectorAll('embed[type="application/pdf"]'));
228
+
229
+ return pdfTags.map((x)=> {
230
+ return x.src === 'about:blank' ? document.location.href : x.src;
231
+ });
232
+ }
233
+ function giveSnapshot(stopActiveSnapshot) {
234
+ if (stopActiveSnapshot) {
235
+ window.haltSnapshot = true;
236
+ }
237
  let parsed;
238
  try {
239
  parsed = new Readability(document.cloneNode(true)).parse();
 
248
  text: document.body?.innerText,
249
  parsed: parsed,
250
  imgs: [],
251
+ pdfs: briefPDFs(),
252
  };
253
  if (parsed && parsed.content) {
254
  const elem = document.createElement('div');
 
292
  });
293
 
294
  await page.evaluateOnNewDocument(`
295
+ let lastTextLength = 0;
296
  const handlePageLoad = () => {
297
  if (window.haltSnapshot) {
298
  return;
 
300
  if (document.readyState !== 'complete') {
301
  return;
302
  }
303
+ const thisTextLength = (document.body.innerText || '').length;
304
+ const deltaLength = Math.abs(thisTextLength - lastTextLength);
305
+ if (10 * deltaLength < lastTextLength) {
306
+ // Change is not significant
307
+ return;
 
 
 
 
 
 
 
308
  }
309
+ const r = giveSnapshot();
310
+ window.reportSnapshot(r);
311
+ lastTextLength = thisTextLength;
312
  };
313
+ setInterval(handlePageLoad, 500);
314
  document.addEventListener('readystatechange', handlePageLoad);
315
  document.addEventListener('load', handlePageLoad);
316
  `);
317
 
318
  this.snMap.set(page, sn);
319
+ this.logger.info(`Page ${sn} created.`);
320
  this.lastPageCratedAt = Date.now();
321
  this.livePages.add(page);
322
 
 
421
  finalized = true;
422
  return;
423
  }
424
+ snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
425
  screenshot = await page.screenshot();
426
+ if ((!snapshot.title || !snapshot.parsed?.content) && !(snapshot.pdfs?.length)) {
427
  const salvaged = await this.salvage(url, page);
428
  if (salvaged) {
429
+ snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
430
  screenshot = await page.screenshot();
431
  }
432
  }
 
441
  if (options?.waitForSelector) {
442
  page.waitForSelector(options.waitForSelector)
443
  .then(async () => {
444
+ snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
445
  screenshot = await page.screenshot();
446
  finalized = true;
447
  nextSnapshotDeferred.resolve(snapshot);
 
454
  try {
455
  let lastHTML = snapshot?.html;
456
  while (true) {
457
+ const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
458
+ if (options?.minIntervalMs) {
459
+ ckpt.push(delay(options.minIntervalMs));
460
+ }
461
+ await Promise.race(ckpt);
462
  if (finalized) {
463
  yield { ...snapshot, screenshot } as PageSnapshot;
464
  break;
backend/functions/tsconfig.json CHANGED
@@ -1,6 +1,7 @@
1
  {
2
  "compilerOptions": {
3
- "module": "commonjs",
 
4
  "noImplicitReturns": true,
5
  "noUnusedLocals": true,
6
  "outDir": "build",
 
1
  {
2
  "compilerOptions": {
3
+ "module": "node16",
4
+
5
  "noImplicitReturns": true,
6
  "noUnusedLocals": true,
7
  "outDir": "build",
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 17ee85dd08becd8a86acf993ae7952d8f911b05e
 
1
+ Subproject commit b0b597800a36e2aa8ee3d52715aa7c998b388f47