nomagick committed on
Commit
53821d0
·
unverified ·
1 Parent(s): 80b9a6a

fix: lm and related options

Browse files
backend/functions/package-lock.json CHANGED
@@ -16,7 +16,7 @@
16
  "axios": "^1.3.3",
17
  "bcrypt": "^5.1.0",
18
  "busboy": "^1.6.0",
19
- "civkit": "^0.8.2-2eddf1b",
20
  "core-js": "^3.37.1",
21
  "cors": "^2.8.5",
22
  "dayjs": "^1.11.9",
@@ -3979,9 +3979,10 @@
3979
  }
3980
  },
3981
  "node_modules/civkit": {
3982
- "version": "0.8.2-2eddf1b",
3983
- "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-2eddf1b.tgz",
3984
- "integrity": "sha512-iRYQKasePTQYIajPZpTh+uQn09XF7e6+tBaFwxs7mlUIHoU8ci8CT307ITYMnppDLzCh7BRpSgt53mz4Jwg78w==",
 
3985
  "dependencies": {
3986
  "lodash": "^4.17.21",
3987
  "tslib": "^2.5.0"
 
16
  "axios": "^1.3.3",
17
  "bcrypt": "^5.1.0",
18
  "busboy": "^1.6.0",
19
+ "civkit": "^0.8.2-4c0357a",
20
  "core-js": "^3.37.1",
21
  "cors": "^2.8.5",
22
  "dayjs": "^1.11.9",
 
3979
  }
3980
  },
3981
  "node_modules/civkit": {
3982
+ "version": "0.8.2-4c0357a",
3983
+ "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-4c0357a.tgz",
3984
+ "integrity": "sha512-8/RcapAm8YYImf+YVBRhybEFuSuV5Pg1p/s6Niql3VAY2cV1/OC1fTCDZY689yeq8zFcwxwBvaqyIEGo69F+IA==",
3985
+ "license": "AGPL",
3986
  "dependencies": {
3987
  "lodash": "^4.17.21",
3988
  "tslib": "^2.5.0"
backend/functions/package.json CHANGED
@@ -36,7 +36,7 @@
36
  "axios": "^1.3.3",
37
  "bcrypt": "^5.1.0",
38
  "busboy": "^1.6.0",
39
- "civkit": "^0.8.2-2eddf1b",
40
  "core-js": "^3.37.1",
41
  "cors": "^2.8.5",
42
  "dayjs": "^1.11.9",
 
36
  "axios": "^1.3.3",
37
  "bcrypt": "^5.1.0",
38
  "busboy": "^1.6.0",
39
+ "civkit": "^0.8.2-4c0357a",
40
  "core-js": "^3.37.1",
41
  "cors": "^2.8.5",
42
  "dayjs": "^1.11.9",
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -15,7 +15,7 @@ import { randomUUID } from 'crypto';
15
  import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
16
 
17
  import { countGPTToken as estimateToken } from '../shared/utils/openai';
18
- import { CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/scrapping-options';
19
  import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
20
  import { DomainBlockade } from '../db/domain-blockade';
21
  import { DomainProfile } from '../db/domain-profile';
@@ -84,14 +84,6 @@ export class CrawlerHost extends RPCHost {
84
  Reflect.set(snapshot, 'locale', options.locale);
85
  }
86
  await this.setToCache(options.url, snapshot);
87
-
88
- if (!options.engine) {
89
- try {
90
- await this.exploreDirectEngine(options.url, options, snapshot);
91
- } catch (err) {
92
- this.logger.warn(`Failed to explore direct engine option for ${options.url.href}`, { err });
93
- }
94
- }
95
  });
96
 
97
  puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
@@ -152,8 +144,8 @@ export class CrawlerHost extends RPCHost {
152
  memory: '4GiB',
153
  cpu: 2,
154
  timeoutSeconds: 300,
155
- concurrency: 8,
156
- maxInstances: 1250,
157
  minInstances: 1,
158
  },
159
  tags: ['Crawler'],
@@ -260,25 +252,12 @@ export class CrawlerHost extends RPCHost {
260
 
261
 
262
  const crawlOpts = await this.configure(crawlerOptions);
263
-
264
- if (!crawlOpts.engine) {
265
- const domainProfile = (await DomainProfile.fromFirestoreQuery(
266
- DomainProfile.COLLECTION
267
- .where('origin', '==', targetUrl.origin.toLowerCase())
268
- .limit(1)
269
- ))[0];
270
-
271
- if (domainProfile?.engine) {
272
- crawlOpts.engine = domainProfile.engine;
273
- }
274
- }
275
-
276
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
277
  const sseStream = new OutputServerEventStream();
278
  rpcReflect.return(sseStream);
279
 
280
  try {
281
- for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
282
  if (!scrapped) {
283
  continue;
284
  }
@@ -311,7 +290,7 @@ export class CrawlerHost extends RPCHost {
311
 
312
  let lastScrapped;
313
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
314
- for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
315
  lastScrapped = scrapped;
316
  if (!crawlerOptions.isEarlyReturnApplicable()) {
317
  continue;
@@ -357,7 +336,7 @@ export class CrawlerHost extends RPCHost {
357
  });
358
  }
359
 
360
- for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
361
  lastScrapped = scrapped;
362
 
363
  if (!crawlerOptions.isEarlyReturnApplicable()) {
@@ -589,82 +568,78 @@ export class CrawlerHost extends RPCHost {
589
  return r;
590
  }
591
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
592
  async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
593
- let overrideFinalSnapshot;
594
  if (crawlerOpts?.html) {
595
- overrideFinalSnapshot = {
596
  href: urlToCrawl.toString(),
597
  html: crawlerOpts.html,
598
  title: '',
599
  text: '',
600
  } as PageSnapshot;
 
 
 
601
  }
602
 
603
  if (crawlerOpts?.pdf) {
604
  const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
605
  const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
606
- overrideFinalSnapshot = {
607
  href: urlToCrawl.toString(),
608
  html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
609
  title: '',
610
  text: '',
611
  pdfs: [pdfDataUrl],
612
  } as PageSnapshot;
613
- }
614
 
615
- if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
616
- yield this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
617
 
618
  return;
619
  }
620
 
621
- // if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
622
- // const rmSelectorEquivalent = [];
623
- // if (typeof crawlOpts.removeSelector === 'string') {
624
- // rmSelectorEquivalent.push(crawlOpts.removeSelector);
625
- // } else if (Array.isArray(crawlOpts.removeSelector)) {
626
- // rmSelectorEquivalent.push(...crawlOpts.removeSelector);
627
- // }
628
- // rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option,header,footer,nav');
629
-
630
- // const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, {
631
- // ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: ENGINE_TYPE.BROWSER
632
- // }, crawlerOpts);
633
-
634
- // yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot);
635
-
636
- // return;
637
- // }
638
-
639
- if (crawlOpts?.engine === ENGINE_TYPE.READER_LM) {
640
- const rmSelectorEquivalent = [];
641
- if (typeof crawlOpts.removeSelector === 'string') {
642
- rmSelectorEquivalent.push(crawlOpts.removeSelector);
643
- } else if (Array.isArray(crawlOpts.removeSelector)) {
644
- rmSelectorEquivalent.push(...crawlOpts.removeSelector);
645
- }
646
- rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option');
647
-
648
- const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
649
- ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: undefined
650
- }, crawlerOpts);
651
-
652
- if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
653
- const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
654
- yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot);
655
 
656
  return;
 
 
 
 
657
  }
658
-
659
- yield* this.lmControl.readerLMMarkdownFromSnapshot(finalAutoSnapshot);
660
-
661
- return;
662
- }
663
-
664
- if (overrideFinalSnapshot) {
665
- yield this.jsdomControl.narrowSnapshot(overrideFinalSnapshot, crawlOpts);
666
-
667
- return;
668
  }
669
 
670
  let cache;
@@ -857,12 +832,14 @@ export class CrawlerHost extends RPCHost {
857
  nominalUrl?: URL,
858
  urlValidMs?: number
859
  ) {
860
- const engine = crawlerOptions.engine?.toLowerCase() || '';
861
- if (engine.includes('lm')) {
 
 
862
  const output: FormattedPage = {
863
  title: snapshot.title,
864
  content: snapshot.parsed?.textContent,
865
- url: snapshot.href,
866
  [Symbol.dispose]: () => undefined,
867
  };
868
 
@@ -874,7 +851,7 @@ export class CrawlerHost extends RPCHost {
874
  return output;
875
  }
876
 
877
- return this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, snapshot, nominalUrl, urlValidMs);
878
  }
879
 
880
  async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
@@ -902,7 +879,7 @@ export class CrawlerHost extends RPCHost {
902
  }
903
 
904
  async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
905
- const it = this.cachedScrap(url, { ...opts, minIntervalMs: 500 });
906
 
907
  let lastSnapshot;
908
  let goodEnough = false;
@@ -936,7 +913,7 @@ export class CrawlerHost extends RPCHost {
936
  }
937
 
938
  async exploreDirectEngine(targetUrl: URL, crawlerOptions: ScrappingOptions, knownSnapshot: PageSnapshot) {
939
- const snapshot = await this.curlControl.urlToSnapshot(targetUrl, crawlerOptions);
940
 
941
  const thisFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', snapshot);
942
  const knownFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', knownSnapshot);
 
15
  import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
16
 
17
  import { countGPTToken as estimateToken } from '../shared/utils/openai';
18
+ import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/scrapping-options';
19
  import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
20
  import { DomainBlockade } from '../db/domain-blockade';
21
  import { DomainProfile } from '../db/domain-profile';
 
84
  Reflect.set(snapshot, 'locale', options.locale);
85
  }
86
  await this.setToCache(options.url, snapshot);
 
 
 
 
 
 
 
 
87
  });
88
 
89
  puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
 
144
  memory: '4GiB',
145
  cpu: 2,
146
  timeoutSeconds: 300,
147
+ concurrency: 10,
148
+ maxInstances: 1000,
149
  minInstances: 1,
150
  },
151
  tags: ['Crawler'],
 
252
 
253
 
254
  const crawlOpts = await this.configure(crawlerOptions);
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
256
  const sseStream = new OutputServerEventStream();
257
  rpcReflect.return(sseStream);
258
 
259
  try {
260
+ for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
261
  if (!scrapped) {
262
  continue;
263
  }
 
290
 
291
  let lastScrapped;
292
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
293
+ for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
294
  lastScrapped = scrapped;
295
  if (!crawlerOptions.isEarlyReturnApplicable()) {
296
  continue;
 
336
  });
337
  }
338
 
339
+ for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
340
  lastScrapped = scrapped;
341
 
342
  if (!crawlerOptions.isEarlyReturnApplicable()) {
 
568
  return r;
569
  }
570
 
571
+ async *iterSnapshots(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
572
+ // if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.VLM)) {
573
+ // const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, {
574
+ // ...crawlOpts, engine: ENGINE_TYPE.BROWSER
575
+ // }, crawlerOpts);
576
+
577
+ // yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot);
578
+
579
+ // return;
580
+ // }
581
+
582
+ if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.READER_LM)) {
583
+ const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
584
+ ...crawlOpts, engine: ENGINE_TYPE.AUTO
585
+ }, crawlerOpts);
586
+
587
+ if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
588
+ const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
589
+ yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot);
590
+
591
+ return;
592
+ }
593
+
594
+ yield* this.lmControl.readerLMMarkdownFromSnapshot(finalAutoSnapshot);
595
+
596
+ return;
597
+ }
598
+
599
+ yield* this.cachedScrap(urlToCrawl, crawlOpts, crawlerOpts);
600
+ }
601
+
602
  async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
 
603
  if (crawlerOpts?.html) {
604
+ const snapshot = {
605
  href: urlToCrawl.toString(),
606
  html: crawlerOpts.html,
607
  title: '',
608
  text: '',
609
  } as PageSnapshot;
610
+ yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
611
+
612
+ return;
613
  }
614
 
615
  if (crawlerOpts?.pdf) {
616
  const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
617
  const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
618
+ const snapshot = {
619
  href: urlToCrawl.toString(),
620
  html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
621
  title: '',
622
  text: '',
623
  pdfs: [pdfDataUrl],
624
  } as PageSnapshot;
 
625
 
626
+ yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
 
627
 
628
  return;
629
  }
630
 
631
+ if (crawlOpts?.engine?.startsWith(ENGINE_TYPE.DIRECT)) {
632
+ const engine = crawlOpts?.engine;
633
+ try {
634
+ const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
635
+ yield snapshot;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
636
 
637
  return;
638
+ } catch (err) {
639
+ if (!engine.endsWith('?')) {
640
+ throw err;
641
+ }
642
  }
 
 
 
 
 
 
 
 
 
 
643
  }
644
 
645
  let cache;
 
832
  nominalUrl?: URL,
833
  urlValidMs?: number
834
  ) {
835
+ const presumedURL = crawlerOptions.base === 'eventual' ? new URL(snapshot.href) : nominalUrl;
836
+
837
+ const respondWith = crawlerOptions.respondWith;
838
+ if (respondWith === CONTENT_FORMAT.READER_LM || respondWith === CONTENT_FORMAT.VLM) {
839
  const output: FormattedPage = {
840
  title: snapshot.title,
841
  content: snapshot.parsed?.textContent,
842
+ url: presumedURL?.href || snapshot.href,
843
  [Symbol.dispose]: () => undefined,
844
  };
845
 
 
851
  return output;
852
  }
853
 
854
+ return this.snapshotFormatter.formatSnapshot(respondWith, snapshot, presumedURL, urlValidMs);
855
  }
856
 
857
  async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
 
879
  }
880
 
881
  async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
882
+ const it = this.iterSnapshots(url, { ...opts, minIntervalMs: 500 });
883
 
884
  let lastSnapshot;
885
  let goodEnough = false;
 
913
  }
914
 
915
  async exploreDirectEngine(targetUrl: URL, crawlerOptions: ScrappingOptions, knownSnapshot: PageSnapshot) {
916
+ const snapshot = await this.curlControl.urlToSnapshot(targetUrl, crawlerOptions, true);
917
 
918
  const thisFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', snapshot);
919
  const knownFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', knownSnapshot);
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { Also, AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
2
  import type { Request, Response } from 'express';
3
  import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
4
 
@@ -9,9 +9,12 @@ export enum CONTENT_FORMAT {
9
  TEXT = 'text',
10
  PAGESHOT = 'pageshot',
11
  SCREENSHOT = 'screenshot',
 
 
12
  }
13
 
14
  export enum ENGINE_TYPE {
 
15
  BROWSER = 'browser',
16
  DIRECT = 'direct',
17
  VLM = 'vlm',
@@ -22,6 +25,8 @@ const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
22
 
23
  export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
24
  const IMAGE_RETENTION_MODE_VALUES = new Set<string>(IMAGE_RETENTION_MODES);
 
 
25
 
26
  class Viewport extends AutoCastable {
27
  @Prop({
@@ -193,6 +198,11 @@ class Viewport extends AutoCastable {
193
  in: 'header',
194
  schema: { type: 'string' }
195
  },
 
 
 
 
 
196
  }
197
  }
198
  }
@@ -205,6 +215,12 @@ export class CrawlerOptions extends AutoCastable {
205
  @Prop()
206
  html?: string;
207
 
 
 
 
 
 
 
208
  @Prop({
209
  desc: 'Base64 encoded PDF.',
210
  type: [File, String]
@@ -228,7 +244,7 @@ export class CrawlerOptions extends AutoCastable {
228
  @Prop({
229
  default: false,
230
  })
231
- withLinksSummary!: boolean;
232
 
233
  @Prop({
234
  default: false,
@@ -335,6 +351,17 @@ export class CrawlerOptions extends AutoCastable {
335
  if (customMode !== undefined) {
336
  instance.respondWith = customMode;
337
  }
 
 
 
 
 
 
 
 
 
 
 
338
 
339
  const locale = ctx?.req.get('x-locale');
340
  if (locale !== undefined) {
@@ -352,7 +379,11 @@ export class CrawlerOptions extends AutoCastable {
352
  }
353
  const withLinksSummary = ctx?.req.get('x-with-links-summary');
354
  if (withLinksSummary !== undefined) {
355
- instance.withLinksSummary = Boolean(withLinksSummary);
 
 
 
 
356
  }
357
  const withImagesSummary = ctx?.req.get('x-with-images-summary');
358
  if (withImagesSummary !== undefined) {
@@ -403,8 +434,15 @@ export class CrawlerOptions extends AutoCastable {
403
  if (engine) {
404
  instance.engine = engine;
405
  }
406
- if (instance.noCache || !instance.isTypicalRequest()) {
407
- instance.engine ??= ENGINE_TYPE.BROWSER;
 
 
 
 
 
 
 
408
  }
409
 
410
  const keepImgDataUrl = ctx?.req.get('x-keep-img-data-url');
@@ -451,10 +489,17 @@ export class CrawlerOptions extends AutoCastable {
451
  const tokenBudget = ctx?.req.get('x-token-budget') || undefined;
452
  instance.tokenBudget ??= parseInt(tokenBudget || '') || undefined;
453
 
 
 
 
454
  if (instance.cacheTolerance) {
455
  instance.cacheTolerance = instance.cacheTolerance * 1000;
456
  }
457
 
 
 
 
 
458
  return instance;
459
  }
460
 
@@ -468,7 +513,7 @@ export class CrawlerOptions extends AutoCastable {
468
  if (this.injectFrameScript?.length || this.injectPageScript?.length) {
469
  return false;
470
  }
471
- if (this.engine?.toLowerCase().includes('lm')) {
472
  return false;
473
  }
474
 
 
1
+ import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
2
  import type { Request, Response } from 'express';
3
  import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
4
 
 
9
  TEXT = 'text',
10
  PAGESHOT = 'pageshot',
11
  SCREENSHOT = 'screenshot',
12
+ VLM = 'vlm',
13
+ READER_LM = 'readerlm-v2',
14
  }
15
 
16
  export enum ENGINE_TYPE {
17
+ AUTO = 'auto',
18
  BROWSER = 'browser',
19
  DIRECT = 'direct',
20
  VLM = 'vlm',
 
25
 
26
  export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
27
  const IMAGE_RETENTION_MODE_VALUES = new Set<string>(IMAGE_RETENTION_MODES);
28
+ export const BASE_URL_MODES = ['initial', 'eventual'] as const;
29
+ const BASE_URL_MODE_VALUES = new Set<string>(BASE_URL_MODES);
30
 
31
  class Viewport extends AutoCastable {
32
  @Prop({
 
198
  in: 'header',
199
  schema: { type: 'string' }
200
  },
201
+ 'X-Base': {
202
+ description: 'Select base modes of relative URLs.\n\nSupported: initial, eventual',
203
+ in: 'header',
204
+ schema: { type: 'string' }
205
+ },
206
  }
207
  }
208
  }
 
215
  @Prop()
216
  html?: string;
217
 
218
+ @Prop({
219
+ type: BASE_URL_MODE_VALUES,
220
+ default: 'initial',
221
+ })
222
+ base?: typeof BASE_URL_MODES[number];
223
+
224
  @Prop({
225
  desc: 'Base64 encoded PDF.',
226
  type: [File, String]
 
244
  @Prop({
245
  default: false,
246
  })
247
+ withLinksSummary!: boolean | string;
248
 
249
  @Prop({
250
  default: false,
 
351
  if (customMode !== undefined) {
352
  instance.respondWith = customMode;
353
  }
354
+ if (instance.respondWith) {
355
+ instance.respondWith = instance.respondWith.toLowerCase();
356
+ }
357
+ if (instance.respondWith?.includes('lm')) {
358
+ if (instance.respondWith.includes('content') || instance.respondWith.includes('markdown')) {
359
+ throw new ParamValidationError({
360
+ path: 'respondWith',
361
+ message: `LM formats conflicts with content/markdown.`,
362
+ });
363
+ }
364
+ }
365
 
366
  const locale = ctx?.req.get('x-locale');
367
  if (locale !== undefined) {
 
379
  }
380
  const withLinksSummary = ctx?.req.get('x-with-links-summary');
381
  if (withLinksSummary !== undefined) {
382
+ if (withLinksSummary === 'all') {
383
+ instance.withLinksSummary = withLinksSummary;
384
+ } else {
385
+ instance.withLinksSummary = Boolean(withLinksSummary);
386
+ }
387
  }
388
  const withImagesSummary = ctx?.req.get('x-with-images-summary');
389
  if (withImagesSummary !== undefined) {
 
434
  if (engine) {
435
  instance.engine = engine;
436
  }
437
+ if (instance.engine) {
438
+ instance.engine = instance.engine.toLowerCase();
439
+ }
440
+ if (instance.engine === ENGINE_TYPE.VLM) {
441
+ instance.engine = ENGINE_TYPE.BROWSER;
442
+ instance.respondWith = CONTENT_FORMAT.VLM;
443
+ } else if (instance.engine === ENGINE_TYPE.READER_LM) {
444
+ instance.engine = undefined;
445
+ instance.respondWith = CONTENT_FORMAT.READER_LM;
446
  }
447
 
448
  const keepImgDataUrl = ctx?.req.get('x-keep-img-data-url');
 
489
  const tokenBudget = ctx?.req.get('x-token-budget') || undefined;
490
  instance.tokenBudget ??= parseInt(tokenBudget || '') || undefined;
491
 
492
+ const baseMode = ctx?.req.get('x-base') || undefined;
493
+ instance.base ??= baseMode as any;
494
+
495
  if (instance.cacheTolerance) {
496
  instance.cacheTolerance = instance.cacheTolerance * 1000;
497
  }
498
 
499
+ if (instance.noCache || !instance.isTypicalRequest()) {
500
+ instance.engine ??= ENGINE_TYPE.BROWSER + '?';
501
+ }
502
+
503
  return instance;
504
  }
505
 
 
513
  if (this.injectFrameScript?.length || this.injectPageScript?.length) {
514
  return false;
515
  }
516
+ if (this.respondWith.includes('lm')) {
517
  return false;
518
  }
519
 
backend/functions/src/services/curl.ts CHANGED
@@ -26,7 +26,7 @@ export class CurlControl extends AsyncService {
26
  this.emit('ready');
27
  }
28
 
29
- async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions) {
30
  const result = await new Promise<{
31
  statusCode: number,
32
  data: string,
@@ -75,7 +75,7 @@ export class CurlControl extends AsyncService {
75
  curl.perform();
76
  });
77
 
78
- if (result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
79
  throw new AssertionFailureError(`Failed to directly access ${urlToCrawl}: HTTP ${result.statusCode}`);
80
  }
81
 
 
26
  this.emit('ready');
27
  }
28
 
29
+ async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions, throwOnNon200 = false): Promise<PageSnapshot> {
30
  const result = await new Promise<{
31
  statusCode: number,
32
  data: string,
 
75
  curl.perform();
76
  });
77
 
78
+ if (throwOnNon200 && result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
79
  throw new AssertionFailureError(`Failed to directly access ${urlToCrawl}: HTTP ${result.statusCode}`);
80
  }
81
 
backend/functions/src/services/jsdom.ts CHANGED
@@ -6,6 +6,7 @@ import { Readability } from '@mozilla/readability';
6
  import TurndownService from 'turndown';
7
  import { Threaded } from '../shared/services/threaded';
8
  import type { ExtraScrappingOptions } from '../cloud-functions/crawler';
 
9
 
10
  const pLinkedom = import('linkedom');
11
 
@@ -184,26 +185,20 @@ export class JSDomControl extends AsyncService {
184
 
185
  jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
186
  const links = Array.from(jsdom.window.document.querySelectorAll('a[href]'))
187
- .map((x: any) => [x.getAttribute('href'), x.textContent.replace(/\s+/g, ' ').trim()])
188
- .map(([href, text]) => {
189
- if (!text) {
190
  return undefined;
191
  }
192
  try {
193
  const parsed = new URL(href, snapshot.rebase || snapshot.href);
194
- if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') {
195
- return undefined;
196
- }
197
- return [parsed.toString(), text] as const;
198
  } catch (err) {
199
  return undefined;
200
  }
201
  })
202
- .filter(Boolean)
203
- .reduce((acc, pair) => {
204
- acc[pair![0]] = pair![1];
205
- return acc;
206
- }, {} as { [k: string]: string; });
207
 
208
  extendedSnapshot.links = links;
209
 
@@ -237,6 +232,56 @@ export class JSDomControl extends AsyncService {
237
 
238
  return extendedSnapshot;
239
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  snippetToElement(snippet?: string, url?: string) {
241
  const parsed = this.linkedom.parseHTML(snippet || '<html><body></body></html>');
242
 
 
6
  import TurndownService from 'turndown';
7
  import { Threaded } from '../shared/services/threaded';
8
  import type { ExtraScrappingOptions } from '../cloud-functions/crawler';
9
+ import { tailwindClasses } from '../utils/tailwind-classes';
10
 
11
  const pLinkedom = import('linkedom');
12
 
 
185
 
186
  jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
187
  const links = Array.from(jsdom.window.document.querySelectorAll('a[href]'))
188
+ .map((x: any) => [x.textContent.replace(/\s+/g, ' ').trim(), x.getAttribute('href'),])
189
+ .map(([text, href]) => {
190
+ if (!href) {
191
  return undefined;
192
  }
193
  try {
194
  const parsed = new URL(href, snapshot.rebase || snapshot.href);
195
+
196
+ return [text, parsed.toString()] as const;
 
 
197
  } catch (err) {
198
  return undefined;
199
  }
200
  })
201
+ .filter(Boolean) as [string, string][];
 
 
 
 
202
 
203
  extendedSnapshot.links = links;
204
 
 
232
 
233
  return extendedSnapshot;
234
  }
235
+
236
+ cleanRedundantEmptyLines(text: string) {
237
+ const lines = text.split(/\r?\n/g);
238
+ const mappedFlag = lines.map((line) => Boolean(line.trim()));
239
+
240
+ return lines.filter((_line, i) => mappedFlag[i] || mappedFlag[i - 1]).join('\n');
241
+ }
242
+
243
+ @Threaded()
244
+ async cleanHTMLforLMs(sourceHTML: string, ...discardSelectors: string[]): Promise<string> {
245
+ const t0 = Date.now();
246
+ let jsdom = this.linkedom.parseHTML(sourceHTML);
247
+ if (!jsdom.window.document.documentElement) {
248
+ jsdom = this.linkedom.parseHTML(`<html><body>${sourceHTML}</body></html>`);
249
+ }
250
+
251
+ for (const rl of discardSelectors) {
252
+ jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
253
+ }
254
+
255
+ jsdom.window.document.querySelectorAll('img[src],img[data-src]').forEach((x) => {
256
+ const src = x.getAttribute('src') || x.getAttribute('data-src');
257
+ if (src?.startsWith('data:')) {
258
+ x.setAttribute('src', 'blob:opaque');
259
+ }
260
+ x.removeAttribute('data-src');
261
+ x.removeAttribute('srcset');
262
+ });
263
+
264
+ jsdom.window.document.querySelectorAll('[class]').forEach((x) => {
265
+ const classes = x.getAttribute('class')?.split(/\s+/g) || [];
266
+ const newClasses = classes.filter((c) => tailwindClasses.has(c));
267
+ x.setAttribute('class', newClasses.join(' '));
268
+ });
269
+ jsdom.window.document.querySelectorAll('[style]').forEach((x) => {
270
+ const style = x.getAttribute('style')?.toLocaleLowerCase() || '';
271
+ if (style.startsWith('display: none')) {
272
+ return;
273
+ }
274
+ x.removeAttribute('style');
275
+ });
276
+
277
+ const dt = Date.now() - t0;
278
+ if (dt > 1000) {
279
+ this.logger.warn(`Performance issue: Cleaning HTML for LMs took ${dt}ms`, { dt });
280
+ }
281
+
282
+ return this.cleanRedundantEmptyLines(jsdom.window.document.documentElement.outerHTML);
283
+ }
284
+
285
  snippetToElement(snippet?: string, url?: string) {
286
  const parsed = this.linkedom.parseHTML(snippet || '<html><body></body></html>');
287
 
backend/functions/src/services/lm.ts CHANGED
@@ -6,6 +6,7 @@ import { Logger } from '../shared/services/logger';
6
  import _ from 'lodash';
7
  import { AssertionFailureError } from 'civkit';
8
  import { LLMManager } from '../shared/services/common-llm';
 
9
 
10
  const tripleBackTick = '```';
11
 
@@ -16,7 +17,8 @@ export class LmControl extends AsyncService {
16
 
17
  constructor(
18
  protected globalLogger: Logger,
19
- protected commonLLM: LLMManager
 
20
  ) {
21
  super(...arguments);
22
  }
@@ -27,13 +29,6 @@ export class LmControl extends AsyncService {
27
  this.emit('ready');
28
  }
29
 
30
- cleanRedundantEmptyLines(text: string) {
31
- const lines = text.split(/\r?\n/g);
32
- const mappedFlag = lines.map((line) => Boolean(line.trim()));
33
-
34
- return lines.filter((_line, i) => mappedFlag[i] || mappedFlag[i - 1]).join('\n');
35
- }
36
-
37
  async* geminiFromBrowserSnapshot(snapshot?: PageSnapshot & {
38
  pageshotUrl?: string,
39
  }) {
@@ -43,9 +38,11 @@ export class LmControl extends AsyncService {
43
  throw new AssertionFailureError('Screenshot of the page is not available');
44
  }
45
 
 
 
46
  const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
47
  prompt: [
48
- `HTML: \n${this.cleanRedundantEmptyLines(snapshot.html)}\n\nSCREENSHOT: \n`,
49
  typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
50
  `Convert this webpage into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
51
  ],
@@ -76,8 +73,11 @@ export class LmControl extends AsyncService {
76
  if (!snapshot) {
77
  throw new AssertionFailureError('Snapshot of the page is not available');
78
  }
 
 
 
79
  const it = this.commonLLM.iterRun('readerlm-v2', {
80
- prompt: `Extract the main content from the given HTML and convert it to Markdown format.\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n`,
81
 
82
  options: {
83
  // system: 'You are an AI assistant developed by Jina AI',
@@ -105,8 +105,11 @@ export class LmControl extends AsyncService {
105
  if (!snapshot) {
106
  throw new AssertionFailureError('Snapshot of the page is not available');
107
  }
 
 
 
108
  const it = this.commonLLM.iterRun('readerlm-v2', {
109
- prompt: `${instruction}\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n${schema ? `The JSON schema:\n${tripleBackTick}json\n${schema}\n${tripleBackTick}\n` : ''}`,
110
  options: {
111
  // system: 'You are an AI assistant developed by Jina AI',
112
  stream: true
 
6
  import _ from 'lodash';
7
  import { AssertionFailureError } from 'civkit';
8
  import { LLMManager } from '../shared/services/common-llm';
9
+ import { JSDomControl } from './jsdom';
10
 
11
  const tripleBackTick = '```';
12
 
 
17
 
18
  constructor(
19
  protected globalLogger: Logger,
20
+ protected commonLLM: LLMManager,
21
+ protected jsdomControl: JSDomControl,
22
  ) {
23
  super(...arguments);
24
  }
 
29
  this.emit('ready');
30
  }
31
 
 
 
 
 
 
 
 
32
  async* geminiFromBrowserSnapshot(snapshot?: PageSnapshot & {
33
  pageshotUrl?: string,
34
  }) {
 
38
  throw new AssertionFailureError('Screenshot of the page is not available');
39
  }
40
 
41
+ const html = await this.jsdomControl.cleanHTMLforLMs(snapshot.html, 'script,link,style,textarea,select>option,svg')
42
+
43
  const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
44
  prompt: [
45
+ `HTML: \n${html}\n\nSCREENSHOT: \n`,
46
  typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
47
  `Convert this webpage into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
48
  ],
 
73
  if (!snapshot) {
74
  throw new AssertionFailureError('Snapshot of the page is not available');
75
  }
76
+
77
+ const html = await this.jsdomControl.cleanHTMLforLMs(snapshot.html, 'script,link,style,textarea,select>option,svg');
78
+
79
  const it = this.commonLLM.iterRun('readerlm-v2', {
80
+ prompt: `Extract the main content from the given HTML and convert it to Markdown format.\n\n${tripleBackTick}html\n${html}\n${tripleBackTick}\n`,
81
 
82
  options: {
83
  // system: 'You are an AI assistant developed by Jina AI',
 
105
  if (!snapshot) {
106
  throw new AssertionFailureError('Snapshot of the page is not available');
107
  }
108
+
109
+ const html = await this.jsdomControl.cleanHTMLforLMs(snapshot.html, 'script,link,style,textarea,select>option,svg');
110
+
111
  const it = this.commonLLM.iterRun('readerlm-v2', {
112
+ prompt: `${instruction}\n\n${tripleBackTick}html\n${html}\n${tripleBackTick}\n${schema ? `The JSON schema:\n${tripleBackTick}json\n${schema}\n${tripleBackTick}\n` : ''}`,
113
  options: {
114
  // system: 'You are an AI assistant developed by Jina AI',
115
  stream: true
backend/functions/src/services/puppeteer.ts CHANGED
@@ -63,7 +63,7 @@ export interface PageSnapshot {
63
  }
64
 
65
  export interface ExtendedSnapshot extends PageSnapshot {
66
- links: { [url: string]: string; };
67
  imgs: ImgBrief[];
68
  }
69
 
 
63
  }
64
 
65
  export interface ExtendedSnapshot extends PageSnapshot {
66
+ links: [string, string][];
67
  imgs: ImgBrief[];
68
  }
69
 
backend/functions/src/services/snapshot-formatter.ts CHANGED
@@ -28,8 +28,8 @@ export interface FormattedPage {
28
  screenshot?: Buffer;
29
  pageshotUrl?: string;
30
  pageshot?: Buffer;
31
- links?: { [k: string]: string; };
32
- images?: { [k: string]: string; };
33
  warning?: string;
34
  usage?: {
35
  total_tokens?: number;
@@ -56,7 +56,7 @@ export function highlightedCodeBlock(turndownService: TurndownService) {
56
  highlightRegExp.test(node.className)
57
  );
58
  },
59
- replacement: (_content, node, options)=> {
60
  const className = (node as any).className || '';
61
  const language = (className.match(highlightRegExp) || [null, ''])[1];
62
 
@@ -178,7 +178,14 @@ export class SnapshotFormatter extends AsyncService {
178
  Object.defineProperty(f, 'textRepresentation', { value: snapshot.text, enumerable: false, configurable: true });
179
  }
180
 
181
- if (modeOK && !mode.includes('markdown') && !mode.includes('content')) {
 
 
 
 
 
 
 
182
  const dt = Date.now() - t0;
183
  this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
184
 
@@ -391,7 +398,13 @@ export class SnapshotFormatter extends AsyncService {
391
  .value();
392
  }
393
  if (this.threadLocal.get('withLinksSummary')) {
394
- formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {});
 
 
 
 
 
 
395
  }
396
 
397
  Object.assign(f, formatted);
@@ -418,8 +431,14 @@ export class SnapshotFormatter extends AsyncService {
418
  }
419
  if (this.links) {
420
  const linkSummaryChunks = ['Links/Buttons:'];
421
- for (const [k, v] of Object.entries(this.links)) {
422
- linkSummaryChunks.push(`- [${k}](${v})`);
 
 
 
 
 
 
423
  }
424
  if (linkSummaryChunks.length === 1) {
425
  linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
@@ -478,7 +497,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
478
  }
479
  if (this.threadLocal.get('withLinksSummary')) {
480
  inferred ??= this.jsdomControl.inferSnapshot(snapshot);
481
- mixin.links = _.invert(inferred.links || {});
 
 
 
 
482
  }
483
  if (snapshot.status) {
484
  const code = snapshot.status;
 
28
  screenshot?: Buffer;
29
  pageshotUrl?: string;
30
  pageshot?: Buffer;
31
+ links?: { [k: string]: string; } | [string, string][];
32
+ images?: { [k: string]: string; } | [string, string][];
33
  warning?: string;
34
  usage?: {
35
  total_tokens?: number;
 
56
  highlightRegExp.test(node.className)
57
  );
58
  },
59
+ replacement: (_content, node, options) => {
60
  const className = (node as any).className || '';
61
  const language = (className.match(highlightRegExp) || [null, ''])[1];
62
 
 
178
  Object.defineProperty(f, 'textRepresentation', { value: snapshot.text, enumerable: false, configurable: true });
179
  }
180
 
181
+ if (mode.includes('lm')) {
182
+ modeOK = true;
183
+ f.content = snapshot.parsed?.textContent;
184
+ }
185
+
186
+ if (modeOK && (mode.includes('lm') ||
187
+ (!mode.includes('markdown') && !mode.includes('content')))
188
+ ) {
189
  const dt = Date.now() - t0;
190
  this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
191
 
 
398
  .value();
399
  }
400
  if (this.threadLocal.get('withLinksSummary')) {
401
+ const links = this.jsdomControl.inferSnapshot(snapshot).links;
402
+
403
+ if (this.threadLocal.get('withLinksSummary') === 'all') {
404
+ formatted.links = links;
405
+ } else {
406
+ formatted.links = _.fromPairs(links.filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:')));
407
+ }
408
  }
409
 
410
  Object.assign(f, formatted);
 
431
  }
432
  if (this.links) {
433
  const linkSummaryChunks = ['Links/Buttons:'];
434
+ if (Array.isArray(this.links)) {
435
+ for (const [k, v] of this.links) {
436
+ linkSummaryChunks.push(`- [${k}](${v})`);
437
+ }
438
+ } else {
439
+ for (const [k, v] of Object.entries(this.links)) {
440
+ linkSummaryChunks.push(`- [${k}](${v})`);
441
+ }
442
  }
443
  if (linkSummaryChunks.length === 1) {
444
  linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
 
497
  }
498
  if (this.threadLocal.get('withLinksSummary')) {
499
  inferred ??= this.jsdomControl.inferSnapshot(snapshot);
500
+ if (this.threadLocal.get('withLinksSummary') === 'all') {
501
+ mixin.links = inferred.links;
502
+ } else {
503
+ mixin.links = _.fromPairs(inferred.links.filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:')));
504
+ }
505
  }
506
  if (snapshot.status) {
507
  const code = snapshot.status;
backend/functions/src/utils/tailwind-classes.ts ADDED
The diff for this file is too large to render. See raw diff