nomagick committed on
Commit
06f3593
·
unverified ·
1 Parent(s): 51a4877

feat: new lm engine

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -23,7 +23,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-
23
  import { JSDomControl } from '../services/jsdom';
24
  import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
25
  import { CurlControl } from '../services/curl';
26
- import { VlmControl } from '../services/vlm';
27
 
28
  export interface ExtraScrappingOptions extends ScrappingOptions {
29
  withIframe?: boolean | 'quoted';
@@ -58,7 +58,7 @@ export class CrawlerHost extends RPCHost {
58
  protected globalLogger: Logger,
59
  protected puppeteerControl: PuppeteerControl,
60
  protected curlControl: CurlControl,
61
- protected vlmControl: VlmControl,
62
  protected jsdomControl: JSDomControl,
63
  protected snapshotFormatter: SnapshotFormatter,
64
  protected firebaseObjectStorage: FirebaseStorageBucketControl,
@@ -284,7 +284,7 @@ export class CrawlerHost extends RPCHost {
284
  }
285
 
286
  const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
287
- chargeAmount = this.assignChargeAmount(formatted);
288
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
289
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
290
  }
@@ -321,7 +321,7 @@ export class CrawlerHost extends RPCHost {
321
  }
322
 
323
  const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
324
- chargeAmount = this.assignChargeAmount(formatted);
325
 
326
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
327
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@@ -342,7 +342,7 @@ export class CrawlerHost extends RPCHost {
342
  }
343
 
344
  const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
345
- chargeAmount = this.assignChargeAmount(formatted);
346
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
347
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
348
  }
@@ -369,7 +369,7 @@ export class CrawlerHost extends RPCHost {
369
  }
370
 
371
  const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
372
- chargeAmount = this.assignChargeAmount(formatted);
373
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
374
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
375
  }
@@ -398,7 +398,7 @@ export class CrawlerHost extends RPCHost {
398
  }
399
 
400
  const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
401
- chargeAmount = this.assignChargeAmount(formatted);
402
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
403
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
404
  }
@@ -625,10 +625,45 @@ export class CrawlerHost extends RPCHost {
625
  return;
626
  }
627
 
628
- if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
629
- const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, crawlOpts, crawlerOpts);
 
 
 
 
 
 
630
 
631
- yield* this.vlmControl.fromBrowserSnapshot(finalBrowserSnapshot);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
632
 
633
  return;
634
  }
@@ -669,14 +704,18 @@ export class CrawlerHost extends RPCHost {
669
  }
670
  }
671
 
672
- assignChargeAmount(formatted: FormattedPage) {
673
  if (!formatted) {
674
  return 0;
675
  }
676
 
677
  let amount = 0;
678
  if (formatted.content) {
679
- amount += estimateToken(formatted.content);
 
 
 
 
680
  } else if (formatted.description) {
681
  amount += estimateToken(formatted.description);
682
  }
@@ -819,7 +858,7 @@ export class CrawlerHost extends RPCHost {
819
  nominalUrl?: URL,
820
  urlValidMs?: number
821
  ) {
822
- if (crawlerOptions.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
823
  const output: FormattedPage = {
824
  title: snapshot.title,
825
  content: snapshot.parsed?.textContent,
 
23
  import { JSDomControl } from '../services/jsdom';
24
  import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
25
  import { CurlControl } from '../services/curl';
26
+ import { LmControl } from '../services/lm';
27
 
28
  export interface ExtraScrappingOptions extends ScrappingOptions {
29
  withIframe?: boolean | 'quoted';
 
58
  protected globalLogger: Logger,
59
  protected puppeteerControl: PuppeteerControl,
60
  protected curlControl: CurlControl,
61
+ protected lmControl: LmControl,
62
  protected jsdomControl: JSDomControl,
63
  protected snapshotFormatter: SnapshotFormatter,
64
  protected firebaseObjectStorage: FirebaseStorageBucketControl,
 
284
  }
285
 
286
  const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
287
+ chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
288
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
289
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
290
  }
 
321
  }
322
 
323
  const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
324
+ chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
325
 
326
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
327
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
 
342
  }
343
 
344
  const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
345
+ chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
346
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
347
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
348
  }
 
369
  }
370
 
371
  const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
372
+ chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
373
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
374
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
375
  }
 
398
  }
399
 
400
  const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
401
+ chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
402
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
403
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
404
  }
 
625
  return;
626
  }
627
 
628
+ // if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
629
+ // const rmSelectorEquivalent = [];
630
+ // if (typeof crawlOpts.removeSelector === 'string') {
631
+ // rmSelectorEquivalent.push(crawlOpts.removeSelector);
632
+ // } else if (Array.isArray(crawlOpts.removeSelector)) {
633
+ // rmSelectorEquivalent.push(...crawlOpts.removeSelector);
634
+ // }
635
+ // rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option,header,footer,nav');
636
 
637
+ // const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, {
638
+ // ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: ENGINE_TYPE.BROWSER
639
+ // }, crawlerOpts);
640
+
641
+ // yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot);
642
+
643
+ // return;
644
+ // }
645
+
646
+ if (crawlOpts?.engine === ENGINE_TYPE.READER_LM) {
647
+ const rmSelectorEquivalent = [];
648
+ if (typeof crawlOpts.removeSelector === 'string') {
649
+ rmSelectorEquivalent.push(crawlOpts.removeSelector);
650
+ } else if (Array.isArray(crawlOpts.removeSelector)) {
651
+ rmSelectorEquivalent.push(...crawlOpts.removeSelector);
652
+ }
653
+ rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option');
654
+
655
+ const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
656
+ ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: undefined
657
+ }, crawlerOpts);
658
+
659
+ if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
660
+ const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
661
+ yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot);
662
+
663
+ return;
664
+ }
665
+
666
+ yield* this.lmControl.readerLMMarkdownFromSnapshot(finalAutoSnapshot);
667
 
668
  return;
669
  }
 
704
  }
705
  }
706
 
707
+ assignChargeAmount(formatted: FormattedPage, scrappingOptions?: ExtraScrappingOptions) {
708
  if (!formatted) {
709
  return 0;
710
  }
711
 
712
  let amount = 0;
713
  if (formatted.content) {
714
+ const x1 = estimateToken(formatted.content);
715
+ if (scrappingOptions?.engine?.toLowerCase().includes('lm')) {
716
+ amount += x1 * 2;
717
+ }
718
+ amount += x1;
719
  } else if (formatted.description) {
720
  amount += estimateToken(formatted.description);
721
  }
 
858
  nominalUrl?: URL,
859
  urlValidMs?: number
860
  ) {
861
+ if (crawlerOptions.engine?.toLowerCase().includes('lm')) {
862
  const output: FormattedPage = {
863
  title: snapshot.title,
864
  content: snapshot.parsed?.textContent,
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -15,6 +15,7 @@ export enum ENGINE_TYPE {
15
  BROWSER = 'browser',
16
  DIRECT = 'direct',
17
  VLM = 'vlm',
 
18
  }
19
 
20
  const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
@@ -188,7 +189,7 @@ class Viewport extends AutoCastable {
188
  schema: { type: 'string' }
189
  },
190
  'X-Engine': {
191
- description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm',
192
  in: 'header',
193
  schema: { type: 'string' }
194
  },
@@ -317,6 +318,12 @@ export class CrawlerOptions extends AutoCastable {
317
  @Prop()
318
  viewport?: Viewport;
319
 
 
 
 
 
 
 
320
  static override from(input: any) {
321
  const instance = super.from(input) as CrawlerOptions;
322
  const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
@@ -461,7 +468,7 @@ export class CrawlerOptions extends AutoCastable {
461
  if (this.injectFrameScript?.length || this.injectPageScript?.length) {
462
  return false;
463
  }
464
- if (this.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
465
  return false;
466
  }
467
 
 
15
  BROWSER = 'browser',
16
  DIRECT = 'direct',
17
  VLM = 'vlm',
18
+ READER_LM = 'readerlm-v2',
19
  }
20
 
21
  const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
 
189
  schema: { type: 'string' }
190
  },
191
  'X-Engine': {
192
+ description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm, readerlm-v2',
193
  in: 'header',
194
  schema: { type: 'string' }
195
  },
 
318
  @Prop()
319
  viewport?: Viewport;
320
 
321
+ @Prop()
322
+ instruction?: string;
323
+
324
+ @Prop()
325
+ jsonSchema?: object;
326
+
327
  static override from(input: any) {
328
  const instance = super.from(input) as CrawlerOptions;
329
  const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
 
468
  if (this.injectFrameScript?.length || this.injectPageScript?.length) {
469
  return false;
470
  }
471
+ if (this.engine?.toLowerCase().includes('lm')) {
472
  return false;
473
  }
474
 
backend/functions/src/services/{vlm.ts → lm.ts} RENAMED
@@ -7,8 +7,10 @@ import _ from 'lodash';
7
  import { AssertionFailureError } from 'civkit';
8
  import { LLMManager } from '../shared/services/common-llm';
9
 
 
 
10
  @singleton()
11
- export class VlmControl extends AsyncService {
12
 
13
  logger = this.globalLogger.child({ service: this.constructor.name });
14
 
@@ -25,7 +27,14 @@ export class VlmControl extends AsyncService {
25
  this.emit('ready');
26
  }
27
 
28
- async* fromBrowserSnapshot(snapshot?: PageSnapshot & {
 
 
 
 
 
 
 
29
  pageshotUrl?: string,
30
  }) {
31
  const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot;
@@ -36,12 +45,70 @@ export class VlmControl extends AsyncService {
36
 
37
  const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
38
  prompt: [
 
39
  typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
40
- `Convert this webpage screenshot into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
41
  ],
42
 
43
  options: {
44
- system: 'You are Reader-LM-v7, an OCR model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  stream: true
46
  }
47
  });
 
7
  import { AssertionFailureError } from 'civkit';
8
  import { LLMManager } from '../shared/services/common-llm';
9
 
10
+ const tripleBackTick = '```';
11
+
12
  @singleton()
13
+ export class LmControl extends AsyncService {
14
 
15
  logger = this.globalLogger.child({ service: this.constructor.name });
16
 
 
27
  this.emit('ready');
28
  }
29
 
30
+ cleanRedundantEmptyLines(text: string) {
31
+ const lines = text.split(/\r?\n/g);
32
+ const mappedFlag = lines.map((line) => Boolean(line.trim()));
33
+
34
+ return lines.filter((_line, i) => mappedFlag[i] || mappedFlag[i - 1]).join('\n');
35
+ }
36
+
37
+ async* geminiFromBrowserSnapshot(snapshot?: PageSnapshot & {
38
  pageshotUrl?: string,
39
  }) {
40
  const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot;
 
45
 
46
  const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
47
  prompt: [
48
+ `HTML: \n${this.cleanRedundantEmptyLines(snapshot.html)}\n\nSCREENSHOT: \n`,
49
  typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
50
+ `Convert this webpage into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
51
  ],
52
 
53
  options: {
54
+ system: 'You are ReaderLM-v7, a model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
55
+ stream: true
56
+ }
57
+ });
58
+
59
+ const chunks: string[] = [];
60
+ for await (const txt of it) {
61
+ chunks.push(txt);
62
+ const output: PageSnapshot = {
63
+ ...snapshot,
64
+ parsed: {
65
+ ...snapshot?.parsed,
66
+ textContent: chunks.join(''),
67
+ }
68
+ };
69
+ yield output;
70
+ }
71
+
72
+ return;
73
+ }
74
+
75
+ async* readerLMMarkdownFromSnapshot(snapshot?: PageSnapshot) {
76
+ if (!snapshot) {
77
+ throw new AssertionFailureError('Snapshot of the page is not available');
78
+ }
79
+ const it = this.commonLLM.iterRun('readerlm-v2', {
80
+ prompt: `Extract the main content from the given HTML and convert it to Markdown format.\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n`,
81
+
82
+ options: {
83
+ // system: 'You are an AI assistant developed by Jina AI',
84
+ stream: true
85
+ }
86
+ });
87
+
88
+ const chunks: string[] = [];
89
+ for await (const txt of it) {
90
+ chunks.push(txt);
91
+ const output: PageSnapshot = {
92
+ ...snapshot,
93
+ parsed: {
94
+ ...snapshot?.parsed,
95
+ textContent: chunks.join(''),
96
+ }
97
+ };
98
+ yield output;
99
+ }
100
+
101
+ return;
102
+ }
103
+
104
+ async* readerLMFromSnapshot(schema?: string, instruction: string = 'Infer useful information from the HTML and present it in a structured JSON object.', snapshot?: PageSnapshot) {
105
+ if (!snapshot) {
106
+ throw new AssertionFailureError('Snapshot of the page is not available');
107
+ }
108
+ const it = this.commonLLM.iterRun('readerlm-v2', {
109
+ prompt: `${instruction}\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n${schema ? `The JSON schema:\n${tripleBackTick}json\n${schema}\n${tripleBackTick}\n` : ''}`,
110
+ options: {
111
+ // system: 'You are an AI assistant developed by Jina AI',
112
  stream: true
113
  }
114
  });
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit a17e58017ee2075edeef79893fc1bf398eeb99d0
 
1
+ Subproject commit ee28974871e4d68c53ff82aca6cfdef8ed19a26f