nomagick Shasha2020 committed on
Commit
51a4877
·
unverified ·
1 Parent(s): c19ba65

feat: gemini to replace blip2 (#1129)

Browse files

* feat: domain profile

* fix

* fix

* fix

* fix

* fix

* refactor: curl as direct engine

* fix

* wip

* fix

* fix

* fix

* fix

* fix

---------

Co-authored-by: Sha Zhou <sha.zhou@jina.ai>

backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -23,6 +23,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-
23
  import { JSDomControl } from '../services/jsdom';
24
  import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
25
  import { CurlControl } from '../services/curl';
 
26
 
27
  export interface ExtraScrappingOptions extends ScrappingOptions {
28
  withIframe?: boolean | 'quoted';
@@ -57,6 +58,7 @@ export class CrawlerHost extends RPCHost {
57
  protected globalLogger: Logger,
58
  protected puppeteerControl: PuppeteerControl,
59
  protected curlControl: CurlControl,
 
60
  protected jsdomControl: JSDomControl,
61
  protected snapshotFormatter: SnapshotFormatter,
62
  protected firebaseObjectStorage: FirebaseStorageBucketControl,
@@ -281,7 +283,7 @@ export class CrawlerHost extends RPCHost {
281
  continue;
282
  }
283
 
284
- const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
285
  chargeAmount = this.assignChargeAmount(formatted);
286
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
287
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@@ -311,24 +313,25 @@ export class CrawlerHost extends RPCHost {
311
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
312
  for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
313
  lastScrapped = scrapped;
 
 
 
314
  if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
315
  continue;
316
  }
317
 
318
- const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
319
  chargeAmount = this.assignChargeAmount(formatted);
320
 
321
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
322
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
323
  }
324
 
325
- if (crawlerOptions.isEarlyReturnApplicable()) {
326
- return formatted;
327
  }
328
 
329
- if (chargeAmount && scrapped?.pdfs?.length) {
330
- return formatted;
331
- }
332
  }
333
 
334
  if (!lastScrapped) {
@@ -338,7 +341,7 @@ export class CrawlerHost extends RPCHost {
338
  throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
339
  }
340
 
341
- const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
342
  chargeAmount = this.assignChargeAmount(formatted);
343
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
344
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@@ -356,32 +359,35 @@ export class CrawlerHost extends RPCHost {
356
 
357
  for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
358
  lastScrapped = scrapped;
 
 
 
 
 
359
  if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
360
  continue;
361
  }
362
 
363
- const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
364
  chargeAmount = this.assignChargeAmount(formatted);
365
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
366
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
367
  }
368
 
369
- if (crawlerOptions.isEarlyReturnApplicable()) {
370
- if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
371
-
372
- return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
373
- { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
374
- );
375
- }
376
- if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
377
 
378
- return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
379
- { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
380
- );
381
- }
 
382
 
383
- return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
 
 
384
  }
 
 
385
  }
386
 
387
  if (!lastScrapped) {
@@ -391,7 +397,7 @@ export class CrawlerHost extends RPCHost {
391
  throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
392
  }
393
 
394
- const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
395
  chargeAmount = this.assignChargeAmount(formatted);
396
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
397
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@@ -619,6 +625,14 @@ export class CrawlerHost extends RPCHost {
619
  return;
620
  }
621
 
 
 
 
 
 
 
 
 
622
  let cache;
623
 
624
  if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
@@ -765,6 +779,10 @@ export class CrawlerHost extends RPCHost {
765
  crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
766
  }
767
 
 
 
 
 
768
  if (opts.injectFrameScript?.length) {
769
  crawlOpts.injectFrameScripts = (await Promise.all(
770
  opts.injectFrameScript.map((x) => {
@@ -792,6 +810,59 @@ export class CrawlerHost extends RPCHost {
792
  return crawlOpts;
793
  }
794
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
795
  async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
796
  const it = this.cachedScrap(url, { ...opts, minIntervalMs: 500 });
797
 
 
23
  import { JSDomControl } from '../services/jsdom';
24
  import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
25
  import { CurlControl } from '../services/curl';
26
+ import { VlmControl } from '../services/vlm';
27
 
28
  export interface ExtraScrappingOptions extends ScrappingOptions {
29
  withIframe?: boolean | 'quoted';
 
58
  protected globalLogger: Logger,
59
  protected puppeteerControl: PuppeteerControl,
60
  protected curlControl: CurlControl,
61
+ protected vlmControl: VlmControl,
62
  protected jsdomControl: JSDomControl,
63
  protected snapshotFormatter: SnapshotFormatter,
64
  protected firebaseObjectStorage: FirebaseStorageBucketControl,
 
283
  continue;
284
  }
285
 
286
+ const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
287
  chargeAmount = this.assignChargeAmount(formatted);
288
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
289
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
 
313
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
314
  for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
315
  lastScrapped = scrapped;
316
+ if (!crawlerOptions.isEarlyReturnApplicable()) {
317
+ continue;
318
+ }
319
  if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
320
  continue;
321
  }
322
 
323
+ const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
324
  chargeAmount = this.assignChargeAmount(formatted);
325
 
326
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
327
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
328
  }
329
 
330
+ if (scrapped?.pdfs?.length && !chargeAmount) {
331
+ continue;
332
  }
333
 
334
+ return formatted;
 
 
335
  }
336
 
337
  if (!lastScrapped) {
 
341
  throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
342
  }
343
 
344
+ const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
345
  chargeAmount = this.assignChargeAmount(formatted);
346
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
347
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
 
359
 
360
  for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
361
  lastScrapped = scrapped;
362
+
363
+ if (!crawlerOptions.isEarlyReturnApplicable()) {
364
+ continue;
365
+ }
366
+
367
  if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
368
  continue;
369
  }
370
 
371
+ const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
372
  chargeAmount = this.assignChargeAmount(formatted);
373
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
374
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
375
  }
376
 
377
+ if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
 
 
 
 
 
 
 
378
 
379
+ return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
380
+ { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
381
+ );
382
+ }
383
+ if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
384
 
385
+ return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
386
+ { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
387
+ );
388
  }
389
+
390
+ return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
391
  }
392
 
393
  if (!lastScrapped) {
 
397
  throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
398
  }
399
 
400
+ const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
401
  chargeAmount = this.assignChargeAmount(formatted);
402
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
403
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
 
625
  return;
626
  }
627
 
628
+ if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
629
+ const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, crawlOpts, crawlerOpts);
630
+
631
+ yield* this.vlmControl.fromBrowserSnapshot(finalBrowserSnapshot);
632
+
633
+ return;
634
+ }
635
+
636
  let cache;
637
 
638
  if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
 
779
  crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
780
  }
781
 
782
+ if (opts.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
783
+ crawlOpts.favorScreenshot = true;
784
+ }
785
+
786
  if (opts.injectFrameScript?.length) {
787
  crawlOpts.injectFrameScripts = (await Promise.all(
788
  opts.injectFrameScript.map((x) => {
 
810
  return crawlOpts;
811
  }
812
 
813
/**
 * Formats a page snapshot into the response payload for the crawl endpoints.
 *
 * When `crawlerOptions.engine` (compared case-insensitively) is the VLM engine,
 * the payload is assembled here directly from the snapshot; otherwise the call
 * is forwarded to `snapshotFormatter.formatSnapshot` using
 * `crawlerOptions.respondWith`.
 *
 * @param crawlerOptions - Request options; only `engine` and `respondWith` are read here.
 * @param snapshot - Snapshot to format, optionally carrying screenshot/pageshot URLs.
 * @param nominalUrl - Forwarded to the snapshot formatter; unused in the VLM branch.
 * @param urlValidMs - Forwarded to the snapshot formatter; unused in the VLM branch.
 * @returns In the VLM branch, a `FormattedPage` object returned synchronously;
 *          otherwise whatever `snapshotFormatter.formatSnapshot` returns.
 *          Call sites shown in this diff `await` the result in both cases.
 */
+ formatSnapshot(
814
+ crawlerOptions: CrawlerOptions,
815
+ snapshot: PageSnapshot & {
816
+ screenshotUrl?: string;
817
+ pageshotUrl?: string;
818
+ },
819
+ nominalUrl?: URL,
820
+ urlValidMs?: number
821
+ ) {
822
+ if (crawlerOptions.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
// VLM branch: the text is taken from `snapshot.parsed.textContent` and exposed as `content`.
823
+ const output: FormattedPage = {
824
+ title: snapshot.title,
825
+ content: snapshot.parsed?.textContent,
826
+ url: snapshot.href,
827
+ pageshotUrl: snapshot.pageshotUrl,
// No-op disposer: this object holds nothing that needs releasing.
828
+ [Symbol.dispose]: () => undefined,
829
+ };
830
+
// `textRepresentation` is defined as non-enumerable, so it is skipped by property
// enumeration while still being readable via `formatted.textRepresentation`.
831
+ Object.defineProperty(output, 'textRepresentation', {
832
+ value: snapshot.parsed?.textContent,
833
+ enumerable: false,
834
+ });
835
+
836
+ return output;
837
+ }
838
+
// Every other engine goes through the shared snapshot formatter.
839
+ return this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, snapshot, nominalUrl, urlValidMs);
840
+ }
841
+
842
/**
 * Drains `cachedScrap` for `url` and resolves with the last snapshot it yielded.
 *
 * The engine is overridden to `ENGINE_TYPE.BROWSER` for this inner scrape.
 *
 * @param url - Page to scrape.
 * @param opts - Scrapping options; spread into the inner call with `engine` replaced.
 * @param crawlerOptions - Passed through to `cachedScrap` unchanged.
 * @returns The last snapshot produced by the iterator.
 * @throws The error raised during iteration, if no snapshot was produced before it.
 * @throws AssertionFailureError if iteration finished without any snapshot and without error.
 *
 * NOTE(review): the declared return type includes `undefined`, but every path
 * here either returns a truthy snapshot or throws.
 */
+ async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
// NOTE(review): forcing the browser engine presumably keeps this call from re-entering
// the VLM branch of `cachedScrap`, which is what calls this method - confirm.
843
+ const it = this.cachedScrap(url, { ...opts, engine: ENGINE_TYPE.BROWSER }, crawlerOptions);
844
+
845
+ let lastSnapshot;
846
+ let lastError;
// Consume the whole iterator; only the most recent snapshot is kept.
847
+ try {
848
+ for await (const x of it) {
849
+ lastSnapshot = x;
850
+ }
851
+ } catch (err) {
// Remember the failure instead of throwing right away: a snapshot captured
// before the error is still returned below.
852
+ lastError = err;
853
+ }
854
+
// The error is surfaced only when nothing usable was captured; otherwise it is dropped.
855
+ if (!lastSnapshot && lastError) {
856
+ throw lastError;
857
+ }
858
+
859
+ if (!lastSnapshot) {
860
+ throw new AssertionFailureError(`No content available`);
861
+ }
862
+
863
+ return lastSnapshot;
864
+ }
865
+
866
  async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
867
  const it = this.cachedScrap(url, { ...opts, minIntervalMs: 500 });
868
 
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -461,6 +461,9 @@ export class CrawlerOptions extends AutoCastable {
461
  if (this.injectFrameScript?.length || this.injectPageScript?.length) {
462
  return false;
463
  }
 
 
 
464
 
465
  return true;
466
  }
 
461
  if (this.injectFrameScript?.length || this.injectPageScript?.length) {
462
  return false;
463
  }
464
+ if (this.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
465
+ return false;
466
+ }
467
 
468
  return true;
469
  }
backend/functions/src/services/alt-text.ts CHANGED
@@ -33,9 +33,10 @@ export class AltTextService extends AsyncService {
33
  const resized = this.canvasService.fitImageToSquareBox(img, 1024);
34
  const exported = await this.canvasService.canvasToBuffer(resized, 'image/png');
35
 
36
- const r = await this.imageInterrogator.interrogate('blip2', {
37
  image: exported,
38
- // prompt: `A formal caption in one sentence, concise and in the third person: HTML <img> alt text of this image. Return "**NSFW**" if you don't feel comfortable captioning it.`
 
39
  });
40
 
41
  return r.replaceAll(/[\n\"]|(\.\s*$)/g, '').trim();
 
33
  const resized = this.canvasService.fitImageToSquareBox(img, 1024);
34
  const exported = await this.canvasService.canvasToBuffer(resized, 'image/png');
35
 
36
+ const r = await this.imageInterrogator.interrogate('vertex-gemini-1.5-flash-002', {
37
  image: exported,
38
+ prompt: `Yield a concise image caption sentence in third person.`,
39
+ system: 'You are BLIP2, an image caption model.',
40
  });
41
 
42
  return r.replaceAll(/[\n\"]|(\.\s*$)/g, '').trim();
backend/functions/src/services/vlm.ts ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { AsyncService } from 'civkit/async-service';
2
+ import { singleton } from 'tsyringe';
3
+
4
+ import { PageSnapshot } from './puppeteer';
5
+ import { Logger } from '../shared/services/logger';
6
+ import _ from 'lodash';
7
+ import { AssertionFailureError } from 'civkit';
8
+ import { LLMManager } from '../shared/services/common-llm';
9
+
10
/**
 * Singleton service that converts a browser page snapshot's pageshot into
 * text by streaming it through the 'vertex-gemini-1.5-flash-002' model via `LLMManager`.
 */
+ @singleton()
11
+ export class VlmControl extends AsyncService {
12
+
// Child logger tagged with this service's class name.
13
+ logger = this.globalLogger.child({ service: this.constructor.name });
14
+
15
+ constructor(
16
+ protected globalLogger: Logger,
17
+ protected commonLLM: LLMManager
18
+ ) {
19
+ super(...arguments);
20
+ }
21
+
// Waits for `dependencyReady()` and then emits 'ready'.
22
+ override async init() {
23
+ await this.dependencyReady();
24
+
25
+ this.emit('ready');
26
+ }
27
+
/**
 * Async generator that streams the model's output for a page snapshot.
 *
 * @param snapshot - Browser snapshot; its `pageshotUrl` is preferred, falling back to `pageshot`.
 * @yields A copy of `snapshot` whose `parsed.textContent` holds all text received so far,
 *         so each yielded value supersedes the previous one.
 * @throws AssertionFailureError when the snapshot has neither `pageshotUrl` nor `pageshot`.
 */
28
+ async* fromBrowserSnapshot(snapshot?: PageSnapshot & {
29
+ pageshotUrl?: string,
30
+ }) {
31
+ const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot;
32
+
33
+ if (!pageshot) {
34
+ throw new AssertionFailureError('Screenshot of the page is not available');
35
+ }
36
+
37
+ const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
38
+ prompt: [
// A string pageshot is wrapped in `URL`; any other value is passed to the model as-is.
39
+ typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
40
+ `Convert this webpage screenshot into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
41
+ ],
42
+
43
+ options: {
44
+ system: 'You are Reader-LM-v7, an OCR model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
45
+ stream: true
46
+ }
47
+ });
48
+
// Accumulates streamed text pieces; the full list is re-joined on every iteration below.
49
+ const chunks: string[] = [];
50
+ for await (const txt of it) {
51
+ chunks.push(txt);
52
+ const output: PageSnapshot = {
53
+ ...snapshot,
54
+ parsed: {
55
+ ...snapshot?.parsed,
56
+ textContent: chunks.join(''),
57
+ }
58
+ };
59
+ yield output;
60
+ }
61
+
62
+ return;
63
+ }
64
+ }
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 439f633d464f3fd5fe288313766a43163190b60f
 
1
+ Subproject commit a17e58017ee2075edeef79893fc1bf398eeb99d0