Spaces:
Build error
Build error
feat: new lm engine
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -23,7 +23,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-
|
|
| 23 |
import { JSDomControl } from '../services/jsdom';
|
| 24 |
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
|
| 25 |
import { CurlControl } from '../services/curl';
|
| 26 |
-
import { VlmControl } from '../services/vlm';
|
| 27 |
|
| 28 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 29 |
withIframe?: boolean | 'quoted';
|
|
@@ -58,7 +58,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 58 |
protected globalLogger: Logger,
|
| 59 |
protected puppeteerControl: PuppeteerControl,
|
| 60 |
protected curlControl: CurlControl,
|
| 61 |
-
protected vlmControl: VlmControl,
|
| 62 |
protected jsdomControl: JSDomControl,
|
| 63 |
protected snapshotFormatter: SnapshotFormatter,
|
| 64 |
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
|
@@ -284,7 +284,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 284 |
}
|
| 285 |
|
| 286 |
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
| 287 |
-
chargeAmount = this.assignChargeAmount(formatted);
|
| 288 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 289 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 290 |
}
|
|
@@ -321,7 +321,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 321 |
}
|
| 322 |
|
| 323 |
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
| 324 |
-
chargeAmount = this.assignChargeAmount(formatted);
|
| 325 |
|
| 326 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 327 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
|
@@ -342,7 +342,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 342 |
}
|
| 343 |
|
| 344 |
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
|
| 345 |
-
chargeAmount = this.assignChargeAmount(formatted);
|
| 346 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 347 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 348 |
}
|
|
@@ -369,7 +369,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 369 |
}
|
| 370 |
|
| 371 |
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
| 372 |
-
chargeAmount = this.assignChargeAmount(formatted);
|
| 373 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 374 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 375 |
}
|
|
@@ -398,7 +398,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 398 |
}
|
| 399 |
|
| 400 |
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
|
| 401 |
-
chargeAmount = this.assignChargeAmount(formatted);
|
| 402 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 403 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 404 |
}
|
|
@@ -625,10 +625,45 @@ export class CrawlerHost extends RPCHost {
|
|
| 625 |
return;
|
| 626 |
}
|
| 627 |
|
| 628 |
-
if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
|
| 629 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 630 |
|
| 631 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 632 |
|
| 633 |
return;
|
| 634 |
}
|
|
@@ -669,14 +704,18 @@ export class CrawlerHost extends RPCHost {
|
|
| 669 |
}
|
| 670 |
}
|
| 671 |
|
| 672 |
-
assignChargeAmount(formatted: FormattedPage) {
|
| 673 |
if (!formatted) {
|
| 674 |
return 0;
|
| 675 |
}
|
| 676 |
|
| 677 |
let amount = 0;
|
| 678 |
if (formatted.content) {
|
| 679 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 680 |
} else if (formatted.description) {
|
| 681 |
amount += estimateToken(formatted.description);
|
| 682 |
}
|
|
@@ -819,7 +858,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 819 |
nominalUrl?: URL,
|
| 820 |
urlValidMs?: number
|
| 821 |
) {
|
| 822 |
-
if (crawlerOptions.engine?.toLowerCase()
|
| 823 |
const output: FormattedPage = {
|
| 824 |
title: snapshot.title,
|
| 825 |
content: snapshot.parsed?.textContent,
|
|
|
|
| 23 |
import { JSDomControl } from '../services/jsdom';
|
| 24 |
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
|
| 25 |
import { CurlControl } from '../services/curl';
|
| 26 |
+
import { LmControl } from '../services/lm';
|
| 27 |
|
| 28 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 29 |
withIframe?: boolean | 'quoted';
|
|
|
|
| 58 |
protected globalLogger: Logger,
|
| 59 |
protected puppeteerControl: PuppeteerControl,
|
| 60 |
protected curlControl: CurlControl,
|
| 61 |
+
protected lmControl: LmControl,
|
| 62 |
protected jsdomControl: JSDomControl,
|
| 63 |
protected snapshotFormatter: SnapshotFormatter,
|
| 64 |
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
|
|
|
| 284 |
}
|
| 285 |
|
| 286 |
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
| 287 |
+
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
| 288 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 289 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 290 |
}
|
|
|
|
| 321 |
}
|
| 322 |
|
| 323 |
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
| 324 |
+
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
| 325 |
|
| 326 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 327 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
|
|
|
| 342 |
}
|
| 343 |
|
| 344 |
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
|
| 345 |
+
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
| 346 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 347 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 348 |
}
|
|
|
|
| 369 |
}
|
| 370 |
|
| 371 |
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
| 372 |
+
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
| 373 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 374 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 375 |
}
|
|
|
|
| 398 |
}
|
| 399 |
|
| 400 |
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
|
| 401 |
+
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
| 402 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 403 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 404 |
}
|
|
|
|
| 625 |
return;
|
| 626 |
}
|
| 627 |
|
| 628 |
+
// if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
|
| 629 |
+
// const rmSelectorEquivalent = [];
|
| 630 |
+
// if (typeof crawlOpts.removeSelector === 'string') {
|
| 631 |
+
// rmSelectorEquivalent.push(crawlOpts.removeSelector);
|
| 632 |
+
// } else if (Array.isArray(crawlOpts.removeSelector)) {
|
| 633 |
+
// rmSelectorEquivalent.push(...crawlOpts.removeSelector);
|
| 634 |
+
// }
|
| 635 |
+
// rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option,header,footer,nav');
|
| 636 |
|
| 637 |
+
// const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, {
|
| 638 |
+
// ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: ENGINE_TYPE.BROWSER
|
| 639 |
+
// }, crawlerOpts);
|
| 640 |
+
|
| 641 |
+
// yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot);
|
| 642 |
+
|
| 643 |
+
// return;
|
| 644 |
+
// }
|
| 645 |
+
|
| 646 |
+
// ReaderLM engine: take a regular snapshot first, then let the language model
// turn its HTML into either structured output or Markdown.
if (crawlOpts?.engine === ENGINE_TYPE.READER_LM) {
    // Merge the caller-supplied removeSelector (string or array form) with the
    // elements that are always stripped before the HTML is handed to the model.
    const rmSelectorEquivalent = [];
    if (typeof crawlOpts.removeSelector === 'string') {
        rmSelectorEquivalent.push(crawlOpts.removeSelector);
    } else if (Array.isArray(crawlOpts.removeSelector)) {
        rmSelectorEquivalent.push(...crawlOpts.removeSelector);
    }
    rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option');

    // `engine: undefined` so the snapshot itself is not taken through this
    // READER_LM branch again.
    const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
        ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: undefined
    }, crawlerOpts);

    if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
        const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
        // readerLMFromSnapshot is declared as (schema, instruction, snapshot):
        // the schema goes first. Passing the instruction first would send the
        // instruction as the schema and the schema as the instruction, and an
        // undefined instruction must stay in the second slot so the method's
        // default instruction applies.
        yield* this.lmControl.readerLMFromSnapshot(jsonSchema, crawlerOpts.instruction, finalAutoSnapshot);

        return;
    }

    // No instruction and no schema: plain HTML-to-Markdown conversion.
    yield* this.lmControl.readerLMMarkdownFromSnapshot(finalAutoSnapshot);

    return;
}
|
|
|
|
| 704 |
}
|
| 705 |
}
|
| 706 |
|
| 707 |
+
assignChargeAmount(formatted: FormattedPage, scrappingOptions?: ExtraScrappingOptions) {
|
| 708 |
if (!formatted) {
|
| 709 |
return 0;
|
| 710 |
}
|
| 711 |
|
| 712 |
let amount = 0;
|
| 713 |
if (formatted.content) {
|
| 714 |
+
const x1 = estimateToken(formatted.content);
|
| 715 |
+
if (scrappingOptions?.engine?.toLowerCase().includes('lm')) {
|
| 716 |
+
amount += x1 * 2;
|
| 717 |
+
}
|
| 718 |
+
amount += x1;
|
| 719 |
} else if (formatted.description) {
|
| 720 |
amount += estimateToken(formatted.description);
|
| 721 |
}
|
|
|
|
| 858 |
nominalUrl?: URL,
|
| 859 |
urlValidMs?: number
|
| 860 |
) {
|
| 861 |
+
if (crawlerOptions.engine?.toLowerCase().includes('lm')) {
|
| 862 |
const output: FormattedPage = {
|
| 863 |
title: snapshot.title,
|
| 864 |
content: snapshot.parsed?.textContent,
|
backend/functions/src/dto/scrapping-options.ts
CHANGED
|
@@ -15,6 +15,7 @@ export enum ENGINE_TYPE {
|
|
| 15 |
BROWSER = 'browser',
|
| 16 |
DIRECT = 'direct',
|
| 17 |
VLM = 'vlm',
|
|
|
|
| 18 |
}
|
| 19 |
|
| 20 |
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
|
@@ -188,7 +189,7 @@ class Viewport extends AutoCastable {
|
|
| 188 |
schema: { type: 'string' }
|
| 189 |
},
|
| 190 |
'X-Engine': {
|
| 191 |
-
description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm',
|
| 192 |
in: 'header',
|
| 193 |
schema: { type: 'string' }
|
| 194 |
},
|
|
@@ -317,6 +318,12 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 317 |
@Prop()
|
| 318 |
viewport?: Viewport;
|
| 319 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
static override from(input: any) {
|
| 321 |
const instance = super.from(input) as CrawlerOptions;
|
| 322 |
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
|
@@ -461,7 +468,7 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 461 |
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
| 462 |
return false;
|
| 463 |
}
|
| 464 |
-
if (this.engine?.toLowerCase()
|
| 465 |
return false;
|
| 466 |
}
|
| 467 |
|
|
|
|
| 15 |
BROWSER = 'browser',
|
| 16 |
DIRECT = 'direct',
|
| 17 |
VLM = 'vlm',
|
| 18 |
+
READER_LM = 'readerlm-v2',
|
| 19 |
}
|
| 20 |
|
| 21 |
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
|
|
|
| 189 |
schema: { type: 'string' }
|
| 190 |
},
|
| 191 |
'X-Engine': {
|
| 192 |
+
description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm, readerlm-v2',
|
| 193 |
in: 'header',
|
| 194 |
schema: { type: 'string' }
|
| 195 |
},
|
|
|
|
| 318 |
@Prop()
|
| 319 |
viewport?: Viewport;
|
| 320 |
|
| 321 |
+
@Prop()
|
| 322 |
+
instruction?: string;
|
| 323 |
+
|
| 324 |
+
@Prop()
|
| 325 |
+
jsonSchema?: object;
|
| 326 |
+
|
| 327 |
static override from(input: any) {
|
| 328 |
const instance = super.from(input) as CrawlerOptions;
|
| 329 |
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
|
|
|
| 468 |
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
| 469 |
return false;
|
| 470 |
}
|
| 471 |
+
if (this.engine?.toLowerCase().includes('lm')) {
|
| 472 |
return false;
|
| 473 |
}
|
| 474 |
|
backend/functions/src/services/{vlm.ts → lm.ts}
RENAMED
|
@@ -7,8 +7,10 @@ import _ from 'lodash';
|
|
| 7 |
import { AssertionFailureError } from 'civkit';
|
| 8 |
import { LLMManager } from '../shared/services/common-llm';
|
| 9 |
|
|
|
|
|
|
|
| 10 |
@singleton()
|
| 11 |
-
export class VlmControl extends AsyncService {
|
| 12 |
|
| 13 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 14 |
|
|
@@ -25,7 +27,14 @@ export class VlmControl extends AsyncService {
|
|
| 25 |
this.emit('ready');
|
| 26 |
}
|
| 27 |
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
pageshotUrl?: string,
|
| 30 |
}) {
|
| 31 |
const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot;
|
|
@@ -36,12 +45,70 @@ export class VlmControl extends AsyncService {
|
|
| 36 |
|
| 37 |
const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
|
| 38 |
prompt: [
|
|
|
|
| 39 |
typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
|
| 40 |
-
`Convert this webpage
|
| 41 |
],
|
| 42 |
|
| 43 |
options: {
|
| 44 |
-
system: 'You are
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
stream: true
|
| 46 |
}
|
| 47 |
});
|
|
|
|
| 7 |
import { AssertionFailureError } from 'civkit';
|
| 8 |
import { LLMManager } from '../shared/services/common-llm';
|
| 9 |
|
| 10 |
+
const tripleBackTick = '```';
|
| 11 |
+
|
| 12 |
@singleton()
|
| 13 |
+
export class LmControl extends AsyncService {
|
| 14 |
|
| 15 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 16 |
|
|
|
|
| 27 |
this.emit('ready');
|
| 28 |
}
|
| 29 |
|
| 30 |
+
/**
 * Collapse runs of blank lines in `text`.
 *
 * The input is split on LF or CRLF. A line is kept when it has
 * non-whitespace content, or when the line immediately before it does;
 * every other blank (or whitespace-only) line is dropped. Kept lines are
 * re-joined with `\n`, so CRLF line endings come back as LF.
 *
 * @param text - The text to clean up.
 * @returns The text with leading blank lines removed and each run of blank
 *          lines reduced to at most one.
 */
cleanRedundantEmptyLines(text: string) {
    const kept: string[] = [];
    // Nothing precedes the first line, so it is only kept if it has content.
    let previousHadContent = false;

    for (const row of text.split(/\r?\n/g)) {
        const hasContent = row.trim().length > 0;
        if (hasContent || previousHadContent) {
            kept.push(row);
        }
        previousHadContent = hasContent;
    }

    return kept.join('\n');
}
|
| 36 |
+
|
| 37 |
+
async* geminiFromBrowserSnapshot(snapshot?: PageSnapshot & {
|
| 38 |
pageshotUrl?: string,
|
| 39 |
}) {
|
| 40 |
const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot;
|
|
|
|
| 45 |
|
| 46 |
const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
|
| 47 |
prompt: [
|
| 48 |
+
`HTML: \n${this.cleanRedundantEmptyLines(snapshot.html)}\n\nSCREENSHOT: \n`,
|
| 49 |
typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
|
| 50 |
+
`Convert this webpage into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
|
| 51 |
],
|
| 52 |
|
| 53 |
options: {
|
| 54 |
+
system: 'You are ReaderLM-v7, a model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
|
| 55 |
+
stream: true
|
| 56 |
+
}
|
| 57 |
+
});
|
| 58 |
+
|
| 59 |
+
const chunks: string[] = [];
|
| 60 |
+
for await (const txt of it) {
|
| 61 |
+
chunks.push(txt);
|
| 62 |
+
const output: PageSnapshot = {
|
| 63 |
+
...snapshot,
|
| 64 |
+
parsed: {
|
| 65 |
+
...snapshot?.parsed,
|
| 66 |
+
textContent: chunks.join(''),
|
| 67 |
+
}
|
| 68 |
+
};
|
| 69 |
+
yield output;
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
return;
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
/**
 * Ask the 'readerlm-v2' model to convert a page snapshot's HTML to Markdown
 * and stream the result.
 *
 * The snapshot HTML is passed through `cleanRedundantEmptyLines` and embedded
 * in a fenced `html` block inside the prompt. For every chunk received from
 * the model, a copy of the snapshot is yielded whose `parsed.textContent`
 * holds all chunks received so far, concatenated.
 *
 * @param snapshot - The page snapshot to convert.
 * @throws AssertionFailureError when no snapshot is supplied.
 */
async* readerLMMarkdownFromSnapshot(snapshot?: PageSnapshot) {
    if (!snapshot) {
        throw new AssertionFailureError('Snapshot of the page is not available');
    }

    const cleanedHtml = this.cleanRedundantEmptyLines(snapshot.html);
    const prompt = `Extract the main content from the given HTML and convert it to Markdown format.\n\n${tripleBackTick}html\n${cleanedHtml}\n${tripleBackTick}\n`;

    const stream = this.commonLLM.iterRun('readerlm-v2', {
        prompt,

        options: {
            // system: 'You are an AI assistant developed by Jina AI',
            stream: true
        }
    });

    const received: string[] = [];
    for await (const piece of stream) {
        received.push(piece);
        // Each yield carries the full text so far, not just the newest chunk.
        const partial: PageSnapshot = {
            ...snapshot,
            parsed: {
                ...snapshot.parsed,
                textContent: received.join(''),
            }
        };
        yield partial;
    }

    return;
}
|
| 103 |
+
|
| 104 |
+
async* readerLMFromSnapshot(schema?: string, instruction: string = 'Infer useful information from the HTML and present it in a structured JSON object.', snapshot?: PageSnapshot) {
|
| 105 |
+
if (!snapshot) {
|
| 106 |
+
throw new AssertionFailureError('Snapshot of the page is not available');
|
| 107 |
+
}
|
| 108 |
+
const it = this.commonLLM.iterRun('readerlm-v2', {
|
| 109 |
+
prompt: `${instruction}\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n${schema ? `The JSON schema:\n${tripleBackTick}json\n${schema}\n${tripleBackTick}\n` : ''}`,
|
| 110 |
+
options: {
|
| 111 |
+
// system: 'You are an AI assistant developed by Jina AI',
|
| 112 |
stream: true
|
| 113 |
}
|
| 114 |
});
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit ee28974871e4d68c53ff82aca6cfdef8ed19a26f
|