Spaces:
Build error
Build error
fix: lm and related options
Browse files- backend/functions/package-lock.json +5 -4
- backend/functions/package.json +1 -1
- backend/functions/src/cloud-functions/crawler.ts +60 -83
- backend/functions/src/dto/scrapping-options.ts +51 -6
- backend/functions/src/services/curl.ts +2 -2
- backend/functions/src/services/jsdom.ts +57 -12
- backend/functions/src/services/lm.ts +14 -11
- backend/functions/src/services/puppeteer.ts +1 -1
- backend/functions/src/services/snapshot-formatter.ts +31 -8
- backend/functions/src/utils/tailwind-classes.ts +0 -0
backend/functions/package-lock.json
CHANGED
|
@@ -16,7 +16,7 @@
|
|
| 16 |
"axios": "^1.3.3",
|
| 17 |
"bcrypt": "^5.1.0",
|
| 18 |
"busboy": "^1.6.0",
|
| 19 |
-
"civkit": "^0.8.2-
|
| 20 |
"core-js": "^3.37.1",
|
| 21 |
"cors": "^2.8.5",
|
| 22 |
"dayjs": "^1.11.9",
|
|
@@ -3979,9 +3979,10 @@
|
|
| 3979 |
}
|
| 3980 |
},
|
| 3981 |
"node_modules/civkit": {
|
| 3982 |
-
"version": "0.8.2-
|
| 3983 |
-
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-
|
| 3984 |
-
"integrity": "sha512-
|
|
|
|
| 3985 |
"dependencies": {
|
| 3986 |
"lodash": "^4.17.21",
|
| 3987 |
"tslib": "^2.5.0"
|
|
|
|
| 16 |
"axios": "^1.3.3",
|
| 17 |
"bcrypt": "^5.1.0",
|
| 18 |
"busboy": "^1.6.0",
|
| 19 |
+
"civkit": "^0.8.2-4c0357a",
|
| 20 |
"core-js": "^3.37.1",
|
| 21 |
"cors": "^2.8.5",
|
| 22 |
"dayjs": "^1.11.9",
|
|
|
|
| 3979 |
}
|
| 3980 |
},
|
| 3981 |
"node_modules/civkit": {
|
| 3982 |
+
"version": "0.8.2-4c0357a",
|
| 3983 |
+
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-4c0357a.tgz",
|
| 3984 |
+
"integrity": "sha512-8/RcapAm8YYImf+YVBRhybEFuSuV5Pg1p/s6Niql3VAY2cV1/OC1fTCDZY689yeq8zFcwxwBvaqyIEGo69F+IA==",
|
| 3985 |
+
"license": "AGPL",
|
| 3986 |
"dependencies": {
|
| 3987 |
"lodash": "^4.17.21",
|
| 3988 |
"tslib": "^2.5.0"
|
backend/functions/package.json
CHANGED
|
@@ -36,7 +36,7 @@
|
|
| 36 |
"axios": "^1.3.3",
|
| 37 |
"bcrypt": "^5.1.0",
|
| 38 |
"busboy": "^1.6.0",
|
| 39 |
-
"civkit": "^0.8.2-
|
| 40 |
"core-js": "^3.37.1",
|
| 41 |
"cors": "^2.8.5",
|
| 42 |
"dayjs": "^1.11.9",
|
|
|
|
| 36 |
"axios": "^1.3.3",
|
| 37 |
"bcrypt": "^5.1.0",
|
| 38 |
"busboy": "^1.6.0",
|
| 39 |
+
"civkit": "^0.8.2-4c0357a",
|
| 40 |
"core-js": "^3.37.1",
|
| 41 |
"cors": "^2.8.5",
|
| 42 |
"dayjs": "^1.11.9",
|
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -15,7 +15,7 @@ import { randomUUID } from 'crypto';
|
|
| 15 |
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
| 16 |
|
| 17 |
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
| 18 |
-
import { CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/scrapping-options';
|
| 19 |
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
| 20 |
import { DomainBlockade } from '../db/domain-blockade';
|
| 21 |
import { DomainProfile } from '../db/domain-profile';
|
|
@@ -84,14 +84,6 @@ export class CrawlerHost extends RPCHost {
|
|
| 84 |
Reflect.set(snapshot, 'locale', options.locale);
|
| 85 |
}
|
| 86 |
await this.setToCache(options.url, snapshot);
|
| 87 |
-
|
| 88 |
-
if (!options.engine) {
|
| 89 |
-
try {
|
| 90 |
-
await this.exploreDirectEngine(options.url, options, snapshot);
|
| 91 |
-
} catch (err) {
|
| 92 |
-
this.logger.warn(`Failed to explore direct engine option for ${options.url.href}`, { err });
|
| 93 |
-
}
|
| 94 |
-
}
|
| 95 |
});
|
| 96 |
|
| 97 |
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
|
|
@@ -152,8 +144,8 @@ export class CrawlerHost extends RPCHost {
|
|
| 152 |
memory: '4GiB',
|
| 153 |
cpu: 2,
|
| 154 |
timeoutSeconds: 300,
|
| 155 |
-
concurrency:
|
| 156 |
-
maxInstances:
|
| 157 |
minInstances: 1,
|
| 158 |
},
|
| 159 |
tags: ['Crawler'],
|
|
@@ -260,25 +252,12 @@ export class CrawlerHost extends RPCHost {
|
|
| 260 |
|
| 261 |
|
| 262 |
const crawlOpts = await this.configure(crawlerOptions);
|
| 263 |
-
|
| 264 |
-
if (!crawlOpts.engine) {
|
| 265 |
-
const domainProfile = (await DomainProfile.fromFirestoreQuery(
|
| 266 |
-
DomainProfile.COLLECTION
|
| 267 |
-
.where('origin', '==', targetUrl.origin.toLowerCase())
|
| 268 |
-
.limit(1)
|
| 269 |
-
))[0];
|
| 270 |
-
|
| 271 |
-
if (domainProfile?.engine) {
|
| 272 |
-
crawlOpts.engine = domainProfile.engine;
|
| 273 |
-
}
|
| 274 |
-
}
|
| 275 |
-
|
| 276 |
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
| 277 |
const sseStream = new OutputServerEventStream();
|
| 278 |
rpcReflect.return(sseStream);
|
| 279 |
|
| 280 |
try {
|
| 281 |
-
for await (const scrapped of this.
|
| 282 |
if (!scrapped) {
|
| 283 |
continue;
|
| 284 |
}
|
|
@@ -311,7 +290,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 311 |
|
| 312 |
let lastScrapped;
|
| 313 |
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
| 314 |
-
for await (const scrapped of this.
|
| 315 |
lastScrapped = scrapped;
|
| 316 |
if (!crawlerOptions.isEarlyReturnApplicable()) {
|
| 317 |
continue;
|
|
@@ -357,7 +336,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 357 |
});
|
| 358 |
}
|
| 359 |
|
| 360 |
-
for await (const scrapped of this.
|
| 361 |
lastScrapped = scrapped;
|
| 362 |
|
| 363 |
if (!crawlerOptions.isEarlyReturnApplicable()) {
|
|
@@ -589,82 +568,78 @@ export class CrawlerHost extends RPCHost {
|
|
| 589 |
return r;
|
| 590 |
}
|
| 591 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 592 |
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
|
| 593 |
-
let overrideFinalSnapshot;
|
| 594 |
if (crawlerOpts?.html) {
|
| 595 |
-
|
| 596 |
href: urlToCrawl.toString(),
|
| 597 |
html: crawlerOpts.html,
|
| 598 |
title: '',
|
| 599 |
text: '',
|
| 600 |
} as PageSnapshot;
|
|
|
|
|
|
|
|
|
|
| 601 |
}
|
| 602 |
|
| 603 |
if (crawlerOpts?.pdf) {
|
| 604 |
const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
|
| 605 |
const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
|
| 606 |
-
|
| 607 |
href: urlToCrawl.toString(),
|
| 608 |
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
|
| 609 |
title: '',
|
| 610 |
text: '',
|
| 611 |
pdfs: [pdfDataUrl],
|
| 612 |
} as PageSnapshot;
|
| 613 |
-
}
|
| 614 |
|
| 615 |
-
|
| 616 |
-
yield this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
|
| 617 |
|
| 618 |
return;
|
| 619 |
}
|
| 620 |
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
// rmSelectorEquivalent.push(...crawlOpts.removeSelector);
|
| 627 |
-
// }
|
| 628 |
-
// rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option,header,footer,nav');
|
| 629 |
-
|
| 630 |
-
// const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, {
|
| 631 |
-
// ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: ENGINE_TYPE.BROWSER
|
| 632 |
-
// }, crawlerOpts);
|
| 633 |
-
|
| 634 |
-
// yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot);
|
| 635 |
-
|
| 636 |
-
// return;
|
| 637 |
-
// }
|
| 638 |
-
|
| 639 |
-
if (crawlOpts?.engine === ENGINE_TYPE.READER_LM) {
|
| 640 |
-
const rmSelectorEquivalent = [];
|
| 641 |
-
if (typeof crawlOpts.removeSelector === 'string') {
|
| 642 |
-
rmSelectorEquivalent.push(crawlOpts.removeSelector);
|
| 643 |
-
} else if (Array.isArray(crawlOpts.removeSelector)) {
|
| 644 |
-
rmSelectorEquivalent.push(...crawlOpts.removeSelector);
|
| 645 |
-
}
|
| 646 |
-
rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option');
|
| 647 |
-
|
| 648 |
-
const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
|
| 649 |
-
...crawlOpts, removeSelector: rmSelectorEquivalent, engine: undefined
|
| 650 |
-
}, crawlerOpts);
|
| 651 |
-
|
| 652 |
-
if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
|
| 653 |
-
const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
|
| 654 |
-
yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot);
|
| 655 |
|
| 656 |
return;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 657 |
}
|
| 658 |
-
|
| 659 |
-
yield* this.lmControl.readerLMMarkdownFromSnapshot(finalAutoSnapshot);
|
| 660 |
-
|
| 661 |
-
return;
|
| 662 |
-
}
|
| 663 |
-
|
| 664 |
-
if (overrideFinalSnapshot) {
|
| 665 |
-
yield this.jsdomControl.narrowSnapshot(overrideFinalSnapshot, crawlOpts);
|
| 666 |
-
|
| 667 |
-
return;
|
| 668 |
}
|
| 669 |
|
| 670 |
let cache;
|
|
@@ -857,12 +832,14 @@ export class CrawlerHost extends RPCHost {
|
|
| 857 |
nominalUrl?: URL,
|
| 858 |
urlValidMs?: number
|
| 859 |
) {
|
| 860 |
-
const
|
| 861 |
-
|
|
|
|
|
|
|
| 862 |
const output: FormattedPage = {
|
| 863 |
title: snapshot.title,
|
| 864 |
content: snapshot.parsed?.textContent,
|
| 865 |
-
url: snapshot.href,
|
| 866 |
[Symbol.dispose]: () => undefined,
|
| 867 |
};
|
| 868 |
|
|
@@ -874,7 +851,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 874 |
return output;
|
| 875 |
}
|
| 876 |
|
| 877 |
-
return this.snapshotFormatter.formatSnapshot(
|
| 878 |
}
|
| 879 |
|
| 880 |
async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
|
|
@@ -902,7 +879,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 902 |
}
|
| 903 |
|
| 904 |
async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
|
| 905 |
-
const it = this.
|
| 906 |
|
| 907 |
let lastSnapshot;
|
| 908 |
let goodEnough = false;
|
|
@@ -936,7 +913,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 936 |
}
|
| 937 |
|
| 938 |
async exploreDirectEngine(targetUrl: URL, crawlerOptions: ScrappingOptions, knownSnapshot: PageSnapshot) {
|
| 939 |
-
const snapshot = await this.curlControl.urlToSnapshot(targetUrl, crawlerOptions);
|
| 940 |
|
| 941 |
const thisFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', snapshot);
|
| 942 |
const knownFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', knownSnapshot);
|
|
|
|
| 15 |
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
| 16 |
|
| 17 |
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
| 18 |
+
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/scrapping-options';
|
| 19 |
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
| 20 |
import { DomainBlockade } from '../db/domain-blockade';
|
| 21 |
import { DomainProfile } from '../db/domain-profile';
|
|
|
|
| 84 |
Reflect.set(snapshot, 'locale', options.locale);
|
| 85 |
}
|
| 86 |
await this.setToCache(options.url, snapshot);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
});
|
| 88 |
|
| 89 |
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
|
|
|
|
| 144 |
memory: '4GiB',
|
| 145 |
cpu: 2,
|
| 146 |
timeoutSeconds: 300,
|
| 147 |
+
concurrency: 10,
|
| 148 |
+
maxInstances: 1000,
|
| 149 |
minInstances: 1,
|
| 150 |
},
|
| 151 |
tags: ['Crawler'],
|
|
|
|
| 252 |
|
| 253 |
|
| 254 |
const crawlOpts = await this.configure(crawlerOptions);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
| 256 |
const sseStream = new OutputServerEventStream();
|
| 257 |
rpcReflect.return(sseStream);
|
| 258 |
|
| 259 |
try {
|
| 260 |
+
for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
|
| 261 |
if (!scrapped) {
|
| 262 |
continue;
|
| 263 |
}
|
|
|
|
| 290 |
|
| 291 |
let lastScrapped;
|
| 292 |
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
| 293 |
+
for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
|
| 294 |
lastScrapped = scrapped;
|
| 295 |
if (!crawlerOptions.isEarlyReturnApplicable()) {
|
| 296 |
continue;
|
|
|
|
| 336 |
});
|
| 337 |
}
|
| 338 |
|
| 339 |
+
for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
|
| 340 |
lastScrapped = scrapped;
|
| 341 |
|
| 342 |
if (!crawlerOptions.isEarlyReturnApplicable()) {
|
|
|
|
| 568 |
return r;
|
| 569 |
}
|
| 570 |
|
| 571 |
+
async *iterSnapshots(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
|
| 572 |
+
// if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.VLM)) {
|
| 573 |
+
// const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, {
|
| 574 |
+
// ...crawlOpts, engine: ENGINE_TYPE.BROWSER
|
| 575 |
+
// }, crawlerOpts);
|
| 576 |
+
|
| 577 |
+
// yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot);
|
| 578 |
+
|
| 579 |
+
// return;
|
| 580 |
+
// }
|
| 581 |
+
|
| 582 |
+
if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.READER_LM)) {
|
| 583 |
+
const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
|
| 584 |
+
...crawlOpts, engine: ENGINE_TYPE.AUTO
|
| 585 |
+
}, crawlerOpts);
|
| 586 |
+
|
| 587 |
+
if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
|
| 588 |
+
const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
|
| 589 |
+
yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot);
|
| 590 |
+
|
| 591 |
+
return;
|
| 592 |
+
}
|
| 593 |
+
|
| 594 |
+
yield* this.lmControl.readerLMMarkdownFromSnapshot(finalAutoSnapshot);
|
| 595 |
+
|
| 596 |
+
return;
|
| 597 |
+
}
|
| 598 |
+
|
| 599 |
+
yield* this.cachedScrap(urlToCrawl, crawlOpts, crawlerOpts);
|
| 600 |
+
}
|
| 601 |
+
|
| 602 |
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
|
|
|
|
| 603 |
if (crawlerOpts?.html) {
|
| 604 |
+
const snapshot = {
|
| 605 |
href: urlToCrawl.toString(),
|
| 606 |
html: crawlerOpts.html,
|
| 607 |
title: '',
|
| 608 |
text: '',
|
| 609 |
} as PageSnapshot;
|
| 610 |
+
yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
|
| 611 |
+
|
| 612 |
+
return;
|
| 613 |
}
|
| 614 |
|
| 615 |
if (crawlerOpts?.pdf) {
|
| 616 |
const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
|
| 617 |
const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
|
| 618 |
+
const snapshot = {
|
| 619 |
href: urlToCrawl.toString(),
|
| 620 |
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
|
| 621 |
title: '',
|
| 622 |
text: '',
|
| 623 |
pdfs: [pdfDataUrl],
|
| 624 |
} as PageSnapshot;
|
|
|
|
| 625 |
|
| 626 |
+
yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
|
|
|
|
| 627 |
|
| 628 |
return;
|
| 629 |
}
|
| 630 |
|
| 631 |
+
if (crawlOpts?.engine?.startsWith(ENGINE_TYPE.DIRECT)) {
|
| 632 |
+
const engine = crawlOpts?.engine;
|
| 633 |
+
try {
|
| 634 |
+
const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
|
| 635 |
+
yield snapshot;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 636 |
|
| 637 |
return;
|
| 638 |
+
} catch (err) {
|
| 639 |
+
if (!engine.endsWith('?')) {
|
| 640 |
+
throw err;
|
| 641 |
+
}
|
| 642 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 643 |
}
|
| 644 |
|
| 645 |
let cache;
|
|
|
|
| 832 |
nominalUrl?: URL,
|
| 833 |
urlValidMs?: number
|
| 834 |
) {
|
| 835 |
+
const presumedURL = crawlerOptions.base === 'eventual' ? new URL(snapshot.href) : nominalUrl;
|
| 836 |
+
|
| 837 |
+
const respondWith = crawlerOptions.respondWith;
|
| 838 |
+
if (respondWith === CONTENT_FORMAT.READER_LM || respondWith === CONTENT_FORMAT.VLM) {
|
| 839 |
const output: FormattedPage = {
|
| 840 |
title: snapshot.title,
|
| 841 |
content: snapshot.parsed?.textContent,
|
| 842 |
+
url: presumedURL?.href || snapshot.href,
|
| 843 |
[Symbol.dispose]: () => undefined,
|
| 844 |
};
|
| 845 |
|
|
|
|
| 851 |
return output;
|
| 852 |
}
|
| 853 |
|
| 854 |
+
return this.snapshotFormatter.formatSnapshot(respondWith, snapshot, presumedURL, urlValidMs);
|
| 855 |
}
|
| 856 |
|
| 857 |
async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
|
|
|
|
| 879 |
}
|
| 880 |
|
| 881 |
async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
|
| 882 |
+
const it = this.iterSnapshots(url, { ...opts, minIntervalMs: 500 });
|
| 883 |
|
| 884 |
let lastSnapshot;
|
| 885 |
let goodEnough = false;
|
|
|
|
| 913 |
}
|
| 914 |
|
| 915 |
async exploreDirectEngine(targetUrl: URL, crawlerOptions: ScrappingOptions, knownSnapshot: PageSnapshot) {
|
| 916 |
+
const snapshot = await this.curlControl.urlToSnapshot(targetUrl, crawlerOptions, true);
|
| 917 |
|
| 918 |
const thisFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', snapshot);
|
| 919 |
const knownFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', knownSnapshot);
|
backend/functions/src/dto/scrapping-options.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import { Also, AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
|
| 2 |
import type { Request, Response } from 'express';
|
| 3 |
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
| 4 |
|
|
@@ -9,9 +9,12 @@ export enum CONTENT_FORMAT {
|
|
| 9 |
TEXT = 'text',
|
| 10 |
PAGESHOT = 'pageshot',
|
| 11 |
SCREENSHOT = 'screenshot',
|
|
|
|
|
|
|
| 12 |
}
|
| 13 |
|
| 14 |
export enum ENGINE_TYPE {
|
|
|
|
| 15 |
BROWSER = 'browser',
|
| 16 |
DIRECT = 'direct',
|
| 17 |
VLM = 'vlm',
|
|
@@ -22,6 +25,8 @@ const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
|
| 22 |
|
| 23 |
export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
|
| 24 |
const IMAGE_RETENTION_MODE_VALUES = new Set<string>(IMAGE_RETENTION_MODES);
|
|
|
|
|
|
|
| 25 |
|
| 26 |
class Viewport extends AutoCastable {
|
| 27 |
@Prop({
|
|
@@ -193,6 +198,11 @@ class Viewport extends AutoCastable {
|
|
| 193 |
in: 'header',
|
| 194 |
schema: { type: 'string' }
|
| 195 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
}
|
| 197 |
}
|
| 198 |
}
|
|
@@ -205,6 +215,12 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 205 |
@Prop()
|
| 206 |
html?: string;
|
| 207 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
@Prop({
|
| 209 |
desc: 'Base64 encoded PDF.',
|
| 210 |
type: [File, String]
|
|
@@ -228,7 +244,7 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 228 |
@Prop({
|
| 229 |
default: false,
|
| 230 |
})
|
| 231 |
-
withLinksSummary!: boolean;
|
| 232 |
|
| 233 |
@Prop({
|
| 234 |
default: false,
|
|
@@ -335,6 +351,17 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 335 |
if (customMode !== undefined) {
|
| 336 |
instance.respondWith = customMode;
|
| 337 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
|
| 339 |
const locale = ctx?.req.get('x-locale');
|
| 340 |
if (locale !== undefined) {
|
|
@@ -352,7 +379,11 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 352 |
}
|
| 353 |
const withLinksSummary = ctx?.req.get('x-with-links-summary');
|
| 354 |
if (withLinksSummary !== undefined) {
|
| 355 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 356 |
}
|
| 357 |
const withImagesSummary = ctx?.req.get('x-with-images-summary');
|
| 358 |
if (withImagesSummary !== undefined) {
|
|
@@ -403,8 +434,15 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 403 |
if (engine) {
|
| 404 |
instance.engine = engine;
|
| 405 |
}
|
| 406 |
-
if (instance.
|
| 407 |
-
instance.engine
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
}
|
| 409 |
|
| 410 |
const keepImgDataUrl = ctx?.req.get('x-keep-img-data-url');
|
|
@@ -451,10 +489,17 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 451 |
const tokenBudget = ctx?.req.get('x-token-budget') || undefined;
|
| 452 |
instance.tokenBudget ??= parseInt(tokenBudget || '') || undefined;
|
| 453 |
|
|
|
|
|
|
|
|
|
|
| 454 |
if (instance.cacheTolerance) {
|
| 455 |
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
| 456 |
}
|
| 457 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
return instance;
|
| 459 |
}
|
| 460 |
|
|
@@ -468,7 +513,7 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 468 |
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
| 469 |
return false;
|
| 470 |
}
|
| 471 |
-
if (this.
|
| 472 |
return false;
|
| 473 |
}
|
| 474 |
|
|
|
|
| 1 |
+
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
|
| 2 |
import type { Request, Response } from 'express';
|
| 3 |
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
| 4 |
|
|
|
|
| 9 |
TEXT = 'text',
|
| 10 |
PAGESHOT = 'pageshot',
|
| 11 |
SCREENSHOT = 'screenshot',
|
| 12 |
+
VLM = 'vlm',
|
| 13 |
+
READER_LM = 'readerlm-v2',
|
| 14 |
}
|
| 15 |
|
| 16 |
export enum ENGINE_TYPE {
|
| 17 |
+
AUTO = 'auto',
|
| 18 |
BROWSER = 'browser',
|
| 19 |
DIRECT = 'direct',
|
| 20 |
VLM = 'vlm',
|
|
|
|
| 25 |
|
| 26 |
export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
|
| 27 |
const IMAGE_RETENTION_MODE_VALUES = new Set<string>(IMAGE_RETENTION_MODES);
|
| 28 |
+
export const BASE_URL_MODES = ['initial', 'eventual'] as const;
|
| 29 |
+
const BASE_URL_MODE_VALUES = new Set<string>(BASE_URL_MODES);
|
| 30 |
|
| 31 |
class Viewport extends AutoCastable {
|
| 32 |
@Prop({
|
|
|
|
| 198 |
in: 'header',
|
| 199 |
schema: { type: 'string' }
|
| 200 |
},
|
| 201 |
+
'X-Base': {
|
| 202 |
+
description: 'Select base modes of relative URLs.\n\nSupported: initial, eventual',
|
| 203 |
+
in: 'header',
|
| 204 |
+
schema: { type: 'string' }
|
| 205 |
+
},
|
| 206 |
}
|
| 207 |
}
|
| 208 |
}
|
|
|
|
| 215 |
@Prop()
|
| 216 |
html?: string;
|
| 217 |
|
| 218 |
+
@Prop({
|
| 219 |
+
type: BASE_URL_MODE_VALUES,
|
| 220 |
+
default: 'initial',
|
| 221 |
+
})
|
| 222 |
+
base?: typeof BASE_URL_MODES[number];
|
| 223 |
+
|
| 224 |
@Prop({
|
| 225 |
desc: 'Base64 encoded PDF.',
|
| 226 |
type: [File, String]
|
|
|
|
| 244 |
@Prop({
|
| 245 |
default: false,
|
| 246 |
})
|
| 247 |
+
withLinksSummary!: boolean | string;
|
| 248 |
|
| 249 |
@Prop({
|
| 250 |
default: false,
|
|
|
|
| 351 |
if (customMode !== undefined) {
|
| 352 |
instance.respondWith = customMode;
|
| 353 |
}
|
| 354 |
+
if (instance.respondWith) {
|
| 355 |
+
instance.respondWith = instance.respondWith.toLowerCase();
|
| 356 |
+
}
|
| 357 |
+
if (instance.respondWith?.includes('lm')) {
|
| 358 |
+
if (instance.respondWith.includes('content') || instance.respondWith.includes('markdown')) {
|
| 359 |
+
throw new ParamValidationError({
|
| 360 |
+
path: 'respondWith',
|
| 361 |
+
message: `LM formats conflicts with content/markdown.`,
|
| 362 |
+
});
|
| 363 |
+
}
|
| 364 |
+
}
|
| 365 |
|
| 366 |
const locale = ctx?.req.get('x-locale');
|
| 367 |
if (locale !== undefined) {
|
|
|
|
| 379 |
}
|
| 380 |
const withLinksSummary = ctx?.req.get('x-with-links-summary');
|
| 381 |
if (withLinksSummary !== undefined) {
|
| 382 |
+
if (withLinksSummary === 'all') {
|
| 383 |
+
instance.withLinksSummary = withLinksSummary;
|
| 384 |
+
} else {
|
| 385 |
+
instance.withLinksSummary = Boolean(withLinksSummary);
|
| 386 |
+
}
|
| 387 |
}
|
| 388 |
const withImagesSummary = ctx?.req.get('x-with-images-summary');
|
| 389 |
if (withImagesSummary !== undefined) {
|
|
|
|
| 434 |
if (engine) {
|
| 435 |
instance.engine = engine;
|
| 436 |
}
|
| 437 |
+
if (instance.engine) {
|
| 438 |
+
instance.engine = instance.engine.toLowerCase();
|
| 439 |
+
}
|
| 440 |
+
if (instance.engine === ENGINE_TYPE.VLM) {
|
| 441 |
+
instance.engine = ENGINE_TYPE.BROWSER;
|
| 442 |
+
instance.respondWith = CONTENT_FORMAT.VLM;
|
| 443 |
+
} else if (instance.engine === ENGINE_TYPE.READER_LM) {
|
| 444 |
+
instance.engine = undefined;
|
| 445 |
+
instance.respondWith = CONTENT_FORMAT.READER_LM;
|
| 446 |
}
|
| 447 |
|
| 448 |
const keepImgDataUrl = ctx?.req.get('x-keep-img-data-url');
|
|
|
|
| 489 |
const tokenBudget = ctx?.req.get('x-token-budget') || undefined;
|
| 490 |
instance.tokenBudget ??= parseInt(tokenBudget || '') || undefined;
|
| 491 |
|
| 492 |
+
const baseMode = ctx?.req.get('x-base') || undefined;
|
| 493 |
+
instance.base ??= baseMode as any;
|
| 494 |
+
|
| 495 |
if (instance.cacheTolerance) {
|
| 496 |
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
| 497 |
}
|
| 498 |
|
| 499 |
+
if (instance.noCache || !instance.isTypicalRequest()) {
|
| 500 |
+
instance.engine ??= ENGINE_TYPE.BROWSER + '?';
|
| 501 |
+
}
|
| 502 |
+
|
| 503 |
return instance;
|
| 504 |
}
|
| 505 |
|
|
|
|
| 513 |
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
| 514 |
return false;
|
| 515 |
}
|
| 516 |
+
if (this.respondWith.includes('lm')) {
|
| 517 |
return false;
|
| 518 |
}
|
| 519 |
|
backend/functions/src/services/curl.ts
CHANGED
|
@@ -26,7 +26,7 @@ export class CurlControl extends AsyncService {
|
|
| 26 |
this.emit('ready');
|
| 27 |
}
|
| 28 |
|
| 29 |
-
async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions) {
|
| 30 |
const result = await new Promise<{
|
| 31 |
statusCode: number,
|
| 32 |
data: string,
|
|
@@ -75,7 +75,7 @@ export class CurlControl extends AsyncService {
|
|
| 75 |
curl.perform();
|
| 76 |
});
|
| 77 |
|
| 78 |
-
if (result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
|
| 79 |
throw new AssertionFailureError(`Failed to directly access ${urlToCrawl}: HTTP ${result.statusCode}`);
|
| 80 |
}
|
| 81 |
|
|
|
|
| 26 |
this.emit('ready');
|
| 27 |
}
|
| 28 |
|
| 29 |
+
async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions, throwOnNon200 = false): Promise<PageSnapshot> {
|
| 30 |
const result = await new Promise<{
|
| 31 |
statusCode: number,
|
| 32 |
data: string,
|
|
|
|
| 75 |
curl.perform();
|
| 76 |
});
|
| 77 |
|
| 78 |
+
if (throwOnNon200 && result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
|
| 79 |
throw new AssertionFailureError(`Failed to directly access ${urlToCrawl}: HTTP ${result.statusCode}`);
|
| 80 |
}
|
| 81 |
|
backend/functions/src/services/jsdom.ts
CHANGED
|
@@ -6,6 +6,7 @@ import { Readability } from '@mozilla/readability';
|
|
| 6 |
import TurndownService from 'turndown';
|
| 7 |
import { Threaded } from '../shared/services/threaded';
|
| 8 |
import type { ExtraScrappingOptions } from '../cloud-functions/crawler';
|
|
|
|
| 9 |
|
| 10 |
const pLinkedom = import('linkedom');
|
| 11 |
|
|
@@ -184,26 +185,20 @@ export class JSDomControl extends AsyncService {
|
|
| 184 |
|
| 185 |
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
|
| 186 |
const links = Array.from(jsdom.window.document.querySelectorAll('a[href]'))
|
| 187 |
-
.map((x: any) => [x.
|
| 188 |
-
.map(([
|
| 189 |
-
if (!
|
| 190 |
return undefined;
|
| 191 |
}
|
| 192 |
try {
|
| 193 |
const parsed = new URL(href, snapshot.rebase || snapshot.href);
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
}
|
| 197 |
-
return [parsed.toString(), text] as const;
|
| 198 |
} catch (err) {
|
| 199 |
return undefined;
|
| 200 |
}
|
| 201 |
})
|
| 202 |
-
.filter(Boolean)
|
| 203 |
-
.reduce((acc, pair) => {
|
| 204 |
-
acc[pair![0]] = pair![1];
|
| 205 |
-
return acc;
|
| 206 |
-
}, {} as { [k: string]: string; });
|
| 207 |
|
| 208 |
extendedSnapshot.links = links;
|
| 209 |
|
|
@@ -237,6 +232,56 @@ export class JSDomControl extends AsyncService {
|
|
| 237 |
|
| 238 |
return extendedSnapshot;
|
| 239 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
snippetToElement(snippet?: string, url?: string) {
|
| 241 |
const parsed = this.linkedom.parseHTML(snippet || '<html><body></body></html>');
|
| 242 |
|
|
|
|
| 6 |
import TurndownService from 'turndown';
|
| 7 |
import { Threaded } from '../shared/services/threaded';
|
| 8 |
import type { ExtraScrappingOptions } from '../cloud-functions/crawler';
|
| 9 |
+
import { tailwindClasses } from '../utils/tailwind-classes';
|
| 10 |
|
| 11 |
const pLinkedom = import('linkedom');
|
| 12 |
|
|
|
|
| 185 |
|
| 186 |
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
|
| 187 |
const links = Array.from(jsdom.window.document.querySelectorAll('a[href]'))
|
| 188 |
+
.map((x: any) => [x.textContent.replace(/\s+/g, ' ').trim(), x.getAttribute('href'),])
|
| 189 |
+
.map(([text, href]) => {
|
| 190 |
+
if (!href) {
|
| 191 |
return undefined;
|
| 192 |
}
|
| 193 |
try {
|
| 194 |
const parsed = new URL(href, snapshot.rebase || snapshot.href);
|
| 195 |
+
|
| 196 |
+
return [text, parsed.toString()] as const;
|
|
|
|
|
|
|
| 197 |
} catch (err) {
|
| 198 |
return undefined;
|
| 199 |
}
|
| 200 |
})
|
| 201 |
+
.filter(Boolean) as [string, string][];
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
|
| 203 |
extendedSnapshot.links = links;
|
| 204 |
|
|
|
|
| 232 |
|
| 233 |
return extendedSnapshot;
|
| 234 |
}
|
| 235 |
+
|
| 236 |
+
cleanRedundantEmptyLines(text: string) {
|
| 237 |
+
const lines = text.split(/\r?\n/g);
|
| 238 |
+
const mappedFlag = lines.map((line) => Boolean(line.trim()));
|
| 239 |
+
|
| 240 |
+
return lines.filter((_line, i) => mappedFlag[i] || mappedFlag[i - 1]).join('\n');
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
@Threaded()
|
| 244 |
+
async cleanHTMLforLMs(sourceHTML: string, ...discardSelectors: string[]): Promise<string> {
|
| 245 |
+
const t0 = Date.now();
|
| 246 |
+
let jsdom = this.linkedom.parseHTML(sourceHTML);
|
| 247 |
+
if (!jsdom.window.document.documentElement) {
|
| 248 |
+
jsdom = this.linkedom.parseHTML(`<html><body>${sourceHTML}</body></html>`);
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
for (const rl of discardSelectors) {
|
| 252 |
+
jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
|
| 253 |
+
}
|
| 254 |
+
|
| 255 |
+
jsdom.window.document.querySelectorAll('img[src],img[data-src]').forEach((x) => {
|
| 256 |
+
const src = x.getAttribute('src') || x.getAttribute('data-src');
|
| 257 |
+
if (src?.startsWith('data:')) {
|
| 258 |
+
x.setAttribute('src', 'blob:opaque');
|
| 259 |
+
}
|
| 260 |
+
x.removeAttribute('data-src');
|
| 261 |
+
x.removeAttribute('srcset');
|
| 262 |
+
});
|
| 263 |
+
|
| 264 |
+
jsdom.window.document.querySelectorAll('[class]').forEach((x) => {
|
| 265 |
+
const classes = x.getAttribute('class')?.split(/\s+/g) || [];
|
| 266 |
+
const newClasses = classes.filter((c) => tailwindClasses.has(c));
|
| 267 |
+
x.setAttribute('class', newClasses.join(' '));
|
| 268 |
+
});
|
| 269 |
+
jsdom.window.document.querySelectorAll('[style]').forEach((x) => {
|
| 270 |
+
const style = x.getAttribute('style')?.toLocaleLowerCase() || '';
|
| 271 |
+
if (style.startsWith('display: none')) {
|
| 272 |
+
return;
|
| 273 |
+
}
|
| 274 |
+
x.removeAttribute('style');
|
| 275 |
+
});
|
| 276 |
+
|
| 277 |
+
const dt = Date.now() - t0;
|
| 278 |
+
if (dt > 1000) {
|
| 279 |
+
this.logger.warn(`Performance issue: Cleaning HTML for LMs took ${dt}ms`, { dt });
|
| 280 |
+
}
|
| 281 |
+
|
| 282 |
+
return this.cleanRedundantEmptyLines(jsdom.window.document.documentElement.outerHTML);
|
| 283 |
+
}
|
| 284 |
+
|
| 285 |
snippetToElement(snippet?: string, url?: string) {
|
| 286 |
const parsed = this.linkedom.parseHTML(snippet || '<html><body></body></html>');
|
| 287 |
|
backend/functions/src/services/lm.ts
CHANGED
|
@@ -6,6 +6,7 @@ import { Logger } from '../shared/services/logger';
|
|
| 6 |
import _ from 'lodash';
|
| 7 |
import { AssertionFailureError } from 'civkit';
|
| 8 |
import { LLMManager } from '../shared/services/common-llm';
|
|
|
|
| 9 |
|
| 10 |
const tripleBackTick = '```';
|
| 11 |
|
|
@@ -16,7 +17,8 @@ export class LmControl extends AsyncService {
|
|
| 16 |
|
| 17 |
constructor(
|
| 18 |
protected globalLogger: Logger,
|
| 19 |
-
protected commonLLM: LLMManager
|
|
|
|
| 20 |
) {
|
| 21 |
super(...arguments);
|
| 22 |
}
|
|
@@ -27,13 +29,6 @@ export class LmControl extends AsyncService {
|
|
| 27 |
this.emit('ready');
|
| 28 |
}
|
| 29 |
|
| 30 |
-
cleanRedundantEmptyLines(text: string) {
|
| 31 |
-
const lines = text.split(/\r?\n/g);
|
| 32 |
-
const mappedFlag = lines.map((line) => Boolean(line.trim()));
|
| 33 |
-
|
| 34 |
-
return lines.filter((_line, i) => mappedFlag[i] || mappedFlag[i - 1]).join('\n');
|
| 35 |
-
}
|
| 36 |
-
|
| 37 |
async* geminiFromBrowserSnapshot(snapshot?: PageSnapshot & {
|
| 38 |
pageshotUrl?: string,
|
| 39 |
}) {
|
|
@@ -43,9 +38,11 @@ export class LmControl extends AsyncService {
|
|
| 43 |
throw new AssertionFailureError('Screenshot of the page is not available');
|
| 44 |
}
|
| 45 |
|
|
|
|
|
|
|
| 46 |
const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
|
| 47 |
prompt: [
|
| 48 |
-
`HTML: \n${
|
| 49 |
typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
|
| 50 |
`Convert this webpage into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
|
| 51 |
],
|
|
@@ -76,8 +73,11 @@ export class LmControl extends AsyncService {
|
|
| 76 |
if (!snapshot) {
|
| 77 |
throw new AssertionFailureError('Snapshot of the page is not available');
|
| 78 |
}
|
|
|
|
|
|
|
|
|
|
| 79 |
const it = this.commonLLM.iterRun('readerlm-v2', {
|
| 80 |
-
prompt: `Extract the main content from the given HTML and convert it to Markdown format.\n\n${tripleBackTick}html\n${
|
| 81 |
|
| 82 |
options: {
|
| 83 |
// system: 'You are an AI assistant developed by Jina AI',
|
|
@@ -105,8 +105,11 @@ export class LmControl extends AsyncService {
|
|
| 105 |
if (!snapshot) {
|
| 106 |
throw new AssertionFailureError('Snapshot of the page is not available');
|
| 107 |
}
|
|
|
|
|
|
|
|
|
|
| 108 |
const it = this.commonLLM.iterRun('readerlm-v2', {
|
| 109 |
-
prompt: `${instruction}\n\n${tripleBackTick}html\n${
|
| 110 |
options: {
|
| 111 |
// system: 'You are an AI assistant developed by Jina AI',
|
| 112 |
stream: true
|
|
|
|
| 6 |
import _ from 'lodash';
|
| 7 |
import { AssertionFailureError } from 'civkit';
|
| 8 |
import { LLMManager } from '../shared/services/common-llm';
|
| 9 |
+
import { JSDomControl } from './jsdom';
|
| 10 |
|
| 11 |
const tripleBackTick = '```';
|
| 12 |
|
|
|
|
| 17 |
|
| 18 |
constructor(
|
| 19 |
protected globalLogger: Logger,
|
| 20 |
+
protected commonLLM: LLMManager,
|
| 21 |
+
protected jsdomControl: JSDomControl,
|
| 22 |
) {
|
| 23 |
super(...arguments);
|
| 24 |
}
|
|
|
|
| 29 |
this.emit('ready');
|
| 30 |
}
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
async* geminiFromBrowserSnapshot(snapshot?: PageSnapshot & {
|
| 33 |
pageshotUrl?: string,
|
| 34 |
}) {
|
|
|
|
| 38 |
throw new AssertionFailureError('Screenshot of the page is not available');
|
| 39 |
}
|
| 40 |
|
| 41 |
+
const html = await this.jsdomControl.cleanHTMLforLMs(snapshot.html, 'script,link,style,textarea,select>option,svg')
|
| 42 |
+
|
| 43 |
const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
|
| 44 |
prompt: [
|
| 45 |
+
`HTML: \n${html}\n\nSCREENSHOT: \n`,
|
| 46 |
typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
|
| 47 |
`Convert this webpage into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
|
| 48 |
],
|
|
|
|
| 73 |
if (!snapshot) {
|
| 74 |
throw new AssertionFailureError('Snapshot of the page is not available');
|
| 75 |
}
|
| 76 |
+
|
| 77 |
+
const html = await this.jsdomControl.cleanHTMLforLMs(snapshot.html, 'script,link,style,textarea,select>option,svg');
|
| 78 |
+
|
| 79 |
const it = this.commonLLM.iterRun('readerlm-v2', {
|
| 80 |
+
prompt: `Extract the main content from the given HTML and convert it to Markdown format.\n\n${tripleBackTick}html\n${html}\n${tripleBackTick}\n`,
|
| 81 |
|
| 82 |
options: {
|
| 83 |
// system: 'You are an AI assistant developed by Jina AI',
|
|
|
|
| 105 |
if (!snapshot) {
|
| 106 |
throw new AssertionFailureError('Snapshot of the page is not available');
|
| 107 |
}
|
| 108 |
+
|
| 109 |
+
const html = await this.jsdomControl.cleanHTMLforLMs(snapshot.html, 'script,link,style,textarea,select>option,svg');
|
| 110 |
+
|
| 111 |
const it = this.commonLLM.iterRun('readerlm-v2', {
|
| 112 |
+
prompt: `${instruction}\n\n${tripleBackTick}html\n${html}\n${tripleBackTick}\n${schema ? `The JSON schema:\n${tripleBackTick}json\n${schema}\n${tripleBackTick}\n` : ''}`,
|
| 113 |
options: {
|
| 114 |
// system: 'You are an AI assistant developed by Jina AI',
|
| 115 |
stream: true
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -63,7 +63,7 @@ export interface PageSnapshot {
|
|
| 63 |
}
|
| 64 |
|
| 65 |
export interface ExtendedSnapshot extends PageSnapshot {
|
| 66 |
-
links:
|
| 67 |
imgs: ImgBrief[];
|
| 68 |
}
|
| 69 |
|
|
|
|
| 63 |
}
|
| 64 |
|
| 65 |
export interface ExtendedSnapshot extends PageSnapshot {
|
| 66 |
+
links: [string, string][];
|
| 67 |
imgs: ImgBrief[];
|
| 68 |
}
|
| 69 |
|
backend/functions/src/services/snapshot-formatter.ts
CHANGED
|
@@ -28,8 +28,8 @@ export interface FormattedPage {
|
|
| 28 |
screenshot?: Buffer;
|
| 29 |
pageshotUrl?: string;
|
| 30 |
pageshot?: Buffer;
|
| 31 |
-
links?: { [k: string]: string; };
|
| 32 |
-
images?: { [k: string]: string; };
|
| 33 |
warning?: string;
|
| 34 |
usage?: {
|
| 35 |
total_tokens?: number;
|
|
@@ -56,7 +56,7 @@ export function highlightedCodeBlock(turndownService: TurndownService) {
|
|
| 56 |
highlightRegExp.test(node.className)
|
| 57 |
);
|
| 58 |
},
|
| 59 |
-
replacement: (_content, node, options)=> {
|
| 60 |
const className = (node as any).className || '';
|
| 61 |
const language = (className.match(highlightRegExp) || [null, ''])[1];
|
| 62 |
|
|
@@ -178,7 +178,14 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 178 |
Object.defineProperty(f, 'textRepresentation', { value: snapshot.text, enumerable: false, configurable: true });
|
| 179 |
}
|
| 180 |
|
| 181 |
-
if (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
const dt = Date.now() - t0;
|
| 183 |
this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
|
| 184 |
|
|
@@ -391,7 +398,13 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 391 |
.value();
|
| 392 |
}
|
| 393 |
if (this.threadLocal.get('withLinksSummary')) {
|
| 394 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 395 |
}
|
| 396 |
|
| 397 |
Object.assign(f, formatted);
|
|
@@ -418,8 +431,14 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 418 |
}
|
| 419 |
if (this.links) {
|
| 420 |
const linkSummaryChunks = ['Links/Buttons:'];
|
| 421 |
-
|
| 422 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 423 |
}
|
| 424 |
if (linkSummaryChunks.length === 1) {
|
| 425 |
linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
|
|
@@ -478,7 +497,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 478 |
}
|
| 479 |
if (this.threadLocal.get('withLinksSummary')) {
|
| 480 |
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
| 481 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
}
|
| 483 |
if (snapshot.status) {
|
| 484 |
const code = snapshot.status;
|
|
|
|
| 28 |
screenshot?: Buffer;
|
| 29 |
pageshotUrl?: string;
|
| 30 |
pageshot?: Buffer;
|
| 31 |
+
links?: { [k: string]: string; } | [string, string][];
|
| 32 |
+
images?: { [k: string]: string; } | [string, string][];
|
| 33 |
warning?: string;
|
| 34 |
usage?: {
|
| 35 |
total_tokens?: number;
|
|
|
|
| 56 |
highlightRegExp.test(node.className)
|
| 57 |
);
|
| 58 |
},
|
| 59 |
+
replacement: (_content, node, options) => {
|
| 60 |
const className = (node as any).className || '';
|
| 61 |
const language = (className.match(highlightRegExp) || [null, ''])[1];
|
| 62 |
|
|
|
|
| 178 |
Object.defineProperty(f, 'textRepresentation', { value: snapshot.text, enumerable: false, configurable: true });
|
| 179 |
}
|
| 180 |
|
| 181 |
+
if (mode.includes('lm')) {
|
| 182 |
+
modeOK = true;
|
| 183 |
+
f.content = snapshot.parsed?.textContent;
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
if (modeOK && (mode.includes('lm') ||
|
| 187 |
+
(!mode.includes('markdown') && !mode.includes('content')))
|
| 188 |
+
) {
|
| 189 |
const dt = Date.now() - t0;
|
| 190 |
this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
|
| 191 |
|
|
|
|
| 398 |
.value();
|
| 399 |
}
|
| 400 |
if (this.threadLocal.get('withLinksSummary')) {
|
| 401 |
+
const links = this.jsdomControl.inferSnapshot(snapshot).links;
|
| 402 |
+
|
| 403 |
+
if (this.threadLocal.get('withLinksSummary') === 'all') {
|
| 404 |
+
formatted.links = links;
|
| 405 |
+
} else {
|
| 406 |
+
formatted.links = _.fromPairs(links.filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:')));
|
| 407 |
+
}
|
| 408 |
}
|
| 409 |
|
| 410 |
Object.assign(f, formatted);
|
|
|
|
| 431 |
}
|
| 432 |
if (this.links) {
|
| 433 |
const linkSummaryChunks = ['Links/Buttons:'];
|
| 434 |
+
if (Array.isArray(this.links)) {
|
| 435 |
+
for (const [k, v] of this.links) {
|
| 436 |
+
linkSummaryChunks.push(`- [${k}](${v})`);
|
| 437 |
+
}
|
| 438 |
+
} else {
|
| 439 |
+
for (const [k, v] of Object.entries(this.links)) {
|
| 440 |
+
linkSummaryChunks.push(`- [${k}](${v})`);
|
| 441 |
+
}
|
| 442 |
}
|
| 443 |
if (linkSummaryChunks.length === 1) {
|
| 444 |
linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
|
|
|
|
| 497 |
}
|
| 498 |
if (this.threadLocal.get('withLinksSummary')) {
|
| 499 |
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
| 500 |
+
if (this.threadLocal.get('withLinksSummary') === 'all') {
|
| 501 |
+
mixin.links = inferred.links;
|
| 502 |
+
} else {
|
| 503 |
+
mixin.links = _.fromPairs(inferred.links.filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:')));
|
| 504 |
+
}
|
| 505 |
}
|
| 506 |
if (snapshot.status) {
|
| 507 |
const code = snapshot.status;
|
backend/functions/src/utils/tailwind-classes.ts
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|