Spaces:

bonesmasher
/

web_reader

Build error

App Files Files Community

nomagick committed on Jan 16, 2025

Commit

53821d0

unverified ·

1 Parent(s): 80b9a6a

fix: lm and related options

Browse files

Files changed (10) hide show

backend/functions/package-lock.json +5 -4
backend/functions/package.json +1 -1
backend/functions/src/cloud-functions/crawler.ts +60 -83
backend/functions/src/dto/scrapping-options.ts +51 -6
backend/functions/src/services/curl.ts +2 -2
backend/functions/src/services/jsdom.ts +57 -12
backend/functions/src/services/lm.ts +14 -11
backend/functions/src/services/puppeteer.ts +1 -1
backend/functions/src/services/snapshot-formatter.ts +31 -8
backend/functions/src/utils/tailwind-classes.ts +0 -0

backend/functions/package-lock.json CHANGED Viewed

@@ -16,7 +16,7 @@
         "axios": "^1.3.3",
         "bcrypt": "^5.1.0",
         "busboy": "^1.6.0",
-        "civkit": "^0.8.2-2eddf1b",
         "core-js": "^3.37.1",
         "cors": "^2.8.5",
         "dayjs": "^1.11.9",
@@ -3979,9 +3979,10 @@
       }
     },
     "node_modules/civkit": {
-      "version": "0.8.2-2eddf1b",
-      "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-2eddf1b.tgz",
-      "integrity": "sha512-iRYQKasePTQYIajPZpTh+uQn09XF7e6+tBaFwxs7mlUIHoU8ci8CT307ITYMnppDLzCh7BRpSgt53mz4Jwg78w==",
       "dependencies": {
         "lodash": "^4.17.21",
         "tslib": "^2.5.0"

         "axios": "^1.3.3",
         "bcrypt": "^5.1.0",
         "busboy": "^1.6.0",
+        "civkit": "^0.8.2-4c0357a",
         "core-js": "^3.37.1",
         "cors": "^2.8.5",
         "dayjs": "^1.11.9",
       }
     },
     "node_modules/civkit": {
+      "version": "0.8.2-4c0357a",
+      "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-4c0357a.tgz",
+      "integrity": "sha512-8/RcapAm8YYImf+YVBRhybEFuSuV5Pg1p/s6Niql3VAY2cV1/OC1fTCDZY689yeq8zFcwxwBvaqyIEGo69F+IA==",
+      "license": "AGPL",
       "dependencies": {
         "lodash": "^4.17.21",
         "tslib": "^2.5.0"

backend/functions/package.json CHANGED Viewed

@@ -36,7 +36,7 @@
     "axios": "^1.3.3",
     "bcrypt": "^5.1.0",
     "busboy": "^1.6.0",
-    "civkit": "^0.8.2-2eddf1b",
     "core-js": "^3.37.1",
     "cors": "^2.8.5",
     "dayjs": "^1.11.9",

     "axios": "^1.3.3",
     "bcrypt": "^5.1.0",
     "busboy": "^1.6.0",
+    "civkit": "^0.8.2-4c0357a",
     "core-js": "^3.37.1",
     "cors": "^2.8.5",
     "dayjs": "^1.11.9",

backend/functions/src/cloud-functions/crawler.ts CHANGED Viewed

@@ -15,7 +15,7 @@ import { randomUUID } from 'crypto';
 import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
 import { countGPTToken as estimateToken } from '../shared/utils/openai';
-import { CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/scrapping-options';
 import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
 import { DomainBlockade } from '../db/domain-blockade';
 import { DomainProfile } from '../db/domain-profile';
@@ -84,14 +84,6 @@ export class CrawlerHost extends RPCHost {
                 Reflect.set(snapshot, 'locale', options.locale);
             }
             await this.setToCache(options.url, snapshot);
-            if (!options.engine) {
-                try {
-                    await this.exploreDirectEngine(options.url, options, snapshot);
-                } catch (err) {
-                    this.logger.warn(`Failed to explore direct engine option for ${options.url.href}`, { err });
-                }
-            }
         });
         puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
@@ -152,8 +144,8 @@ export class CrawlerHost extends RPCHost {
             memory: '4GiB',
             cpu: 2,
             timeoutSeconds: 300,
-            concurrency: 8,
-            maxInstances: 1250,
             minInstances: 1,
         },
         tags: ['Crawler'],
@@ -260,25 +252,12 @@ export class CrawlerHost extends RPCHost {
         const crawlOpts = await this.configure(crawlerOptions);
-        if (!crawlOpts.engine) {
-            const domainProfile = (await DomainProfile.fromFirestoreQuery(
-                DomainProfile.COLLECTION
-                    .where('origin', '==', targetUrl.origin.toLowerCase())
-                    .limit(1)
-            ))[0];
-            if (domainProfile?.engine) {
-                crawlOpts.engine = domainProfile.engine;
-            }
-        }
         if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
             const sseStream = new OutputServerEventStream();
             rpcReflect.return(sseStream);
             try {
-                for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
                     if (!scrapped) {
                         continue;
                     }
@@ -311,7 +290,7 @@ export class CrawlerHost extends RPCHost {
         let lastScrapped;
         if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
-            for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
                 lastScrapped = scrapped;
                 if (!crawlerOptions.isEarlyReturnApplicable()) {
                     continue;
@@ -357,7 +336,7 @@ export class CrawlerHost extends RPCHost {
             });
         }
-        for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
             lastScrapped = scrapped;
             if (!crawlerOptions.isEarlyReturnApplicable()) {
@@ -589,82 +568,78 @@ export class CrawlerHost extends RPCHost {
         return r;
     }
     async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
-        let overrideFinalSnapshot;
         if (crawlerOpts?.html) {
-            overrideFinalSnapshot = {
                 href: urlToCrawl.toString(),
                 html: crawlerOpts.html,
                 title: '',
                 text: '',
             } as PageSnapshot;
         }
         if (crawlerOpts?.pdf) {
             const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
             const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
-            overrideFinalSnapshot = {
                 href: urlToCrawl.toString(),
                 html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
                 title: '',
                 text: '',
                 pdfs: [pdfDataUrl],
             } as PageSnapshot;
-        }
-        if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
-            yield this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
             return;
         }
-        // if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
-        //     const rmSelectorEquivalent = [];
-        //     if (typeof crawlOpts.removeSelector === 'string') {
-        //         rmSelectorEquivalent.push(crawlOpts.removeSelector);
-        //     } else if (Array.isArray(crawlOpts.removeSelector)) {
-        //         rmSelectorEquivalent.push(...crawlOpts.removeSelector);
-        //     }
-        //     rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option,header,footer,nav');
-        //     const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, {
-        //         ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: ENGINE_TYPE.BROWSER
-        //     }, crawlerOpts);
-        //     yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot);
-        //     return;
-        // }
-        if (crawlOpts?.engine === ENGINE_TYPE.READER_LM) {
-            const rmSelectorEquivalent = [];
-            if (typeof crawlOpts.removeSelector === 'string') {
-                rmSelectorEquivalent.push(crawlOpts.removeSelector);
-            } else if (Array.isArray(crawlOpts.removeSelector)) {
-                rmSelectorEquivalent.push(...crawlOpts.removeSelector);
-            }
-            rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option');
-            const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
-                ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: undefined
-            }, crawlerOpts);
-            if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
-                const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
-                yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot);
                 return;
             }
-            yield* this.lmControl.readerLMMarkdownFromSnapshot(finalAutoSnapshot);
-            return;
-        }
-        if (overrideFinalSnapshot) {
-            yield this.jsdomControl.narrowSnapshot(overrideFinalSnapshot, crawlOpts);
-            return;
         }
         let cache;
@@ -857,12 +832,14 @@ export class CrawlerHost extends RPCHost {
         nominalUrl?: URL,
         urlValidMs?: number
     ) {
-        const engine = crawlerOptions.engine?.toLowerCase() || '';
-        if (engine.includes('lm')) {
             const output: FormattedPage = {
                 title: snapshot.title,
                 content: snapshot.parsed?.textContent,
-                url: snapshot.href,
                 [Symbol.dispose]: () => undefined,
             };
@@ -874,7 +851,7 @@ export class CrawlerHost extends RPCHost {
             return output;
         }
-        return this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, snapshot, nominalUrl, urlValidMs);
     }
     async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
@@ -902,7 +879,7 @@ export class CrawlerHost extends RPCHost {
     }
     async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
-        const it = this.cachedScrap(url, { ...opts, minIntervalMs: 500 });
         let lastSnapshot;
         let goodEnough = false;
@@ -936,7 +913,7 @@ export class CrawlerHost extends RPCHost {
     }
     async exploreDirectEngine(targetUrl: URL, crawlerOptions: ScrappingOptions, knownSnapshot: PageSnapshot) {
-        const snapshot = await this.curlControl.urlToSnapshot(targetUrl, crawlerOptions);
         const thisFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', snapshot);
         const knownFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', knownSnapshot);

 import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
 import { countGPTToken as estimateToken } from '../shared/utils/openai';
+import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/scrapping-options';
 import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
 import { DomainBlockade } from '../db/domain-blockade';
 import { DomainProfile } from '../db/domain-profile';
                 Reflect.set(snapshot, 'locale', options.locale);
             }
             await this.setToCache(options.url, snapshot);
         });
         puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
             memory: '4GiB',
             cpu: 2,
             timeoutSeconds: 300,
+            concurrency: 10,
+            maxInstances: 1000,
             minInstances: 1,
         },
         tags: ['Crawler'],
         const crawlOpts = await this.configure(crawlerOptions);
         if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
             const sseStream = new OutputServerEventStream();
             rpcReflect.return(sseStream);
             try {
+                for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
                     if (!scrapped) {
                         continue;
                     }
         let lastScrapped;
         if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
+            for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
                 lastScrapped = scrapped;
                 if (!crawlerOptions.isEarlyReturnApplicable()) {
                     continue;
             });
         }
+        for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
             lastScrapped = scrapped;
             if (!crawlerOptions.isEarlyReturnApplicable()) {
         return r;
     }
+    async *iterSnapshots(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) { // Async generator entry point: routes to the reader-LM path when requested, otherwise delegates to cachedScrap.
+        // if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.VLM)) {
+        //     const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, {
+        //         ...crawlOpts, engine: ENGINE_TYPE.BROWSER
+        //     }, crawlerOpts);
+        //     yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot);
+        //     return;
+        // }
+        if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.READER_LM)) { // substring match on respondWith; skipped entirely when crawlerOpts is undefined
+            const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, { // resolve one final snapshot first; the LM calls below consume it
+                ...crawlOpts, engine: ENGINE_TYPE.AUTO // caller-supplied engine is overridden with AUTO for this fetch
+            }, crawlerOpts);
+            if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) { // instruction/schema present: use the instructed LM call instead of plain markdown conversion
+                const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined; // schema is passed on as 2-space-indented JSON text
+                yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot);
+                return;
+            }
+            yield* this.lmControl.readerLMMarkdownFromSnapshot(finalAutoSnapshot); // default LM path: no instruction and no schema
+            return;
+        }
+        yield* this.cachedScrap(urlToCrawl, crawlOpts, crawlerOpts); // non-LM formats: forward everything cachedScrap yields, arguments unchanged
+    }
     async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
         if (crawlerOpts?.html) {
+            const snapshot = {
                 href: urlToCrawl.toString(),
                 html: crawlerOpts.html,
                 title: '',
                 text: '',
             } as PageSnapshot;
+            yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
+            return;
         }
         if (crawlerOpts?.pdf) {
             const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
             const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
+            const snapshot = {
                 href: urlToCrawl.toString(),
                 html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
                 title: '',
                 text: '',
                 pdfs: [pdfDataUrl],
             } as PageSnapshot;
+            yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
             return;
         }
+        if (crawlOpts?.engine?.startsWith(ENGINE_TYPE.DIRECT)) {
+            const engine = crawlOpts?.engine;
+            try {
+                const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
+                yield snapshot;
                 return;
+            } catch (err) {
+                if (!engine.endsWith('?')) {
+                    throw err;
+                }
             }
         }
         let cache;
         nominalUrl?: URL,
         urlValidMs?: number
     ) {
+        const presumedURL = crawlerOptions.base === 'eventual' ? new URL(snapshot.href) : nominalUrl;
+        const respondWith = crawlerOptions.respondWith;
+        if (respondWith === CONTENT_FORMAT.READER_LM || respondWith === CONTENT_FORMAT.VLM) {
             const output: FormattedPage = {
                 title: snapshot.title,
                 content: snapshot.parsed?.textContent,
+                url: presumedURL?.href || snapshot.href,
                 [Symbol.dispose]: () => undefined,
             };
             return output;
         }
+        return this.snapshotFormatter.formatSnapshot(respondWith, snapshot, presumedURL, urlValidMs);
     }
     async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
     }
     async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
+        const it = this.iterSnapshots(url, { ...opts, minIntervalMs: 500 });
         let lastSnapshot;
         let goodEnough = false;
     }
     async exploreDirectEngine(targetUrl: URL, crawlerOptions: ScrappingOptions, knownSnapshot: PageSnapshot) {
+        const snapshot = await this.curlControl.urlToSnapshot(targetUrl, crawlerOptions, true);
         const thisFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', snapshot);
         const knownFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', knownSnapshot);

backend/functions/src/dto/scrapping-options.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { Also, AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
 import type { Request, Response } from 'express';
 import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
@@ -9,9 +9,12 @@ export enum CONTENT_FORMAT {
     TEXT = 'text',
     PAGESHOT = 'pageshot',
     SCREENSHOT = 'screenshot',
 }
 export enum ENGINE_TYPE {
     BROWSER = 'browser',
     DIRECT = 'direct',
     VLM = 'vlm',
@@ -22,6 +25,8 @@ const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
 export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
 const IMAGE_RETENTION_MODE_VALUES = new Set<string>(IMAGE_RETENTION_MODES);
 class Viewport extends AutoCastable {
     @Prop({
@@ -193,6 +198,11 @@ class Viewport extends AutoCastable {
                     in: 'header',
                     schema: { type: 'string' }
                 },
             }
         }
     }
@@ -205,6 +215,12 @@ export class CrawlerOptions extends AutoCastable {
     @Prop()
     html?: string;
     @Prop({
         desc: 'Base64 encoded PDF.',
         type: [File, String]
@@ -228,7 +244,7 @@ export class CrawlerOptions extends AutoCastable {
     @Prop({
         default: false,
     })
-    withLinksSummary!: boolean;
     @Prop({
         default: false,
@@ -335,6 +351,17 @@ export class CrawlerOptions extends AutoCastable {
         if (customMode !== undefined) {
             instance.respondWith = customMode;
         }
         const locale = ctx?.req.get('x-locale');
         if (locale !== undefined) {
@@ -352,7 +379,11 @@ export class CrawlerOptions extends AutoCastable {
         }
         const withLinksSummary = ctx?.req.get('x-with-links-summary');
         if (withLinksSummary !== undefined) {
-            instance.withLinksSummary = Boolean(withLinksSummary);
         }
         const withImagesSummary = ctx?.req.get('x-with-images-summary');
         if (withImagesSummary !== undefined) {
@@ -403,8 +434,15 @@ export class CrawlerOptions extends AutoCastable {
         if (engine) {
             instance.engine = engine;
         }
-        if (instance.noCache || !instance.isTypicalRequest()) {
-            instance.engine ??= ENGINE_TYPE.BROWSER;
         }
         const keepImgDataUrl = ctx?.req.get('x-keep-img-data-url');
@@ -451,10 +489,17 @@ export class CrawlerOptions extends AutoCastable {
         const tokenBudget = ctx?.req.get('x-token-budget') || undefined;
         instance.tokenBudget ??= parseInt(tokenBudget || '') || undefined;
         if (instance.cacheTolerance) {
             instance.cacheTolerance = instance.cacheTolerance * 1000;
         }
         return instance;
     }
@@ -468,7 +513,7 @@ export class CrawlerOptions extends AutoCastable {
         if (this.injectFrameScript?.length || this.injectPageScript?.length) {
             return false;
         }
-        if (this.engine?.toLowerCase().includes('lm')) {
             return false;
         }

+import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
 import type { Request, Response } from 'express';
 import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
     TEXT = 'text',
     PAGESHOT = 'pageshot',
     SCREENSHOT = 'screenshot',
+    VLM = 'vlm',
+    READER_LM = 'readerlm-v2',
 }
 export enum ENGINE_TYPE {
+    AUTO = 'auto',
     BROWSER = 'browser',
     DIRECT = 'direct',
     VLM = 'vlm',
 export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
 const IMAGE_RETENTION_MODE_VALUES = new Set<string>(IMAGE_RETENTION_MODES);
+export const BASE_URL_MODES = ['initial', 'eventual'] as const;
+const BASE_URL_MODE_VALUES = new Set<string>(BASE_URL_MODES);
 class Viewport extends AutoCastable {
     @Prop({
                     in: 'header',
                     schema: { type: 'string' }
                 },
+                'X-Base': {
+                    description: 'Select base modes of relative URLs.\n\nSupported: initial, eventual',
+                    in: 'header',
+                    schema: { type: 'string' }
+                },
             }
         }
     }
     @Prop()
     html?: string;
+    @Prop({
+        type: BASE_URL_MODE_VALUES,
+        default: 'initial',
+    })
+    base?: typeof BASE_URL_MODES[number];
     @Prop({
         desc: 'Base64 encoded PDF.',
         type: [File, String]
     @Prop({
         default: false,
     })
+    withLinksSummary!: boolean | string;
     @Prop({
         default: false,
         if (customMode !== undefined) {
             instance.respondWith = customMode;
         }
+        if (instance.respondWith) {
+            instance.respondWith = instance.respondWith.toLowerCase();
+        }
+        if (instance.respondWith?.includes('lm')) {
+            if (instance.respondWith.includes('content') || instance.respondWith.includes('markdown')) {
+                throw new ParamValidationError({
+                    path: 'respondWith',
+                    message: `LM formats conflicts with content/markdown.`,
+                });
+            }
+        }
         const locale = ctx?.req.get('x-locale');
         if (locale !== undefined) {
         }
         const withLinksSummary = ctx?.req.get('x-with-links-summary');
         if (withLinksSummary !== undefined) {
+            if (withLinksSummary === 'all') {
+                instance.withLinksSummary = withLinksSummary;
+            } else {
+                instance.withLinksSummary = Boolean(withLinksSummary);
+            }
         }
         const withImagesSummary = ctx?.req.get('x-with-images-summary');
         if (withImagesSummary !== undefined) {
         if (engine) {
             instance.engine = engine;
         }
+        if (instance.engine) {
+            instance.engine = instance.engine.toLowerCase();
+        }
+        if (instance.engine === ENGINE_TYPE.VLM) {
+            instance.engine = ENGINE_TYPE.BROWSER;
+            instance.respondWith = CONTENT_FORMAT.VLM;
+        } else if (instance.engine === ENGINE_TYPE.READER_LM) {
+            instance.engine = undefined;
+            instance.respondWith = CONTENT_FORMAT.READER_LM;
         }
         const keepImgDataUrl = ctx?.req.get('x-keep-img-data-url');
         const tokenBudget = ctx?.req.get('x-token-budget') || undefined;
         instance.tokenBudget ??= parseInt(tokenBudget || '') || undefined;
+        const baseMode = ctx?.req.get('x-base') || undefined;
+        instance.base ??= baseMode as any;
         if (instance.cacheTolerance) {
             instance.cacheTolerance = instance.cacheTolerance * 1000;
         }
+        if (instance.noCache || !instance.isTypicalRequest()) {
+            instance.engine ??= ENGINE_TYPE.BROWSER + '?';
+        }
         return instance;
     }
         if (this.injectFrameScript?.length || this.injectPageScript?.length) {
             return false;
         }
+        if (this.respondWith.includes('lm')) {
             return false;
         }

backend/functions/src/services/curl.ts CHANGED Viewed

@@ -26,7 +26,7 @@ export class CurlControl extends AsyncService {
         this.emit('ready');
     }
-    async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions) {
         const result = await new Promise<{
             statusCode: number,
             data: string,
@@ -75,7 +75,7 @@ export class CurlControl extends AsyncService {
             curl.perform();
         });
-        if (result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
             throw new AssertionFailureError(`Failed to directly access ${urlToCrawl}: HTTP ${result.statusCode}`);
         }

         this.emit('ready');
     }
+    async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions, throwOnNon200 = false): Promise<PageSnapshot> {
         const result = await new Promise<{
             statusCode: number,
             data: string,
             curl.perform();
         });
+        if (throwOnNon200 && result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
             throw new AssertionFailureError(`Failed to directly access ${urlToCrawl}: HTTP ${result.statusCode}`);
         }

backend/functions/src/services/jsdom.ts CHANGED Viewed

@@ -6,6 +6,7 @@ import { Readability } from '@mozilla/readability';
 import TurndownService from 'turndown';
 import { Threaded } from '../shared/services/threaded';
 import type { ExtraScrappingOptions } from '../cloud-functions/crawler';
 const pLinkedom = import('linkedom');
@@ -184,26 +185,20 @@ export class JSDomControl extends AsyncService {
             jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
             const links = Array.from(jsdom.window.document.querySelectorAll('a[href]'))
-                .map((x: any) => [x.getAttribute('href'), x.textContent.replace(/\s+/g, ' ').trim()])
-                .map(([href, text]) => {
-                    if (!text) {
                         return undefined;
                     }
                     try {
                         const parsed = new URL(href, snapshot.rebase || snapshot.href);
-                        if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') {
-                            return undefined;
-                        }
-                        return [parsed.toString(), text] as const;
                     } catch (err) {
                         return undefined;
                     }
                 })
-                .filter(Boolean)
-                .reduce((acc, pair) => {
-                    acc[pair![0]] = pair![1];
-                    return acc;
-                }, {} as { [k: string]: string; });
             extendedSnapshot.links = links;
@@ -237,6 +232,56 @@ export class JSDomControl extends AsyncService {
         return extendedSnapshot;
     }
     snippetToElement(snippet?: string, url?: string) {
         const parsed = this.linkedom.parseHTML(snippet || '<html><body></body></html>');

 import TurndownService from 'turndown';
 import { Threaded } from '../shared/services/threaded';
 import type { ExtraScrappingOptions } from '../cloud-functions/crawler';
+import { tailwindClasses } from '../utils/tailwind-classes';
 const pLinkedom = import('linkedom');
             jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
             const links = Array.from(jsdom.window.document.querySelectorAll('a[href]'))
+                .map((x: any) => [x.textContent.replace(/\s+/g, ' ').trim(), x.getAttribute('href'),])
+                .map(([text, href]) => {
+                    if (!href) {
                         return undefined;
                     }
                     try {
                         const parsed = new URL(href, snapshot.rebase || snapshot.href);
+                        return [text, parsed.toString()] as const;
                     } catch (err) {
                         return undefined;
                     }
                 })
+                .filter(Boolean) as [string, string][];
             extendedSnapshot.links = links;
         return extendedSnapshot;
     }
+    cleanRedundantEmptyLines(text: string) { // Collapses each run of blank lines to a single blank line and drops blank lines at the very start.
+        const lines = text.split(/\r?\n/g); // accepts both LF and CRLF input
+        const mappedFlag = lines.map((line) => Boolean(line.trim())); // true where the line has non-whitespace content
+        return lines.filter((_line, i) => mappedFlag[i] || mappedFlag[i - 1]).join('\n'); // keep a line if it or its predecessor is non-blank (index -1 reads as undefined); result is LF-joined
+    }
+    @Threaded() // NOTE(review): presumably offloads this method to a worker thread — confirm against the Threaded decorator
+    async cleanHTMLforLMs(sourceHTML: string, ...discardSelectors: string[]): Promise<string> { // Strips the given HTML down for LM input: removes selected elements, data-URL image payloads, non-allow-listed classes and most inline styles; returns the serialized document.
+        const t0 = Date.now(); // start time for the slow-run warning below
+        let jsdom = this.linkedom.parseHTML(sourceHTML);
+        if (!jsdom.window.document.documentElement) { // no root element after parsing: retry with the input wrapped in <html><body>
+            jsdom = this.linkedom.parseHTML(`<html><body>${sourceHTML}</body></html>`);
+        }
+        for (const rl of discardSelectors) { // each rest argument is used as a CSS selector; every match is removed from the document
+            jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
+        }
+        jsdom.window.document.querySelectorAll('img[src],img[data-src]').forEach((x) => {
+            const src = x.getAttribute('src') || x.getAttribute('data-src'); // data-src is consulted only when src is missing or empty
+            if (src?.startsWith('data:')) { // inline data URL: replace with a short placeholder
+                x.setAttribute('src', 'blob:opaque');
+            }
+            x.removeAttribute('data-src'); // removed unconditionally, so a non-data URL held only in data-src is not carried over to src
+            x.removeAttribute('srcset');
+        });
+        jsdom.window.document.querySelectorAll('[class]').forEach((x) => {
+            const classes = x.getAttribute('class')?.split(/\s+/g) || [];
+            const newClasses = classes.filter((c) => tailwindClasses.has(c)); // keep only class names present in the tailwindClasses set
+            x.setAttribute('class', newClasses.join(' ')); // the attribute is rewritten, not removed; it becomes '' when nothing survives
+        });
+        jsdom.window.document.querySelectorAll('[style]').forEach((x) => {
+            const style = x.getAttribute('style')?.toLocaleLowerCase() || '';
+            if (style.startsWith('display: none')) { // keep the whole style attribute when it begins with exactly 'display: none' (case-insensitive)
+                return;
+            }
+            x.removeAttribute('style'); // every other inline style is dropped
+        });
+        const dt = Date.now() - t0;
+        if (dt > 1000) { // log a warning when cleaning took more than one second
+            this.logger.warn(`Performance issue: Cleaning HTML for LMs took ${dt}ms`, { dt });
+        }
+        return this.cleanRedundantEmptyLines(jsdom.window.document.documentElement.outerHTML); // serialize from the root element and collapse blank-line runs
+    }
     snippetToElement(snippet?: string, url?: string) {
         const parsed = this.linkedom.parseHTML(snippet || '<html><body></body></html>');

backend/functions/src/services/lm.ts CHANGED Viewed

@@ -6,6 +6,7 @@ import { Logger } from '../shared/services/logger';
 import _ from 'lodash';
 import { AssertionFailureError } from 'civkit';
 import { LLMManager } from '../shared/services/common-llm';
 const tripleBackTick = '```';
@@ -16,7 +17,8 @@ export class LmControl extends AsyncService {
     constructor(
         protected globalLogger: Logger,
-        protected commonLLM: LLMManager
     ) {
         super(...arguments);
     }
@@ -27,13 +29,6 @@ export class LmControl extends AsyncService {
         this.emit('ready');
     }
-    cleanRedundantEmptyLines(text: string) {
-        const lines = text.split(/\r?\n/g);
-        const mappedFlag = lines.map((line) => Boolean(line.trim()));
-        return lines.filter((_line, i) => mappedFlag[i] || mappedFlag[i - 1]).join('\n');
-    }
     async* geminiFromBrowserSnapshot(snapshot?: PageSnapshot & {
         pageshotUrl?: string,
     }) {
@@ -43,9 +38,11 @@ export class LmControl extends AsyncService {
             throw new AssertionFailureError('Screenshot of the page is not available');
         }
         const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
             prompt: [
-                `HTML: \n${this.cleanRedundantEmptyLines(snapshot.html)}\n\nSCREENSHOT: \n`,
                 typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
                 `Convert this webpage into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
             ],
@@ -76,8 +73,11 @@ export class LmControl extends AsyncService {
         if (!snapshot) {
             throw new AssertionFailureError('Snapshot of the page is not available');
         }
         const it = this.commonLLM.iterRun('readerlm-v2', {
-            prompt: `Extract the main content from the given HTML and convert it to Markdown format.\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n`,
             options: {
                 // system: 'You are an AI assistant developed by Jina AI',
@@ -105,8 +105,11 @@ export class LmControl extends AsyncService {
         if (!snapshot) {
             throw new AssertionFailureError('Snapshot of the page is not available');
         }
         const it = this.commonLLM.iterRun('readerlm-v2', {
-            prompt: `${instruction}\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n${schema ? `The JSON schema:\n${tripleBackTick}json\n${schema}\n${tripleBackTick}\n` : ''}`,
             options: {
                 // system: 'You are an AI assistant developed by Jina AI',
                 stream: true

 import _ from 'lodash';
 import { AssertionFailureError } from 'civkit';
 import { LLMManager } from '../shared/services/common-llm';
+import { JSDomControl } from './jsdom';
 const tripleBackTick = '```';
     /**
      * Declares the injected collaborators as protected members through
      * constructor parameter properties: the logger, the LLM manager whose
      * `iterRun` the generator methods call, and the JSDOM control whose
      * `cleanHTMLforLMs` prepares snapshot HTML for the prompts.
      */
     constructor(
         protected globalLogger: Logger,
+        protected commonLLM: LLMManager,
+        protected jsdomControl: JSDomControl,
     ) {
         // Pass every received constructor argument on to the parent class unchanged.
         super(...arguments);
     }
         this.emit('ready');
     }
     async* geminiFromBrowserSnapshot(snapshot?: PageSnapshot & {
         pageshotUrl?: string,
     }) {
             throw new AssertionFailureError('Screenshot of the page is not available');
         }
+        const html = await this.jsdomControl.cleanHTMLforLMs(snapshot.html, 'script,link,style,textarea,select>option,svg')
         const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
             prompt: [
+                `HTML: \n${html}\n\nSCREENSHOT: \n`,
                 typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
                 `Convert this webpage into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
             ],
         if (!snapshot) {
             throw new AssertionFailureError('Snapshot of the page is not available');
         }
+        const html = await this.jsdomControl.cleanHTMLforLMs(snapshot.html, 'script,link,style,textarea,select>option,svg');
         const it = this.commonLLM.iterRun('readerlm-v2', {
+            prompt: `Extract the main content from the given HTML and convert it to Markdown format.\n\n${tripleBackTick}html\n${html}\n${tripleBackTick}\n`,
             options: {
                 // system: 'You are an AI assistant developed by Jina AI',
         if (!snapshot) {
             throw new AssertionFailureError('Snapshot of the page is not available');
         }
+        const html = await this.jsdomControl.cleanHTMLforLMs(snapshot.html, 'script,link,style,textarea,select>option,svg');
         const it = this.commonLLM.iterRun('readerlm-v2', {
+            prompt: `${instruction}\n\n${tripleBackTick}html\n${html}\n${tripleBackTick}\n${schema ? `The JSON schema:\n${tripleBackTick}json\n${schema}\n${tripleBackTick}\n` : ''}`,
             options: {
                 // system: 'You are an AI assistant developed by Jina AI',
                 stream: true

backend/functions/src/services/puppeteer.ts CHANGED Viewed

@@ -63,7 +63,7 @@ export interface PageSnapshot {
 }
 /** A PageSnapshot that additionally carries `links` and `imgs`. */
 export interface ExtendedSnapshot extends PageSnapshot {
     // Plain object map keyed by URL (per the index-signature name).
     // NOTE(review): values are presumably the link text, since the formatter inverts
     // this map before rendering `- [k](v)` entries; confirm against inferSnapshot.
-    links: { [url: string]: string; };
     imgs: ImgBrief[];
 }

 }
 /** A PageSnapshot that additionally carries `links` and `imgs`. */
 export interface ExtendedSnapshot extends PageSnapshot {
     // Array of two-string tuples rather than an object map.
     // NOTE(review): presumably [label, href]; the snapshot formatter destructures
     // entries that way when filtering out `file:` and `javascript:` hrefs. Confirm.
+    links: [string, string][];
     imgs: ImgBrief[];
 }

backend/functions/src/services/snapshot-formatter.ts CHANGED Viewed

@@ -28,8 +28,8 @@ export interface FormattedPage {
     screenshot?: Buffer;
     pageshotUrl?: string;
     pageshot?: Buffer;
-    links?: { [k: string]: string; };
-    images?: { [k: string]: string; };
     warning?: string;
     usage?: {
         total_tokens?: number;
@@ -56,7 +56,7 @@ export function highlightedCodeBlock(turndownService: TurndownService) {
                 highlightRegExp.test(node.className)
             );
         },
-        replacement: (_content, node, options)=> {
             const className = (node as any).className || '';
             const language = (className.match(highlightRegExp) || [null, ''])[1];
@@ -178,7 +178,14 @@ export class SnapshotFormatter extends AsyncService {
             Object.defineProperty(f, 'textRepresentation', { value: snapshot.text, enumerable: false, configurable: true });
         }
-        if (modeOK && !mode.includes('markdown') && !mode.includes('content')) {
             const dt = Date.now() - t0;
             this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
@@ -391,7 +398,13 @@ export class SnapshotFormatter extends AsyncService {
                     .value();
         }
         if (this.threadLocal.get('withLinksSummary')) {
-            formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {});
         }
         Object.assign(f, formatted);
@@ -418,8 +431,14 @@ export class SnapshotFormatter extends AsyncService {
             }
             if (this.links) {
                 const linkSummaryChunks = ['Links/Buttons:'];
-                for (const [k, v] of Object.entries(this.links)) {
-                    linkSummaryChunks.push(`- [${k}](${v})`);
                 }
                 if (linkSummaryChunks.length === 1) {
                     linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
@@ -478,7 +497,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
         }
         if (this.threadLocal.get('withLinksSummary')) {
             inferred ??= this.jsdomControl.inferSnapshot(snapshot);
-            mixin.links = _.invert(inferred.links || {});
         }
         if (snapshot.status) {
             const code = snapshot.status;

     screenshot?: Buffer;
     pageshotUrl?: string;
     pageshot?: Buffer;
+    links?: { [k: string]: string; } | [string, string][];
+    images?: { [k: string]: string; } | [string, string][];
     warning?: string;
     usage?: {
         total_tokens?: number;
                 highlightRegExp.test(node.className)
             );
         },
+        replacement: (_content, node, options) => {
             const className = (node as any).className || '';
             const language = (className.match(highlightRegExp) || [null, ''])[1];
             Object.defineProperty(f, 'textRepresentation', { value: snapshot.text, enumerable: false, configurable: true });
         }
+        if (mode.includes('lm')) {
+            modeOK = true;
+            f.content = snapshot.parsed?.textContent;
+        }
+        if (modeOK && (mode.includes('lm') ||
+            (!mode.includes('markdown') && !mode.includes('content')))
+        ) {
             const dt = Date.now() - t0;
             this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
                     .value();
         }
         if (this.threadLocal.get('withLinksSummary')) {
+            const links = this.jsdomControl.inferSnapshot(snapshot).links;
+            if (this.threadLocal.get('withLinksSummary') === 'all') {
+                formatted.links = links;
+            } else {
+                formatted.links = _.fromPairs(links.filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:')));
+            }
         }
         Object.assign(f, formatted);
             }
             if (this.links) {
                 const linkSummaryChunks = ['Links/Buttons:'];
+                if (Array.isArray(this.links)) {
+                    for (const [k, v] of this.links) {
+                        linkSummaryChunks.push(`- [${k}](${v})`);
+                    }
+                } else {
+                    for (const [k, v] of Object.entries(this.links)) {
+                        linkSummaryChunks.push(`- [${k}](${v})`);
+                    }
                 }
                 if (linkSummaryChunks.length === 1) {
                     linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
         }
         if (this.threadLocal.get('withLinksSummary')) {
             inferred ??= this.jsdomControl.inferSnapshot(snapshot);
+            if (this.threadLocal.get('withLinksSummary') === 'all') {
+                mixin.links = inferred.links;
+            } else {
+                mixin.links = _.fromPairs(inferred.links.filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:')));
+            }
         }
         if (snapshot.status) {
             const code = snapshot.status;

backend/functions/src/utils/tailwind-classes.ts ADDED Viewed

The diff for this file is too large to render. See raw diff