"use strict"; var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) { var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d; if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc); else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r; return c > 3 && r && Object.defineProperty(target, key, r), r; }; var __metadata = (this && this.__metadata) || function (k, v) { if (typeof Reflect === "object" && typeof Reflect.metadata === "function") return Reflect.metadata(k, v); }; var __param = (this && this.__param) || function (paramIndex, decorator) { return function (target, key) { decorator(target, key, paramIndex); } }; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; var _a, _b, _c, _d, _e, _f; Object.defineProperty(exports, "__esModule", { value: true }); exports.CrawlerHost = void 0; const tsyringe_1 = require("tsyringe"); const url_1 = require("url"); const crypto_1 = require("crypto"); const lodash_1 = __importDefault(require("lodash")); const civ_rpc_1 = require("civkit/civ-rpc"); const lang_1 = require("civkit/lang"); const defer_1 = require("civkit/defer"); const decorators_1 = require("civkit/decorators"); const fancy_file_1 = require("civkit/fancy-file"); const crawler_options_1 = require("../dto/crawler-options"); const crawled_1 = require("../db/crawled"); const domain_blockade_1 = require("../db/domain-blockade"); const transform_server_event_stream_1 = require("../lib/transform-server-event-stream"); const puppeteer_1 = require("../services/puppeteer"); const jsdom_1 = require("../services/jsdom"); const snapshot_formatter_1 = require("../services/snapshot-formatter"); const curl_1 = require("../services/curl"); const lm_1 = require("../services/lm"); const misc_1 = require("../utils/misc"); const cf_browser_rendering_1 = require("../services/cf-browser-rendering"); const logger_1 = require("../services/logger"); const rate_limit_1 = require("../shared/services/rate-limit"); const async_context_1 = require("../services/async-context"); const registry_1 = require("../services/registry"); const errors_1 = require("../services/errors"); const openai_1 = require("../shared/utils/openai"); const proxy_provider_1 = require("../shared/services/proxy-provider"); const firebase_storage_bucket_1 = require("../shared/services/firebase-storage-bucket"); const jina_embeddings_auth_1 = require("../dto/jina-embeddings-auth"); const robots_text_1 = require("../services/robots-text"); const temp_file_1 = require("../services/temp-file"); const misc_2 = require("../services/misc"); const http_1 = require("civkit/http"); const geoip_1 = require("../services/geoip"); const indexProto = { toString: function () { return (0, lodash_1.default)(this) .toPairs() .map(([k, v]) => k ? `[${lodash_1.default.upperFirst(lodash_1.default.lowerCase(k))}] ${v}` : '') .value() .join('\n') + '\n'; } }; let CrawlerHost = class CrawlerHost extends civ_rpc_1.RPCHost { constructor(globalLogger, puppeteerControl, curlControl, cfBrowserRendering, proxyProvider, lmControl, jsdomControl, snapshotFormatter, firebaseObjectStorage, rateLimitControl, threadLocal, robotsTxtService, tempFileManager, geoIpService, miscService) { super(...arguments); this.globalLogger = globalLogger; this.puppeteerControl = puppeteerControl; this.curlControl = curlControl; this.cfBrowserRendering = cfBrowserRendering; this.proxyProvider = proxyProvider; this.lmControl = lmControl; this.jsdomControl = jsdomControl; this.snapshotFormatter = snapshotFormatter; this.firebaseObjectStorage = firebaseObjectStorage; this.rateLimitControl = rateLimitControl; this.threadLocal = threadLocal; this.robotsTxtService = robotsTxtService; this.tempFileManager = tempFileManager; this.geoIpService = geoIpService; this.miscService = miscService; this.logger = this.globalLogger.child({ service: this.constructor.name }); this.cacheRetentionMs = 1000 * 3600 * 24 * 7; this.cacheValidMs = 1000 * 3600; this.urlValidMs = 1000 * 3600 * 4; this.abuseBlockMs = 1000 * 3600; this.domainProfileRetentionMs = 1000 * 3600 * 24 * 30; this.batchedCaches = []; this.proxyIterMap = new WeakMap(); puppeteerControl.on('crawled', async (snapshot, options) => { if (!snapshot.title?.trim() && !snapshot.pdfs?.length) { return; } if (options.cookies?.length || options.private) { // Potential privacy issue, dont cache if cookies are used return; } if (options.injectFrameScripts?.length || options.injectPageScripts?.length || options.viewport) { // Potentially mangeled content, dont cache if scripts are injected return; } if (snapshot.isIntermediate) { return; } if (!snapshot.lastMutationIdle) { // Never reached mutationIdle, presumably too short timeout return; } if (options.locale) { Reflect.set(snapshot, 'locale', options.locale); } const analyzed = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html); if (analyzed.tokens < 200) { // Does not contain enough content if (snapshot.status !== 200) { return; } if (snapshot.html.includes('captcha') || snapshot.html.includes('cf-turnstile')) { return; } } await this.setToCache(options.url, snapshot); }); puppeteerControl.on('abuse', async (abuseEvent) => { this.logger.warn(`Abuse detected on ${abuseEvent.url}, blocking ${abuseEvent.url.hostname}`, { reason: abuseEvent.reason, sn: abuseEvent.sn }); await domain_blockade_1.DomainBlockade.save(domain_blockade_1.DomainBlockade.from({ domain: abuseEvent.url.hostname.toLowerCase(), triggerReason: `${abuseEvent.reason}`, triggerUrl: abuseEvent.url.toString(), createdAt: new Date(), expireAt: new Date(Date.now() + this.abuseBlockMs), })).catch((err) => { this.logger.warn(`Failed to save domain blockade for ${abuseEvent.url.hostname}`, { err: (0, lang_1.marshalErrorLike)(err) }); }); }); setInterval(() => { const thisBatch = this.batchedCaches; this.batchedCaches = []; if (!thisBatch.length) { return; } const batch = crawled_1.Crawled.DB.batch(); for (const x of thisBatch) { batch.set(crawled_1.Crawled.COLLECTION.doc(x._id), x.degradeForFireStore(), { merge: true }); } batch.commit() .then(() => { this.logger.debug(`Saved ${thisBatch.length} caches by batch`); }) .catch((err) => { this.logger.warn(`Failed to save cache in batch`, { err }); }); }, 1000 * 10 + Math.round(1000 * Math.random())).unref(); } async init() { await this.dependencyReady(); if (this.puppeteerControl.effectiveUA) { this.curlControl.impersonateChrome(this.puppeteerControl.effectiveUA); } this.emit('ready'); } async getIndex(auth) { const indexObject = Object.create(indexProto); Object.assign(indexObject, { usage1: 'https://r.jina.ai/YOUR_URL', usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY', homepage: 'https://jina.ai/reader', }); await auth?.solveUID(); if (auth && auth.user) { indexObject[''] = undefined; indexObject.authenticatedAs = `${auth.user.user_id} (${auth.user.full_name})`; indexObject.balanceLeft = auth.user.wallet.total_balance; } return indexObject; } async getIndexCtrl(ctx, auth) { const indexObject = await this.getIndex(auth); if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) { return indexObject; } return (0, civ_rpc_1.assignTransferProtocolMeta)(`${indexObject}`, { contentType: 'text/plain; charset=utf-8', envelope: null }); } async crawl(rpcReflect, ctx, auth, crawlerOptionsHeaderOnly, crawlerOptionsParamsAllowed) { const uid = await auth.solveUID(); let chargeAmount = 0; const crawlerOptions = ctx.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed; const tierPolicy = await this.saasAssertTierPolicy(crawlerOptions, auth); // Use koa ctx.URL, a standard URL object to avoid node.js framework prop naming confusion const targetUrl = await this.getTargetUrl((0, misc_1.tryDecodeURIComponent)(`${ctx.URL.pathname}${ctx.URL.search}`), crawlerOptions); if (!targetUrl) { return await this.getIndex(auth); } // Prevent circular crawling this.puppeteerControl.circuitBreakerHosts.add(ctx.hostname.toLowerCase()); if (uid) { const user = await auth.assertUser(); if (!(user.wallet.total_balance > 0)) { throw new errors_1.InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`); } const rateLimitPolicy = auth.getRateLimits('CRAWL') || [ parseInt(user.metadata?.speed_level) >= 2 ? rate_limit_1.RateLimitDesc.from({ occurrence: 5000, periodSeconds: 60 }) : rate_limit_1.RateLimitDesc.from({ occurrence: 500, periodSeconds: 60 }) ]; const apiRoll = await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'], ...rateLimitPolicy); rpcReflect.finally(() => { if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { return; } if (chargeAmount) { auth.reportUsage(chargeAmount, `reader-crawl`).catch((err) => { this.logger.warn(`Unable to report usage for ${uid}`, { err: (0, lang_1.marshalErrorLike)(err) }); }); apiRoll.chargeAmount = chargeAmount; } }); } else if (ctx.ip) { const apiRoll = await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.ip, ['CRAWL'], [ // 20 requests per minute new Date(Date.now() - 60 * 1000), 20 ]); rpcReflect.finally(() => { if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { return; } apiRoll.chargeAmount = chargeAmount; }); } if (!uid) { // Enforce no proxy is allocated for anonymous users due to abuse. crawlerOptions.proxy = 'none'; const blockade = (await domain_blockade_1.DomainBlockade.fromFirestoreQuery(domain_blockade_1.DomainBlockade.COLLECTION .where('domain', '==', targetUrl.hostname.toLowerCase()) .where('expireAt', '>=', new Date()) .limit(1)))[0]; if (blockade) { throw new errors_1.SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`); } } const crawlOpts = await this.configure(crawlerOptions); this.logger.info(`Accepting request from ${uid || ctx.ip}`, { opts: crawlerOptions }); if (crawlerOptions.robotsTxt) { await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt); } if (rpcReflect.signal.aborted) { return; } if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) { const sseStream = new transform_server_event_stream_1.OutputServerEventStream(); rpcReflect.return(sseStream); try { for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) { if (!scrapped) { continue; } if (rpcReflect.signal.aborted) { break; } const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, tierPolicy); sseStream.write({ event: 'data', data: formatted, }); if (chargeAmount && scrapped.pdfs?.length) { break; } } } catch (err) { this.logger.error(`Failed to crawl ${targetUrl}`, { err: (0, lang_1.marshalErrorLike)(err) }); sseStream.write({ event: 'error', data: (0, lang_1.marshalErrorLike)(err), }); } sseStream.end(); return sseStream; } let lastScrapped; if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) { try { for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) { lastScrapped = scrapped; if (rpcReflect.signal.aborted) { break; } if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) { continue; } if (!scrapped.title) { continue; } const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, tierPolicy); if (scrapped?.pdfs?.length && !chargeAmount) { continue; } return formatted; } } catch (err) { if (!lastScrapped) { throw err; } } if (!lastScrapped) { if (crawlOpts.targetSelector) { throw new civ_rpc_1.AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`); } throw new civ_rpc_1.AssertionFailureError(`No content available for URL ${targetUrl}`); } const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, tierPolicy); return formatted; } if (crawlerOptions.isRequestingCompoundContentFormat()) { throw new civ_rpc_1.ParamValidationError({ path: 'respondWith', message: `You are requesting compound content format, please explicitly accept 'text/event-stream' or 'application/json' in header.` }); } try { for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) { lastScrapped = scrapped; if (rpcReflect.signal.aborted) { break; } if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) { continue; } if (!scrapped.title) { continue; } const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, tierPolicy); if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) { return (0, civ_rpc_1.assignTransferProtocolMeta)(`${formatted.textRepresentation}`, { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }); } if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) { return (0, civ_rpc_1.assignTransferProtocolMeta)(`${formatted.textRepresentation}`, { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }); } return (0, civ_rpc_1.assignTransferProtocolMeta)(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null }); } } catch (err) { if (!lastScrapped) { throw err; } } if (!lastScrapped) { if (crawlOpts.targetSelector) { throw new civ_rpc_1.AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`); } throw new civ_rpc_1.AssertionFailureError(`No content available for URL ${targetUrl}`); } const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, tierPolicy); if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) { return (0, civ_rpc_1.assignTransferProtocolMeta)(`${formatted.textRepresentation}`, { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }); } if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) { return (0, civ_rpc_1.assignTransferProtocolMeta)(`${formatted.textRepresentation}`, { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }); } return (0, civ_rpc_1.assignTransferProtocolMeta)(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null }); } async getTargetUrl(originPath, crawlerOptions) { let url = ''; const targetUrlFromGet = originPath.slice(1); if (crawlerOptions.pdf) { const pdfFile = crawlerOptions.pdf; const identifier = pdfFile instanceof fancy_file_1.FancyFile ? (await pdfFile.sha256Sum) : (0, crypto_1.randomUUID)(); url = `blob://pdf/${identifier}`; crawlerOptions.url ??= url; } else if (targetUrlFromGet) { url = targetUrlFromGet.trim(); } else if (crawlerOptions.url) { url = crawlerOptions.url.trim(); } if (!url) { throw new civ_rpc_1.ParamValidationError({ message: 'No URL provided', path: 'url' }); } const { url: safeURL, ips } = await this.miscService.assertNormalizedUrl(url); if (this.puppeteerControl.circuitBreakerHosts.has(safeURL.hostname.toLowerCase())) { throw new errors_1.SecurityCompromiseError({ message: `Circular hostname: ${safeURL.protocol}`, path: 'url' }); } crawlerOptions._hintIps = ips; return safeURL; } getUrlDigest(urlToCrawl) { const normalizedURL = new URL(urlToCrawl); if (!normalizedURL.hash.startsWith('#/')) { normalizedURL.hash = ''; } const normalizedUrl = normalizedURL.toString().toLowerCase(); const digest = snapshot_formatter_1.md5Hasher.hash(normalizedUrl.toString()); return digest; } async *queryCache(urlToCrawl, cacheTolerance) { const digest = this.getUrlDigest(urlToCrawl); const cache = (await (crawled_1.Crawled.fromFirestoreQuery(crawled_1.Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)).catch((err) => { this.logger.warn(`Failed to query cache, unknown issue`, { err }); // https://github.com/grpc/grpc-node/issues/2647 // https://github.com/googleapis/nodejs-firestore/issues/1023 // https://github.com/googleapis/nodejs-firestore/issues/1023 return undefined; })))?.[0]; yield cache; if (!cache) { return; } const age = Date.now() - cache.createdAt.valueOf(); const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance); this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, { url: urlToCrawl, digest, age, stale, cacheTolerance }); let snapshot; let screenshotUrl; let pageshotUrl; const preparations = [ this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => { snapshot = JSON.parse(r.toString('utf-8')); }), cache.screenshotAvailable ? this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => { screenshotUrl = r; }) : Promise.resolve(undefined), cache.pageshotAvailable ? this.firebaseObjectStorage.signDownloadUrl(`pageshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => { pageshotUrl = r; }) : Promise.resolve(undefined) ]; try { await Promise.all(preparations); } catch (_err) { // Swallow cache errors. return undefined; } yield { isFresh: !stale, ...cache, snapshot: { ...snapshot, screenshot: undefined, pageshot: undefined, screenshotUrl, pageshotUrl, } }; } async setToCache(urlToCrawl, snapshot) { const digest = this.getUrlDigest(urlToCrawl); this.logger.info(`Caching snapshot of ${urlToCrawl}...`, { url: urlToCrawl, digest, title: snapshot?.title, href: snapshot?.href }); const nowDate = new Date(); const cache = crawled_1.Crawled.from({ _id: (0, crypto_1.randomUUID)(), url: urlToCrawl.toString(), createdAt: nowDate, expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs), htmlSignificantlyModifiedByJs: snapshot.htmlSignificantlyModifiedByJs, urlPathDigest: digest, }); const savingOfSnapshot = this.firebaseObjectStorage.saveFile(`snapshots/${cache._id}`, Buffer.from(JSON.stringify({ ...snapshot, screenshot: undefined, pageshot: undefined, }), 'utf-8'), { metadata: { contentType: 'application/json', } }).then((r) => { cache.snapshotAvailable = true; return r; }); if (snapshot.screenshot) { await this.firebaseObjectStorage.saveFile(`screenshots/${cache._id}`, snapshot.screenshot, { metadata: { contentType: 'image/png', } }); cache.screenshotAvailable = true; } if (snapshot.pageshot) { await this.firebaseObjectStorage.saveFile(`pageshots/${cache._id}`, snapshot.pageshot, { metadata: { contentType: 'image/png', } }); cache.pageshotAvailable = true; } await savingOfSnapshot; this.batchedCaches.push(cache); // const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => { // this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) }); // return undefined; // }); return cache; } async *iterSnapshots(urlToCrawl, crawlOpts, crawlerOpts) { // if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.VLM)) { // const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, { // ...crawlOpts, engine: ENGINE_TYPE.BROWSER // }, crawlerOpts); // yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot); // return; // } if (crawlerOpts?.respondWith.includes(crawler_options_1.CONTENT_FORMAT.READER_LM)) { const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, { ...crawlOpts, engine: crawlOpts?.engine || crawler_options_1.ENGINE_TYPE.AUTO, }, crawler_options_1.CrawlerOptions.from({ ...crawlerOpts, respondWith: 'html', })); if (!finalAutoSnapshot?.html) { throw new civ_rpc_1.AssertionFailureError(`Unexpected non HTML content for ReaderLM: ${urlToCrawl}`); } if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) { const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined; yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot); return; } try { yield* this.lmControl.readerLMMarkdownFromSnapshot(finalAutoSnapshot); } catch (err) { if (err instanceof http_1.HTTPServiceError && err.status === 429) { throw new errors_1.ServiceNodeResourceDrainError(`Reader LM is at capacity, please try again later.`); } throw err; } return; } yield* this.cachedScrap(urlToCrawl, crawlOpts, crawlerOpts); } async *cachedScrap(urlToCrawl, crawlOpts, crawlerOpts) { if (crawlerOpts?.html) { const snapshot = { href: urlToCrawl.toString(), html: crawlerOpts.html, title: '', text: '', }; yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts); return; } if (crawlerOpts?.pdf) { const pdfFile = crawlerOpts.pdf instanceof fancy_file_1.FancyFile ? crawlerOpts.pdf : this.tempFileManager.cacheBuffer(Buffer.from(crawlerOpts.pdf, 'base64')); const pdfLocalPath = (0, url_1.pathToFileURL)((await pdfFile.filePath)); const snapshot = { href: urlToCrawl.toString(), html: ``, title: '', text: '', pdfs: [pdfLocalPath.href], }; yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts); return; } if (crawlOpts?.engine === crawler_options_1.ENGINE_TYPE.CURL || // deprecated name crawlOpts?.engine === 'direct') { let sideLoaded; try { sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ? await this.sideLoadWithAllocatedProxy(urlToCrawl, crawlOpts) : await this.curlControl.sideLoad(urlToCrawl, crawlOpts); } catch (err) { if (err instanceof errors_1.ServiceBadAttemptError) { throw new civ_rpc_1.AssertionFailureError(err.message); } throw err; } if (!sideLoaded?.file) { throw new civ_rpc_1.AssertionFailureError(`Remote server did not return a body: ${urlToCrawl}`); } const draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName); draftSnapshot.status = sideLoaded.status; draftSnapshot.statusText = sideLoaded.statusText; yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts); return; } if (crawlOpts?.engine === crawler_options_1.ENGINE_TYPE.CF_BROWSER_RENDERING) { const html = await this.cfBrowserRendering.fetchContent(urlToCrawl.href); const snapshot = { href: urlToCrawl.toString(), html, title: '', text: '', }; yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts); return; } const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs; const cacheIt = this.queryCache(urlToCrawl, cacheTolerance); let cache = (await cacheIt.next()).value; if (cache?.htmlSignificantlyModifiedByJs === false) { if (crawlerOpts && crawlerOpts.timeout === undefined) { crawlerOpts.respondTiming ??= crawler_options_1.RESPOND_TIMING.HTML; } } if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) { cache = (await cacheIt.next()).value; } cacheIt.return(undefined); if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) && (lodash_1.default.get(cache.snapshot, 'locale') === crawlOpts?.locale)) { if (cache.snapshot) { cache.snapshot.isFromCache = true; } yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts); return; } if (crawlOpts?.engine !== crawler_options_1.ENGINE_TYPE.BROWSER && !this.knownUrlThatSideLoadingWouldCrashTheBrowser(urlToCrawl)) { const sideLoadSnapshotPermitted = crawlerOpts?.browserIsNotRequired() && [crawler_options_1.RESPOND_TIMING.HTML, crawler_options_1.RESPOND_TIMING.VISIBLE_CONTENT].includes(crawlerOpts.presumedRespondTiming); try { const altOpts = { ...crawlOpts }; let sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ? await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts) : await this.curlControl.sideLoad(urlToCrawl, altOpts).catch((err) => { this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: (0, lang_1.marshalErrorLike)(err), href: urlToCrawl.href }); if (err instanceof civ_rpc_1.ApplicationError && !(err instanceof errors_1.ServiceBadAttemptError)) { return Promise.reject(err); } return this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts); }); if (!sideLoaded.file) { throw new errors_1.ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`); } const draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName).catch((err) => { if (err instanceof civ_rpc_1.ApplicationError) { return Promise.reject(new errors_1.ServiceBadAttemptError(err.message)); } return Promise.reject(err); }); draftSnapshot.status = sideLoaded.status; draftSnapshot.statusText = sideLoaded.statusText; if (sideLoaded.status == 200 && !sideLoaded.contentType.startsWith('text/html')) { yield draftSnapshot; return; } let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html); draftSnapshot.title ??= analyzed.title; draftSnapshot.isIntermediate = true; if (sideLoadSnapshotPermitted) { yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts); } let fallbackProxyIsUsed = false; if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy !== 'none') && !crawlOpts?.proxyUrl) && (analyzed.tokens < 42 || sideLoaded.status !== 200)) { const proxyLoaded = await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts); if (!proxyLoaded.file) { throw new errors_1.ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`); } const proxySnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, proxyLoaded.file, proxyLoaded.contentType, proxyLoaded.fileName).catch((err) => { if (err instanceof civ_rpc_1.ApplicationError) { return Promise.reject(new errors_1.ServiceBadAttemptError(err.message)); } return Promise.reject(err); }); proxySnapshot.status = proxyLoaded.status; proxySnapshot.statusText = proxyLoaded.statusText; if (proxyLoaded.status === 200 && crawlerOpts?.browserIsNotRequired()) { } analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html); if (proxyLoaded.status === 200 || analyzed.tokens >= 200) { proxySnapshot.isIntermediate = true; if (sideLoadSnapshotPermitted) { yield this.jsdomControl.narrowSnapshot(proxySnapshot, crawlOpts); } sideLoaded = proxyLoaded; fallbackProxyIsUsed = true; } } if (crawlOpts && (sideLoaded.status === 200 || analyzed.tokens >= 200 || crawlOpts.allocProxy)) { this.logger.info(`Side load seems to work, applying to crawler.`, { url: urlToCrawl.href }); crawlOpts.sideLoad ??= sideLoaded.sideLoadOpts; if (fallbackProxyIsUsed) { this.logger.info(`Proxy seems to salvage the page`, { url: urlToCrawl.href }); } } } catch (err) { this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: (0, lang_1.marshalErrorLike)(err), href: urlToCrawl.href }); if (err instanceof civ_rpc_1.ApplicationError && !(err instanceof errors_1.ServiceBadAttemptError) && !(err instanceof civ_rpc_1.DataStreamBrokenError)) { throw err; } } } else if (crawlOpts?.allocProxy && crawlOpts.allocProxy !== 'none' && !crawlOpts.proxyUrl) { const proxyUrl = await this.proxyProvider.alloc(this.figureOutBestProxyCountry(crawlOpts)); crawlOpts.proxyUrl = proxyUrl.href; } try { if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe || crawlOpts?.withShadowDom) { for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) { yield this.jsdomControl.narrowSnapshot(x, crawlOpts); } return; } yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts); } catch (err) { if (cache && !(err instanceof errors_1.SecurityCompromiseError)) { this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: (0, lang_1.marshalErrorLike)(err) }); yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts); return; } throw err; } } assignChargeAmount(formatted, saasTierPolicy) { if (!formatted) { return 0; } let amount = 0; if (formatted.content) { amount = (0, openai_1.countGPTToken)(formatted.content); } else if (formatted.description) { amount += (0, openai_1.countGPTToken)(formatted.description); } if (formatted.text) { amount += (0, openai_1.countGPTToken)(formatted.text); } if (formatted.html) { amount += (0, openai_1.countGPTToken)(formatted.html); } if (formatted.screenshotUrl || formatted.screenshot) { // OpenAI image token count for 1024x1024 image amount += 765; } if (saasTierPolicy) { amount = this.saasApplyTierPolicy(saasTierPolicy, amount); } Object.assign(formatted, { usage: { tokens: amount } }); (0, civ_rpc_1.assignMeta)(formatted, { usage: { tokens: amount } }); return amount; } async *scrapMany(urls, options, crawlerOpts) { const iterators = urls.map((url) => this.cachedScrap(url, options, crawlerOpts)); const results = iterators.map((_x) => undefined); let nextDeferred = (0, defer_1.Defer)(); let concluded = false; const handler = async (it, idx) => { try { for await (const x of it) { results[idx] = x; if (x) { nextDeferred.resolve(); nextDeferred = (0, defer_1.Defer)(); } } } catch (err) { this.logger.warn(`Failed to scrap ${urls[idx]}`, { err: (0, lang_1.marshalErrorLike)(err) }); } }; Promise.allSettled(iterators.map((it, idx) => handler(it, idx))).finally(() => { concluded = true; nextDeferred.resolve(); }); yield results; try { while (!concluded) { await nextDeferred.promise; yield results; } yield results; } finally { for (const x of iterators) { x.return(); } } } async configure(opts) { this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt); this.threadLocal.set('withLinksSummary', opts.withLinksSummary); this.threadLocal.set('withImagesSummary', opts.withImagesSummary); this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl); this.threadLocal.set('cacheTolerance', opts.cacheTolerance); this.threadLocal.set('withIframe', opts.withIframe); this.threadLocal.set('withShadowDom', opts.withShadowDom); this.threadLocal.set('userAgent', opts.userAgent); if (opts.timeout) { this.threadLocal.set('timeout', opts.timeout * 1000); } this.threadLocal.set('retainImages', opts.retainImages); this.threadLocal.set('noGfm', opts.noGfm); this.threadLocal.set('DNT', Boolean(opts.doNotTrack)); if (opts.markdown) { this.threadLocal.set('turndownOpts', opts.markdown); } const crawlOpts = { proxyUrl: opts.proxyUrl, cookies: opts.setCookies, favorScreenshot: ['screenshot', 'pageshot'].some((x) => opts.respondWith.includes(x)), removeSelector: opts.removeSelector, targetSelector: opts.targetSelector, waitForSelector: opts.waitForSelector, overrideUserAgent: opts.userAgent, timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined, withIframe: opts.withIframe, withShadowDom: opts.withShadowDom, locale: opts.locale, referer: opts.referer, viewport: opts.viewport, engine: opts.engine, allocProxy: opts.proxy?.endsWith('+') ? opts.proxy.slice(0, -1) : opts.proxy, proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false, private: Boolean(opts.doNotTrack), }; if (crawlOpts.targetSelector?.length) { if (typeof crawlOpts.targetSelector === 'string') { crawlOpts.targetSelector = [crawlOpts.targetSelector]; } for (const s of crawlOpts.targetSelector) { for (const e of s.split(',').map((x) => x.trim())) { if (e.startsWith('*') || e.startsWith(':') || e.includes('*:')) { throw new civ_rpc_1.ParamValidationError({ message: `Unacceptable selector: '${e}'. We cannot accept match-all selector for performance reasons. Sorry.`, path: 'targetSelector' }); } } } } if (opts._hintIps?.length) { const hints = await this.geoIpService.lookupCities(opts._hintIps); const board = {}; for (const x of hints) { if (x.country?.code) { board[x.country.code] = (board[x.country.code] || 0) + 1; } } const hintCountry = lodash_1.default.maxBy(Array.from(Object.entries(board)), 1)?.[0]; crawlOpts.countryHint = hintCountry?.toLowerCase(); } if (opts.locale) { crawlOpts.extraHeaders ??= {}; crawlOpts.extraHeaders['Accept-Language'] = opts.locale; } if (opts.respondWith.includes(crawler_options_1.CONTENT_FORMAT.VLM)) { crawlOpts.favorScreenshot = true; } if (opts.injectFrameScript?.length) { crawlOpts.injectFrameScripts = (await Promise.all(opts.injectFrameScript.map((x) => { if (URL.canParse(x)) { return fetch(x).then((r) => r.text()); } return x; }))).filter(Boolean); } if (opts.injectPageScript?.length) { crawlOpts.injectPageScripts = (await Promise.all(opts.injectPageScript.map((x) => { if (URL.canParse(x)) { return fetch(x).then((r) => r.text()); } return x; }))).filter(Boolean); } return crawlOpts; } async formatSnapshot(crawlerOptions, snapshot, nominalUrl, urlValidMs, scrappingOptions) { const presumedURL = crawlerOptions.base === 'final' ? new URL(snapshot.href) : nominalUrl; const respondWith = crawlerOptions.respondWith; if (respondWith === crawler_options_1.CONTENT_FORMAT.READER_LM || respondWith === crawler_options_1.CONTENT_FORMAT.VLM) { const output = { title: snapshot.title, content: snapshot.parsed?.textContent, url: presumedURL?.href || snapshot.href, }; Object.defineProperty(output, 'textRepresentation', { value: snapshot.parsed?.textContent, enumerable: false, }); return output; } return this.formatSnapshotWithPDFSideLoad(respondWith, snapshot, presumedURL, urlValidMs, scrappingOptions); } async formatSnapshotWithPDFSideLoad(mode, snapshot, nominalUrl, urlValidMs, scrappingOptions) { const snapshotCopy = lodash_1.default.cloneDeep(snapshot); if (snapshotCopy.pdfs?.length) { const pdfUrl = snapshotCopy.pdfs[0]; if (pdfUrl.startsWith('http')) { const sideLoaded = scrappingOptions?.sideLoad?.impersonate[pdfUrl]; if (sideLoaded?.status === 200 && sideLoaded.body) { snapshotCopy.pdfs[0] = (0, url_1.pathToFileURL)(await sideLoaded?.body.filePath).href; return this.snapshotFormatter.formatSnapshot(mode, snapshotCopy, nominalUrl, urlValidMs); } const r = await this.curlControl.sideLoad(new URL(pdfUrl), scrappingOptions).catch((err) => { if (err instanceof errors_1.ServiceBadAttemptError) { return Promise.reject(new civ_rpc_1.AssertionFailureError(`Failed to load PDF(${pdfUrl}): ${err.message}`)); } return Promise.reject(err); }); if (r.status !== 200) { throw new civ_rpc_1.AssertionFailureError(`Failed to load PDF(${pdfUrl}): Server responded status ${r.status}`); } if (!r.contentType.includes('application/pdf')) { throw new civ_rpc_1.AssertionFailureError(`Failed to load PDF(${pdfUrl}): Server responded with wrong content type ${r.contentType}`); } if (!r.file) { throw new civ_rpc_1.AssertionFailureError(`Failed to load PDF(${pdfUrl}): Server did not return a body`); } snapshotCopy.pdfs[0] = (0, url_1.pathToFileURL)(await r.file.filePath).href; } } return this.snapshotFormatter.formatSnapshot(mode, snapshotCopy, nominalUrl, urlValidMs); } async getFinalSnapshot(url, opts, crawlerOptions) { const it = this.cachedScrap(url, opts, crawlerOptions); let lastSnapshot; let lastError; try { for await (const x of it) { lastSnapshot = x; } } catch (err) { lastError = err; } if (!lastSnapshot && lastError) { throw lastError; } if (!lastSnapshot) { throw new civ_rpc_1.AssertionFailureError(`No content available`); } return lastSnapshot; } async simpleCrawl(mode, url, opts) { const it = this.iterSnapshots(url, { ...opts, minIntervalMs: 500 }); let lastSnapshot; let goodEnough = false; try { for await (const x of it) { lastSnapshot = x; if (goodEnough) { break; } if (lastSnapshot?.parsed?.content) { // After it's good enough, wait for next snapshot; goodEnough = true; } } } catch (err) { if (lastSnapshot) { return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs); } throw err; } if (!lastSnapshot) { throw new civ_rpc_1.AssertionFailureError(`No content available`); } return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs); } getDomainProfileUrlDigest(url) { const pathname = url.pathname; const pathVec = pathname.split('/'); const parentPath = pathVec.slice(0, -1).join('/'); const finalPath = parentPath || pathname; const key = url.origin.toLocaleLowerCase() + finalPath; return { digest: snapshot_formatter_1.md5Hasher.hash(key), path: finalPath, }; } async sideLoadWithAllocatedProxy(url, opts) { if (opts?.allocProxy === 'none') { return this.curlControl.sideLoad(url, opts); } let proxy; if (opts) { let it = this.proxyIterMap.get(opts); if (!it) { it = this.proxyProvider.iterAlloc(this.figureOutBestProxyCountry(opts)); this.proxyIterMap.set(opts, it); } proxy = (await it.next()).value; } proxy ??= await this.proxyProvider.alloc(this.figureOutBestProxyCountry(opts)); this.logger.debug(`Proxy allocated`, { proxy: proxy.href }); const r = await this.curlControl.sideLoad(url, { ...opts, proxyUrl: proxy.href, }); if (opts && opts.allocProxy) { opts.proxyUrl ??= proxy.href; } return { ...r, proxy }; } figureOutBestProxyCountry(opts) { if (!opts) { return 'auto'; } let draft; if (opts.allocProxy) { if (this.proxyProvider.supports(opts.allocProxy)) { draft = opts.allocProxy; } else if (opts.allocProxy === 'none') { return 'none'; } } if (opts.countryHint) { if (this.proxyProvider.supports(opts.countryHint)) { draft ??= opts.countryHint; } } draft ??= opts.allocProxy || 'auto'; return draft; } knownUrlThatSideLoadingWouldCrashTheBrowser(url) { if (url.hostname === 'chromewebstore.google.com') { return true; } return false; } async saasAssertTierPolicy(opts, auth) { let chargeScalar = 1; let minimalCharge = 0; if (opts.withGeneratedAlt) { await auth.assertTier(0, 'Alt text generation'); minimalCharge = 765; } if (opts.injectPageScript || opts.injectFrameScript) { await auth.assertTier(0, 'Script injection'); minimalCharge = 4_000; } if (opts.withIframe) { await auth.assertTier(0, 'Iframe'); } if (opts.engine === crawler_options_1.ENGINE_TYPE.CF_BROWSER_RENDERING) { await auth.assertTier(0, 'Cloudflare browser rendering'); minimalCharge = 4_000; } if (opts.respondWith.includes('lm') || opts.engine?.includes('lm')) { await auth.assertTier(0, 'Language model'); minimalCharge = 4_000; chargeScalar = 3; } if (opts.proxy && opts.proxy !== 'none') { await auth.assertTier(['auto', 'any'].includes(opts.proxy) ? 0 : 2, 'Proxy allocation'); chargeScalar = 5; } return { budget: opts.tokenBudget || 0, chargeScalar, minimalCharge, }; } saasApplyTierPolicy(policy, chargeAmount) { const effectiveChargeAmount = policy.chargeScalar * Math.max(chargeAmount, policy.minimalCharge); if (policy.budget && policy.budget < effectiveChargeAmount) { throw new errors_1.BudgetExceededError(`Token budget (${policy.budget}) exceeded, intended charge amount ${effectiveChargeAmount}`); } return effectiveChargeAmount; } }; exports.CrawlerHost = CrawlerHost; __decorate([ (0, registry_1.Method)({ name: 'getIndex', description: 'Index of the service', proto: { http: { action: 'get', path: '/', } }, tags: ['misc', 'crawl'], returnType: [String, Object], }), __param(0, (0, registry_1.Ctx)()), __param(1, (0, registry_1.Param)({ required: false })), __metadata("design:type", Function), __metadata("design:paramtypes", [typeof (_c = typeof registry_1.Context !== "undefined" && registry_1.Context) === "function" ? _c : Object, jina_embeddings_auth_1.JinaEmbeddingsAuthDTO]), __metadata("design:returntype", Promise) ], CrawlerHost.prototype, "getIndexCtrl", null); __decorate([ (0, registry_1.Method)({ name: 'crawlByPostingToIndex', description: 'Crawl any url into markdown', proto: { http: { action: 'POST', path: '/', } }, tags: ['crawl'], returnType: [String, transform_server_event_stream_1.OutputServerEventStream], }), (0, registry_1.Method)({ description: 'Crawl any url into markdown', proto: { http: { action: ['GET', 'POST'], path: '::url', } }, tags: ['crawl'], returnType: [String, transform_server_event_stream_1.OutputServerEventStream, civ_rpc_1.RawString], }), __param(0, (0, registry_1.RPCReflect)()), __param(1, (0, registry_1.Ctx)()), __metadata("design:type", Function), __metadata("design:paramtypes", [typeof (_d = typeof civ_rpc_1.RPCReflection !== "undefined" && civ_rpc_1.RPCReflection) === "function" ? _d : Object, typeof (_e = typeof registry_1.Context !== "undefined" && registry_1.Context) === "function" ? _e : Object, jina_embeddings_auth_1.JinaEmbeddingsAuthDTO, crawler_options_1.CrawlerOptionsHeaderOnly, crawler_options_1.CrawlerOptions]), __metadata("design:returntype", Promise) ], CrawlerHost.prototype, "crawl", null); __decorate([ (0, decorators_1.retryWith)((err) => { if (err instanceof errors_1.ServiceBadApproachError) { return false; } if (err instanceof errors_1.ServiceBadAttemptError) { // Keep trying return true; } if (err instanceof civ_rpc_1.ApplicationError) { // Quit with this error return false; } return undefined; }, 3), __metadata("design:type", Function), __metadata("design:paramtypes", [typeof (_f = typeof URL !== "undefined" && URL) === "function" ? _f : Object, Object]), __metadata("design:returntype", Promise) ], CrawlerHost.prototype, "sideLoadWithAllocatedProxy", null); exports.CrawlerHost = CrawlerHost = __decorate([ (0, tsyringe_1.singleton)(), __metadata("design:paramtypes", [logger_1.GlobalLogger, puppeteer_1.PuppeteerControl, curl_1.CurlControl, cf_browser_rendering_1.CFBrowserRendering, typeof (_a = typeof proxy_provider_1.ProxyProviderService !== "undefined" && proxy_provider_1.ProxyProviderService) === "function" ? _a : Object, lm_1.LmControl, jsdom_1.JSDomControl, snapshot_formatter_1.SnapshotFormatter, firebase_storage_bucket_1.FirebaseStorageBucketControl, typeof (_b = typeof rate_limit_1.RateLimitControl !== "undefined" && rate_limit_1.RateLimitControl) === "function" ? _b : Object, async_context_1.AsyncLocalContext, robots_text_1.RobotsTxtService, temp_file_1.TempFileManager, geoip_1.GeoIPService, misc_2.MiscService]) ], CrawlerHost); //# sourceMappingURL=crawler.js.map