// Directive prologue: enables strict mode for this file.
"use strict";
// Decorator application helper. Reuses an existing `this.__decorate` when one is defined.
// With fewer than 3 arguments the decorators are applied to `target` itself; otherwise they are
// applied to the member `key` (looking up its descriptor when `desc` is null). Decorators run
// from last to first, and Reflect.decorate is used instead when it is available.
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
    var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;
    if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc);
    else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
    // With more than 3 arguments the resulting descriptor (if any) is (re)defined on the target.
    return c > 3 && r && Object.defineProperty(target, key, r), r;
};
// Returns Reflect.metadata(k, v) when Reflect.metadata exists; otherwise returns undefined.
var __metadata = (this && this.__metadata) || function (k, v) {
    if (typeof Reflect === "object" && typeof Reflect.metadata === "function") return Reflect.metadata(k, v);
};
// Wraps `decorator` so that it is invoked as decorator(target, key, paramIndex).
var __param = (this && this.__param) || function (paramIndex, decorator) {
    return function (target, key) { decorator(target, key, paramIndex); }
};
// Returns `mod` as-is when it carries `__esModule`, otherwise wraps it as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
// NOTE(review): these temporaries are not used in the visible part of the file — presumably
// consumed by decorator metadata further down; confirm before removing.
var _a, _b, _c, _d, _e, _f;
Object.defineProperty(exports, "__esModule", { value: true });
// Export slot is initialised to undefined here; the class is presumably assigned later in the file.
exports.CrawlerHost = void 0;
const tsyringe_1 = require("tsyringe");
const url_1 = require("url");
const crypto_1 = require("crypto");
const lodash_1 = __importDefault(require("lodash"));
const civ_rpc_1 = require("civkit/civ-rpc");
const lang_1 = require("civkit/lang");
const defer_1 = require("civkit/defer");
const decorators_1 = require("civkit/decorators");
const fancy_file_1 = require("civkit/fancy-file");
const crawler_options_1 = require("../dto/crawler-options");
const crawled_1 = require("../db/crawled");
const domain_blockade_1 = require("../db/domain-blockade");
const transform_server_event_stream_1 = require("../lib/transform-server-event-stream");
const puppeteer_1 = require("../services/puppeteer");
const jsdom_1 = require("../services/jsdom");
const snapshot_formatter_1 = require("../services/snapshot-formatter");
const curl_1 =
require("../services/curl"); const lm_1 = require("../services/lm"); const misc_1 = require("../utils/misc"); const cf_browser_rendering_1 = require("../services/cf-browser-rendering"); const logger_1 = require("../services/logger"); const rate_limit_1 = require("../shared/services/rate-limit"); const async_context_1 = require("../services/async-context"); const registry_1 = require("../services/registry"); const errors_1 = require("../services/errors"); const openai_1 = require("../shared/utils/openai"); const proxy_provider_1 = require("../shared/services/proxy-provider"); const firebase_storage_bucket_1 = require("../shared/services/firebase-storage-bucket"); const jina_embeddings_auth_1 = require("../dto/jina-embeddings-auth"); const robots_text_1 = require("../services/robots-text"); const temp_file_1 = require("../services/temp-file"); const misc_2 = require("../services/misc"); const http_1 = require("civkit/http"); const geoip_1 = require("../services/geoip"); const indexProto = { toString: function () { return (0, lodash_1.default)(this) .toPairs() .map(([k, v]) => k ? 
`[${lodash_1.default.upperFirst(lodash_1.default.lowerCase(k))}] ${v}` : '') .value() .join('\n') + '\n'; } }; let CrawlerHost = class CrawlerHost extends civ_rpc_1.RPCHost { constructor(globalLogger, puppeteerControl, curlControl, cfBrowserRendering, proxyProvider, lmControl, jsdomControl, snapshotFormatter, firebaseObjectStorage, rateLimitControl, threadLocal, robotsTxtService, tempFileManager, geoIpService, miscService) { super(...arguments); this.globalLogger = globalLogger; this.puppeteerControl = puppeteerControl; this.curlControl = curlControl; this.cfBrowserRendering = cfBrowserRendering; this.proxyProvider = proxyProvider; this.lmControl = lmControl; this.jsdomControl = jsdomControl; this.snapshotFormatter = snapshotFormatter; this.firebaseObjectStorage = firebaseObjectStorage; this.rateLimitControl = rateLimitControl; this.threadLocal = threadLocal; this.robotsTxtService = robotsTxtService; this.tempFileManager = tempFileManager; this.geoIpService = geoIpService; this.miscService = miscService; this.logger = this.globalLogger.child({ service: this.constructor.name }); this.cacheRetentionMs = 1000 * 3600 * 24 * 7; this.cacheValidMs = 1000 * 3600; this.urlValidMs = 1000 * 3600 * 4; this.abuseBlockMs = 1000 * 3600; this.domainProfileRetentionMs = 1000 * 3600 * 24 * 30; this.batchedCaches = []; this.proxyIterMap = new WeakMap(); puppeteerControl.on('crawled', async (snapshot, options) => { if (!snapshot.title?.trim() && !snapshot.pdfs?.length) { return; } if (options.cookies?.length || options.private) { // Potential privacy issue, dont cache if cookies are used return; } if (options.injectFrameScripts?.length || options.injectPageScripts?.length || options.viewport) { // Potentially mangeled content, dont cache if scripts are injected return; } if (snapshot.isIntermediate) { return; } if (!snapshot.lastMutationIdle) { // Never reached mutationIdle, presumably too short timeout return; } if (options.locale) { Reflect.set(snapshot, 'locale', options.locale); } 
const analyzed = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html); if (analyzed.tokens < 200) { // Does not contain enough content if (snapshot.status !== 200) { return; } if (snapshot.html.includes('captcha') || snapshot.html.includes('cf-turnstile')) { return; } } await this.setToCache(options.url, snapshot); }); puppeteerControl.on('abuse', async (abuseEvent) => { this.logger.warn(`Abuse detected on ${abuseEvent.url}, blocking ${abuseEvent.url.hostname}`, { reason: abuseEvent.reason, sn: abuseEvent.sn }); await domain_blockade_1.DomainBlockade.save(domain_blockade_1.DomainBlockade.from({ domain: abuseEvent.url.hostname.toLowerCase(), triggerReason: `${abuseEvent.reason}`, triggerUrl: abuseEvent.url.toString(), createdAt: new Date(), expireAt: new Date(Date.now() + this.abuseBlockMs), })).catch((err) => { this.logger.warn(`Failed to save domain blockade for ${abuseEvent.url.hostname}`, { err: (0, lang_1.marshalErrorLike)(err) }); }); }); setInterval(() => { const thisBatch = this.batchedCaches; this.batchedCaches = []; if (!thisBatch.length) { return; } const batch = crawled_1.Crawled.DB.batch(); for (const x of thisBatch) { batch.set(crawled_1.Crawled.COLLECTION.doc(x._id), x.degradeForFireStore(), { merge: true }); } batch.commit() .then(() => { this.logger.debug(`Saved ${thisBatch.length} caches by batch`); }) .catch((err) => { this.logger.warn(`Failed to save cache in batch`, { err }); }); }, 1000 * 10 + Math.round(1000 * Math.random())).unref(); } async init() { await this.dependencyReady(); if (this.puppeteerControl.effectiveUA) { this.curlControl.impersonateChrome(this.puppeteerControl.effectiveUA); } this.emit('ready'); } async getIndex(auth) { const indexObject = Object.create(indexProto); Object.assign(indexObject, { usage1: 'https://r.jina.ai/YOUR_URL', usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY', homepage: 'https://jina.ai/reader', }); await auth?.solveUID(); if (auth && auth.user) { indexObject[''] = undefined; indexObject.authenticatedAs 
= `${auth.user.user_id} (${auth.user.full_name})`; indexObject.balanceLeft = auth.user.wallet.total_balance; } return indexObject; } async getIndexCtrl(ctx, auth) { const indexObject = await this.getIndex(auth); if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) { return indexObject; } return (0, civ_rpc_1.assignTransferProtocolMeta)(`${indexObject}`, { contentType: 'text/plain; charset=utf-8', envelope: null }); } async crawl(rpcReflect, ctx, auth, crawlerOptionsHeaderOnly, crawlerOptionsParamsAllowed) { const uid = await auth.solveUID(); let chargeAmount = 0; const crawlerOptions = ctx.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed; const tierPolicy = await this.saasAssertTierPolicy(crawlerOptions, auth); // Use koa ctx.URL, a standard URL object to avoid node.js framework prop naming confusion const targetUrl = await this.getTargetUrl((0, misc_1.tryDecodeURIComponent)(`${ctx.URL.pathname}${ctx.URL.search}`), crawlerOptions); if (!targetUrl) { return await this.getIndex(auth); } // Prevent circular crawling this.puppeteerControl.circuitBreakerHosts.add(ctx.hostname.toLowerCase()); if (uid) { const user = await auth.assertUser(); if (!(user.wallet.total_balance > 0)) { throw new errors_1.InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`); } const rateLimitPolicy = auth.getRateLimits('CRAWL') || [ parseInt(user.metadata?.speed_level) >= 2 ? 
rate_limit_1.RateLimitDesc.from({ occurrence: 5000, periodSeconds: 60 }) : rate_limit_1.RateLimitDesc.from({ occurrence: 500, periodSeconds: 60 }) ]; const apiRoll = await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'], ...rateLimitPolicy); rpcReflect.finally(() => { if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { return; } if (chargeAmount) { auth.reportUsage(chargeAmount, `reader-crawl`).catch((err) => { this.logger.warn(`Unable to report usage for ${uid}`, { err: (0, lang_1.marshalErrorLike)(err) }); }); apiRoll.chargeAmount = chargeAmount; } }); } else if (ctx.ip) { const apiRoll = await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.ip, ['CRAWL'], [ // 20 requests per minute new Date(Date.now() - 60 * 1000), 20 ]); rpcReflect.finally(() => { if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { return; } apiRoll.chargeAmount = chargeAmount; }); } if (!uid) { // Enforce no proxy is allocated for anonymous users due to abuse. 
crawlerOptions.proxy = 'none'; const blockade = (await domain_blockade_1.DomainBlockade.fromFirestoreQuery(domain_blockade_1.DomainBlockade.COLLECTION .where('domain', '==', targetUrl.hostname.toLowerCase()) .where('expireAt', '>=', new Date()) .limit(1)))[0]; if (blockade) { throw new errors_1.SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`); } } const crawlOpts = await this.configure(crawlerOptions); this.logger.info(`Accepting request from ${uid || ctx.ip}`, { opts: crawlerOptions }); if (crawlerOptions.robotsTxt) { await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt); } if (rpcReflect.signal.aborted) { return; } if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) { const sseStream = new transform_server_event_stream_1.OutputServerEventStream(); rpcReflect.return(sseStream); try { for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) { if (!scrapped) { continue; } if (rpcReflect.signal.aborted) { break; } const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, tierPolicy); sseStream.write({ event: 'data', data: formatted, }); if (chargeAmount && scrapped.pdfs?.length) { break; } } } catch (err) { this.logger.error(`Failed to crawl ${targetUrl}`, { err: (0, lang_1.marshalErrorLike)(err) }); sseStream.write({ event: 'error', data: (0, lang_1.marshalErrorLike)(err), }); } sseStream.end(); return sseStream; } let lastScrapped; if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) { try { for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) { lastScrapped = scrapped; if (rpcReflect.signal.aborted) { break; } if (!scrapped || 
!crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) { continue; } if (!scrapped.title) { continue; } const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, tierPolicy); if (scrapped?.pdfs?.length && !chargeAmount) { continue; } return formatted; } } catch (err) { if (!lastScrapped) { throw err; } } if (!lastScrapped) { if (crawlOpts.targetSelector) { throw new civ_rpc_1.AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`); } throw new civ_rpc_1.AssertionFailureError(`No content available for URL ${targetUrl}`); } const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, tierPolicy); return formatted; } if (crawlerOptions.isRequestingCompoundContentFormat()) { throw new civ_rpc_1.ParamValidationError({ path: 'respondWith', message: `You are requesting compound content format, please explicitly accept 'text/event-stream' or 'application/json' in header.` }); } try { for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) { lastScrapped = scrapped; if (rpcReflect.signal.aborted) { break; } if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) { continue; } if (!scrapped.title) { continue; } const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, tierPolicy); if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) { return (0, civ_rpc_1.assignTransferProtocolMeta)(`${formatted.textRepresentation}`, { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }); } if 
(crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) { return (0, civ_rpc_1.assignTransferProtocolMeta)(`${formatted.textRepresentation}`, { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }); } return (0, civ_rpc_1.assignTransferProtocolMeta)(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null }); } } catch (err) { if (!lastScrapped) { throw err; } } if (!lastScrapped) { if (crawlOpts.targetSelector) { throw new civ_rpc_1.AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`); } throw new civ_rpc_1.AssertionFailureError(`No content available for URL ${targetUrl}`); } const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, tierPolicy); if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) { return (0, civ_rpc_1.assignTransferProtocolMeta)(`${formatted.textRepresentation}`, { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }); } if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) { return (0, civ_rpc_1.assignTransferProtocolMeta)(`${formatted.textRepresentation}`, { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }); } return (0, civ_rpc_1.assignTransferProtocolMeta)(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null }); } async getTargetUrl(originPath, crawlerOptions) { let url = ''; const targetUrlFromGet = originPath.slice(1); if (crawlerOptions.pdf) { const pdfFile = crawlerOptions.pdf; const identifier = pdfFile instanceof fancy_file_1.FancyFile ? 
(await pdfFile.sha256Sum) : (0, crypto_1.randomUUID)(); url = `blob://pdf/${identifier}`; crawlerOptions.url ??= url; } else if (targetUrlFromGet) { url = targetUrlFromGet.trim(); } else if (crawlerOptions.url) { url = crawlerOptions.url.trim(); } if (!url) { throw new civ_rpc_1.ParamValidationError({ message: 'No URL provided', path: 'url' }); } const { url: safeURL, ips } = await this.miscService.assertNormalizedUrl(url); if (this.puppeteerControl.circuitBreakerHosts.has(safeURL.hostname.toLowerCase())) { throw new errors_1.SecurityCompromiseError({ message: `Circular hostname: ${safeURL.protocol}`, path: 'url' }); } crawlerOptions._hintIps = ips; return safeURL; } getUrlDigest(urlToCrawl) { const normalizedURL = new URL(urlToCrawl); if (!normalizedURL.hash.startsWith('#/')) { normalizedURL.hash = ''; } const normalizedUrl = normalizedURL.toString().toLowerCase(); const digest = snapshot_formatter_1.md5Hasher.hash(normalizedUrl.toString()); return digest; } async *queryCache(urlToCrawl, cacheTolerance) { const digest = this.getUrlDigest(urlToCrawl); const cache = (await (crawled_1.Crawled.fromFirestoreQuery(crawled_1.Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)).catch((err) => { this.logger.warn(`Failed to query cache, unknown issue`, { err }); // https://github.com/grpc/grpc-node/issues/2647 // https://github.com/googleapis/nodejs-firestore/issues/1023 // https://github.com/googleapis/nodejs-firestore/issues/1023 return undefined; })))?.[0]; yield cache; if (!cache) { return; } const age = Date.now() - cache.createdAt.valueOf(); const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance); this.logger.info(`${stale ? 
'Stale cache exists' : 'Cache hit'} for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, { url: urlToCrawl, digest, age, stale, cacheTolerance }); let snapshot; let screenshotUrl; let pageshotUrl; const preparations = [ this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => { snapshot = JSON.parse(r.toString('utf-8')); }), cache.screenshotAvailable ? this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => { screenshotUrl = r; }) : Promise.resolve(undefined), cache.pageshotAvailable ? this.firebaseObjectStorage.signDownloadUrl(`pageshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => { pageshotUrl = r; }) : Promise.resolve(undefined) ]; try { await Promise.all(preparations); } catch (_err) { // Swallow cache errors. return undefined; } yield { isFresh: !stale, ...cache, snapshot: { ...snapshot, screenshot: undefined, pageshot: undefined, screenshotUrl, pageshotUrl, } }; } async setToCache(urlToCrawl, snapshot) { const digest = this.getUrlDigest(urlToCrawl); this.logger.info(`Caching snapshot of ${urlToCrawl}...`, { url: urlToCrawl, digest, title: snapshot?.title, href: snapshot?.href }); const nowDate = new Date(); const cache = crawled_1.Crawled.from({ _id: (0, crypto_1.randomUUID)(), url: urlToCrawl.toString(), createdAt: nowDate, expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs), htmlSignificantlyModifiedByJs: snapshot.htmlSignificantlyModifiedByJs, urlPathDigest: digest, }); const savingOfSnapshot = this.firebaseObjectStorage.saveFile(`snapshots/${cache._id}`, Buffer.from(JSON.stringify({ ...snapshot, screenshot: undefined, pageshot: undefined, }), 'utf-8'), { metadata: { contentType: 'application/json', } }).then((r) => { cache.snapshotAvailable = true; return r; }); if (snapshot.screenshot) { await this.firebaseObjectStorage.saveFile(`screenshots/${cache._id}`, snapshot.screenshot, { metadata: { contentType: 
'image/png', } }); cache.screenshotAvailable = true; } if (snapshot.pageshot) { await this.firebaseObjectStorage.saveFile(`pageshots/${cache._id}`, snapshot.pageshot, { metadata: { contentType: 'image/png', } }); cache.pageshotAvailable = true; } await savingOfSnapshot; this.batchedCaches.push(cache); // const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => { // this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) }); // return undefined; // }); return cache; } async *iterSnapshots(urlToCrawl, crawlOpts, crawlerOpts) { // if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.VLM)) { // const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, { // ...crawlOpts, engine: ENGINE_TYPE.BROWSER // }, crawlerOpts); // yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot); // return; // } if (crawlerOpts?.respondWith.includes(crawler_options_1.CONTENT_FORMAT.READER_LM)) { const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, { ...crawlOpts, engine: crawlOpts?.engine || crawler_options_1.ENGINE_TYPE.AUTO, }, crawler_options_1.CrawlerOptions.from({ ...crawlerOpts, respondWith: 'html', })); if (!finalAutoSnapshot?.html) { throw new civ_rpc_1.AssertionFailureError(`Unexpected non HTML content for ReaderLM: ${urlToCrawl}`); } if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) { const jsonSchema = crawlerOpts.jsonSchema ? 
JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
                // Instruction / schema driven extraction: the schema (if any) is passed as pretty-printed JSON text.
                yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot);
                return;
            }
            // No instruction and no schema: plain markdown conversion.
            try {
                yield* this.lmControl.readerLMMarkdownFromSnapshot(finalAutoSnapshot);
            }
            catch (err) {
                // An HTTP 429 from the service is re-thrown as a capacity error; anything else propagates unchanged.
                if (err instanceof http_1.HTTPServiceError && err.status === 429) {
                    throw new errors_1.ServiceNodeResourceDrainError(`Reader LM is at capacity, please try again later.`);
                }
                throw err;
            }
            return;
        }
        // Every other response format goes through cachedScrap.
        yield* this.cachedScrap(urlToCrawl, crawlOpts, crawlerOpts);
    }
    // Async generator behind iterSnapshots. Only its beginning is visible in this part of the
    // file: caller-supplied HTML is narrowed and yielded directly, and the handling of a
    // caller-supplied PDF starts below.
    async *cachedScrap(urlToCrawl, crawlOpts, crawlerOpts) {
        if (crawlerOpts?.html) {
            // HTML was supplied with the request: build a snapshot around it, narrow it and stop.
            const snapshot = {
                href: urlToCrawl.toString(),
                html: crawlerOpts.html,
                title: '',
                text: '',
            };
            yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
            return;
        }
        if (crawlerOpts?.pdf) {
            // A non-FancyFile pdf value is decoded as base64 and handed to the temp file manager.
            const pdfFile = crawlerOpts.pdf instanceof fancy_file_1.FancyFile ?
                crawlerOpts.pdf : this.tempFileManager.cacheBuffer(Buffer.from(crawlerOpts.pdf, 'base64'));
            const pdfLocalPath = (0, url_1.pathToFileURL)((await pdfFile.filePath));
            const snapshot = {
                href: urlToCrawl.toString(),
                html: `