"use strict"; var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) { var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d; if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc); else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r; return c > 3 && r && Object.defineProperty(target, key, r), r; }; var __metadata = (this && this.__metadata) || function (k, v) { if (typeof Reflect === "object" && typeof Reflect.metadata === "function") return Reflect.metadata(k, v); }; var __param = (this && this.__param) || function (paramIndex, decorator) { return function (target, key) { decorator(target, key, paramIndex); } }; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; var _a, _b, _c, _d, _e; Object.defineProperty(exports, "__esModule", { value: true }); exports.SerpHost = void 0; const tsyringe_1 = require("tsyringe"); const civ_rpc_1 = require("civkit/civ-rpc"); const lang_1 = require("civkit/lang"); const lodash_1 = __importDefault(require("lodash")); const rate_limit_1 = require("../shared/services/rate-limit"); const logger_1 = require("../services/logger"); const async_context_1 = require("../services/async-context"); const registry_1 = require("../services/registry"); const transform_server_event_stream_1 = require("../lib/transform-server-event-stream"); const jina_embeddings_auth_1 = require("../dto/jina-embeddings-auth"); const errors_1 = require("../services/errors"); const serper_search_1 = require("../shared/3rd-party/serper-search"); const google_1 = require("../services/serp/google"); const crawler_options_1 = require("../dto/crawler-options"); const hash_1 = require("civkit/hash"); const searched_1 = require("../db/searched"); const serper_1 = require("../services/serp/serper"); const lru_cache_1 = require("lru-cache"); const api_roll_1 = require("../shared/db/api-roll"); const internal_1 = require("../services/serp/internal"); const WORLD_COUNTRY_CODES = Object.keys(serper_search_1.WORLD_COUNTRIES).map((x) => x.toLowerCase()); const indexProto = { toString: function () { return (0, lodash_1.default)(this) .toPairs() .map(([k, v]) => k ? `[${lodash_1.default.upperFirst(lodash_1.default.lowerCase(k))}] ${v}` : '') .value() .join('\n') + '\n'; } }; let SerpHost = class SerpHost extends civ_rpc_1.RPCHost { async getIndex(ctx, auth) { const indexObject = Object.create(indexProto); Object.assign(indexObject, { usage1: 'https://r.jina.ai/YOUR_URL', usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY', usage3: `${ctx.origin}/?q=YOUR_SEARCH_QUERY`, homepage: 'https://jina.ai/reader', }); if (auth && auth.user) { indexObject[''] = undefined; indexObject.authenticatedAs = `${auth.user.user_id} (${auth.user.full_name})`; indexObject.balanceLeft = auth.user.wallet.total_balance; } else { indexObject.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.'; } return indexObject; } constructor(globalLogger, rateLimitControl, threadLocal, googleSerp, serperGoogle, serperBing, jinaSerp) { super(...arguments); this.globalLogger = globalLogger; this.rateLimitControl = rateLimitControl; this.threadLocal = threadLocal; this.googleSerp = googleSerp; this.serperGoogle = serperGoogle; this.serperBing = serperBing; this.jinaSerp = jinaSerp; this.logger = this.globalLogger.child({ service: this.constructor.name }); this.cacheRetentionMs = 1000 * 3600 * 24 * 7; this.cacheValidMs = 1000 * 3600; this.pageCacheToleranceMs = 1000 * 3600 * 24; this.reasonableDelayMs = 15_000; this.targetResultCount = 5; this.highFreqKeyCache = new lru_cache_1.LRUCache({ max: 256, ttl: 60 * 60 * 1000, updateAgeOnGet: false, updateAgeOnHas: false, }); this.batchedCaches = []; setInterval(() => { const thisBatch = this.batchedCaches; this.batchedCaches = []; if (!thisBatch.length) { return; } const batch = searched_1.SERPResult.DB.batch(); for (const x of thisBatch) { batch.set(searched_1.SERPResult.COLLECTION.doc(), x.degradeForFireStore()); } batch.commit() .then(() => { this.logger.debug(`Saved ${thisBatch.length} caches by batch`); }) .catch((err) => { this.logger.warn(`Failed to cache search result in batch`, { err }); }); }, 1000 * 10 + Math.round(1000 * Math.random())).unref(); } async init() { await this.dependencyReady(); this.emit('ready'); } async search(rpcReflect, ctx, crawlerOptions, auth, variant, q, searchEngine, num, gl, _hl, location, page, fallback) { const authToken = auth.bearerToken; let highFreqKey; if (authToken && this.highFreqKeyCache.has(authToken)) { highFreqKey = this.highFreqKeyCache.get(authToken); auth.user = highFreqKey.user; auth.uid = highFreqKey.user?.user_id; } const uid = await auth.solveUID(); if (!q) { if (ctx.path === '/') { const indexObject = await this.getIndex(ctx, auth); if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) { return indexObject; } return (0, civ_rpc_1.assignTransferProtocolMeta)(`${indexObject}`, { contentType: 'text/plain; charset=utf-8', envelope: null }); } throw new civ_rpc_1.ParamValidationError({ path: 'q', message: `Required but not provided` }); } // Return content by default const user = await auth.assertUser(); if (!(user.wallet.total_balance > 0)) { throw new errors_1.InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`); } if (highFreqKey?.blockedUntil) { const now = new Date(); const blockedTimeRemaining = (highFreqKey.blockedUntil.valueOf() - now.valueOf()); if (blockedTimeRemaining > 0) { this.logger.warn(`Rate limit triggered for ${uid}, this request should have been blocked`); // throw RateLimitTriggeredError.from({ // message: `Per UID rate limit exceeded (async)`, // retryAfter: Math.ceil(blockedTimeRemaining / 1000), // }); } } const PREMIUM_KEY_LIMIT = 400; const rateLimitPolicy = auth.getRateLimits('SEARCH') || [ parseInt(user.metadata?.speed_level) >= 2 ? rate_limit_1.RateLimitDesc.from({ occurrence: PREMIUM_KEY_LIMIT, periodSeconds: 60 }) : rate_limit_1.RateLimitDesc.from({ occurrence: 40, periodSeconds: 60 }) ]; const apiRollPromise = this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['SEARCH'], ...rateLimitPolicy); if (!highFreqKey) { // Normal path await apiRollPromise; if (rateLimitPolicy.some((x) => { const rpm = x.occurrence / (x.periodSeconds / 60); if (rpm >= PREMIUM_KEY_LIMIT) { return true; } return false; })) { this.highFreqKeyCache.set(auth.bearerToken, { user, }); } } else { // High freq key path apiRollPromise.then( // Rate limit not triggered, make sure not blocking. () => { delete highFreqKey.blockedUntil; }, // Rate limit triggered (err) => { if (!(err instanceof rate_limit_1.RateLimitTriggeredError)) { return; } const now = Date.now(); let tgtDate; if (err.retryAfterDate) { tgtDate = err.retryAfterDate; } else if (err.retryAfter) { tgtDate = new Date(now + err.retryAfter * 1000); } if (tgtDate) { const dt = tgtDate.valueOf() - now; highFreqKey.blockedUntil = tgtDate; setTimeout(() => { if (highFreqKey.blockedUntil === tgtDate) { delete highFreqKey.blockedUntil; } }, dt).unref(); } }).finally(async () => { // Always asynchronously update user(wallet); const user = await auth.getBrief().catch(() => undefined); if (user) { highFreqKey.user = user; } }); } let chargeAmount = 0; rpcReflect.finally(async () => { if (chargeAmount) { auth.reportUsage(chargeAmount, `reader-search`).catch((err) => { this.logger.warn(`Unable to report usage for ${uid}`, { err: (0, lang_1.marshalErrorLike)(err) }); }); try { const apiRoll = await apiRollPromise; apiRoll.chargeAmount = chargeAmount; } catch (err) { await this.rateLimitControl.record({ uid, tags: [rpcReflect.name.toUpperCase()], status: api_roll_1.API_CALL_STATUS.SUCCESS, chargeAmount, }).save().catch((err) => { this.logger.warn(`Failed to save rate limit record`, { err: (0, lang_1.marshalErrorLike)(err) }); }); } } }); let chargeAmountScaler = 1; if (searchEngine === 'bing') { chargeAmountScaler = 3; } if (variant !== 'web') { chargeAmountScaler = 5; } let realQuery = q; let queryTerms = q.split(/\s+/g).filter((x) => !!x); let results = await this.cachedSearch(variant, { provider: searchEngine, q, num, gl, // hl, location, page, }, crawlerOptions); if (fallback && !results?.length && (!page || page === 1)) { let tryTimes = 1; const containsRTL = /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\u0590-\u05FF\uFB1D-\uFB4F\u0700-\u074F\u0780-\u07BF\u07C0-\u07FF]/.test(q); const lastResort = (containsRTL ? queryTerms.slice(queryTerms.length - 2) : queryTerms.slice(0, 2)).join(' '); const n = 4; let terms = []; while (tryTimes < n) { const delta = Math.ceil(queryTerms.length / n) * tryTimes; terms = containsRTL ? queryTerms.slice(delta) : queryTerms.slice(0, queryTerms.length - delta); const query = terms.join(' '); if (!query) { break; } if (realQuery === query) { continue; } tryTimes += 1; realQuery = query; this.logger.info(`Retrying search with fallback query: "${realQuery}"`); results = await this.cachedSearch(variant, { provider: searchEngine, q: realQuery, num, gl, // hl, location, }, crawlerOptions); if (results?.length) { break; } } if (!results?.length && realQuery.length > lastResort.length) { realQuery = lastResort; this.logger.info(`Retrying search with fallback query: "${realQuery}"`); tryTimes += 1; results = await this.cachedSearch(variant, { provider: searchEngine, q: realQuery, num, gl, // hl, location, }, crawlerOptions); } chargeAmountScaler *= tryTimes; } if (!results?.length) { results = []; } const finalResults = results.map((x) => this.mapToFinalResults(x)); await Promise.all(finalResults.map((x) => this.assignGeneralMixin(x))); chargeAmount = this.assignChargeAmount(finalResults, chargeAmountScaler); (0, civ_rpc_1.assignMeta)(finalResults, { query: realQuery, fallback: realQuery === q ? undefined : realQuery, }); return finalResults; } assignChargeAmount(items, scaler) { const numCharge = Math.ceil(items.length / 10) * 10000 * scaler; (0, civ_rpc_1.assignMeta)(items, { usage: { tokens: numCharge } }); return numCharge; } async getFavicon(domain) { const url = `https://www.google.com/s2/favicons?sz=32&domain_url=${domain}`; try { const response = await fetch(url); if (!response.ok) { return ''; } const ab = await response.arrayBuffer(); const buffer = Buffer.from(ab); const base64 = buffer.toString('base64'); return `data:image/png;base64,${base64}`; } catch (error) { this.logger.warn(`Failed to get favicon base64 string`, { err: (0, lang_1.marshalErrorLike)(error) }); return ''; } } async configure(opts) { const crawlOpts = { proxyUrl: opts.proxyUrl, cookies: opts.setCookies, overrideUserAgent: opts.userAgent, timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined, locale: opts.locale, referer: opts.referer, viewport: opts.viewport, proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false, allocProxy: opts.proxy?.endsWith('+') ? opts.proxy.slice(0, -1) : opts.proxy, }; if (opts.locale) { crawlOpts.extraHeaders ??= {}; crawlOpts.extraHeaders['Accept-Language'] = opts.locale; } return crawlOpts; } mapToFinalResults(input) { const whitelistedProps = [ 'imageUrl', 'imageWidth', 'imageHeight', 'source', 'date', 'siteLinks' ]; const result = { title: input.title, url: input.link, description: Reflect.get(input, 'snippet'), ...lodash_1.default.pick(input, whitelistedProps), }; return result; } *iterProviders(preference, variant) { if (preference === 'bing') { yield this.serperBing; yield this.serperGoogle; yield this.googleSerp; return; } if (preference === 'google') { yield this.googleSerp; yield this.googleSerp; yield this.serperGoogle; return; } // yield variant === 'web' ? this.jinaSerp : this.serperGoogle; yield this.serperGoogle; yield this.serperGoogle; yield this.googleSerp; } async cachedSearch(variant, query, opts) { const queryDigest = (0, hash_1.objHashMd5B64Of)({ ...query, variant }); const provider = query.provider; Reflect.deleteProperty(query, 'provider'); const noCache = opts.noCache; let cache; if (!noCache) { cache = (await searched_1.SERPResult.fromFirestoreQuery(searched_1.SERPResult.COLLECTION.where('queryDigest', '==', queryDigest) .orderBy('createdAt', 'desc') .limit(1)))[0]; if (cache) { const age = Date.now() - cache.createdAt.valueOf(); const stale = cache.createdAt.valueOf() < (Date.now() - this.cacheValidMs); this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for search query "${query.q}", normalized digest: ${queryDigest}, ${age}ms old`, { query, digest: queryDigest, age, stale }); if (!stale) { return cache.response; } } } const scrappingOptions = await this.configure(opts); try { let r; let lastError; outerLoop: for (const client of this.iterProviders(provider, variant)) { const t0 = Date.now(); try { switch (variant) { case 'images': { r = await Reflect.apply(client.imageSearch, client, [query, scrappingOptions]); break; } case 'news': { r = await Reflect.apply(client.newsSearch, client, [query, scrappingOptions]); break; } case 'web': default: { r = await Reflect.apply(client.webSearch, client, [query, scrappingOptions]); break; } } const dt = Date.now() - t0; this.logger.info(`Search took ${dt}ms, ${client.constructor.name}(${variant})`, { searchDt: dt, variant, client: client.constructor.name }); break outerLoop; } catch (err) { lastError = err; const dt = Date.now() - t0; this.logger.warn(`Failed to do ${variant} search using ${client.constructor.name}`, { err, variant, searchDt: dt, }); } } if (r?.length) { const nowDate = new Date(); const record = searched_1.SERPResult.from({ query, queryDigest, response: r, createdAt: nowDate, expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs) }); this.batchedCaches.push(record); } else if (lastError) { throw lastError; } return r; } catch (err) { if (cache) { this.logger.warn(`Failed to fetch search result, but a stale cache is available. falling back to stale cache`, { err: (0, lang_1.marshalErrorLike)(err) }); return cache.response; } throw err; } } async assignGeneralMixin(result) { const collectFavicon = this.threadLocal.get('collect-favicon'); if (collectFavicon && result.link) { const url = new URL(result.link); Reflect.set(result, 'favicon', await this.getFavicon(url.origin)); } } }; exports.SerpHost = SerpHost; __decorate([ (0, registry_1.Method)({ name: 'searchIndex', ext: { http: { action: ['get', 'post'], path: '/' } }, tags: ['search'], returnType: [String, transform_server_event_stream_1.OutputServerEventStream, civ_rpc_1.RawString], }), (0, registry_1.Method)({ ext: { http: { action: ['get', 'post'], } }, tags: ['search'], returnType: [String, transform_server_event_stream_1.OutputServerEventStream, civ_rpc_1.RawString], }), __param(0, (0, registry_1.RPCReflect)()), __param(1, (0, registry_1.Ctx)()), __param(4, (0, registry_1.Param)('type', { type: new Set(['web', 'images', 'news']), default: 'web' })), __param(5, (0, registry_1.Param)('q')), __param(6, (0, registry_1.Param)('provider', { type: new Set(['google', 'bing']) })), __param(7, (0, registry_1.Param)('num', { validate: (v) => v >= 0 && v <= 20 })), __param(8, (0, registry_1.Param)('gl', { validate: (v) => WORLD_COUNTRY_CODES.includes(v?.toLowerCase()) })), __param(9, (0, registry_1.Param)('hl', { validate: (v) => serper_search_1.WORLD_LANGUAGES.some(l => l.code === v) })), __param(10, (0, registry_1.Param)('location')), __param(11, (0, registry_1.Param)('page')), __param(12, (0, registry_1.Param)('fallback')), __metadata("design:type", Function), __metadata("design:paramtypes", [typeof (_d = typeof civ_rpc_1.RPCReflection !== "undefined" && civ_rpc_1.RPCReflection) === "function" ? _d : Object, typeof (_e = typeof registry_1.Context !== "undefined" && registry_1.Context) === "function" ? _e : Object, crawler_options_1.CrawlerOptions, jina_embeddings_auth_1.JinaEmbeddingsAuthDTO, String, String, String, Number, String, String, String, Number, Boolean]), __metadata("design:returntype", Promise) ], SerpHost.prototype, "search", null); exports.SerpHost = SerpHost = __decorate([ (0, tsyringe_1.singleton)(), __metadata("design:paramtypes", [logger_1.GlobalLogger, typeof (_a = typeof rate_limit_1.RateLimitControl !== "undefined" && rate_limit_1.RateLimitControl) === "function" ? _a : Object, async_context_1.AsyncLocalContext, google_1.GoogleSERP, typeof (_b = typeof serper_1.SerperGoogleSearchService !== "undefined" && serper_1.SerperGoogleSearchService) === "function" ? _b : Object, typeof (_c = typeof serper_1.SerperBingSearchService !== "undefined" && serper_1.SerperBingSearchService) === "function" ? _c : Object, internal_1.InternalJinaSerpService]) ], SerpHost); //# sourceMappingURL=serp.js.map