Spaces:
Build error
Build error
| ; | |
/**
 * TypeScript-emitted helper that applies a list of decorators.
 *
 * Reuses an existing `this.__decorate` when one is present; otherwise falls back to the
 * implementation below. When `Reflect.decorate` exists it is delegated to; otherwise the
 * decorators are applied manually from last to first.
 *
 * @param {Function[]} decorators Decorators to apply; falsy entries are skipped.
 * @param {*} target Class (2-argument form) or prototype/constructor owning the member.
 * @param {string|symbol} [key] Member name (3- and 4-argument forms).
 * @param {PropertyDescriptor|null} [desc] Member descriptor; `null` means "look it up on target".
 * @returns {*} The (possibly replaced) class in the 2-argument form, otherwise the resulting descriptor.
 */
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
    const argCount = arguments.length;
    let result;
    if (argCount < 3) {
        // Class decoration: the running result is the class itself.
        result = target;
    }
    else if (desc === null) {
        // Member decoration with an explicit null descriptor: resolve the real one.
        desc = Object.getOwnPropertyDescriptor(target, key);
        result = desc;
    }
    else {
        result = desc;
    }
    if (typeof Reflect === "object" && typeof Reflect.decorate === "function") {
        result = Reflect.decorate(decorators, target, key, desc);
    }
    else {
        for (let i = decorators.length - 1; i >= 0; i--) {
            const decorator = decorators[i];
            if (!decorator) {
                continue;
            }
            let decorated;
            if (argCount < 3) {
                decorated = decorator(result);
            }
            else if (argCount > 3) {
                decorated = decorator(target, key, result);
            }
            else {
                decorated = decorator(target, key);
            }
            // A decorator that returns nothing leaves the running result untouched.
            result = decorated || result;
        }
    }
    if (argCount > 3 && result) {
        Object.defineProperty(target, key, result);
    }
    return result;
};
/**
 * TypeScript-emitted helper that produces a metadata decorator.
 *
 * Delegates to `Reflect.metadata(k, v)` when that function exists (it is not part of the
 * standard `Reflect` object, so it has to be supplied by a polyfill). When it is missing
 * the helper returns `undefined`, which `__decorate` skips.
 *
 * @param {*} k Metadata key.
 * @param {*} v Metadata value.
 * @returns {Function|undefined} Whatever `Reflect.metadata` returns, or `undefined`.
 */
var __metadata = (this && this.__metadata) || function (k, v) {
    if (typeof Reflect === "object" && typeof Reflect.metadata === "function") {
        return Reflect.metadata(k, v);
    }
    return undefined;
};
/**
 * TypeScript-emitted helper that adapts a parameter decorator to the
 * `(target, key)` shape expected by `__decorate`.
 *
 * @param {number} paramIndex Zero-based index of the decorated parameter.
 * @param {Function} decorator Parameter decorator, called as `decorator(target, key, paramIndex)`.
 * @returns {Function} A `(target, key)` function; its own return value is always `undefined`.
 */
var __param = (this && this.__param) || function (paramIndex, decorator) {
    return function (target, key) {
        decorator(target, key, paramIndex);
    };
};
/**
 * TypeScript-emitted interop helper for default imports.
 *
 * @param {*} mod The value returned by `require(...)`.
 * @returns {*} `mod` itself when it is flagged `__esModule`, otherwise `{ default: mod }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
| var _a, _b, _c, _d, _e; | |
| Object.defineProperty(exports, "__esModule", { value: true }); | |
| exports.SerpHost = void 0; | |
| const tsyringe_1 = require("tsyringe"); | |
| const civ_rpc_1 = require("civkit/civ-rpc"); | |
| const lang_1 = require("civkit/lang"); | |
| const lodash_1 = __importDefault(require("lodash")); | |
| const rate_limit_1 = require("../shared/services/rate-limit"); | |
| const logger_1 = require("../services/logger"); | |
| const async_context_1 = require("../services/async-context"); | |
| const registry_1 = require("../services/registry"); | |
| const transform_server_event_stream_1 = require("../lib/transform-server-event-stream"); | |
| const jina_embeddings_auth_1 = require("../dto/jina-embeddings-auth"); | |
| const errors_1 = require("../services/errors"); | |
| const serper_search_1 = require("../shared/3rd-party/serper-search"); | |
| const google_1 = require("../services/serp/google"); | |
| const crawler_options_1 = require("../dto/crawler-options"); | |
| const hash_1 = require("civkit/hash"); | |
| const searched_1 = require("../db/searched"); | |
| const serper_1 = require("../services/serp/serper"); | |
| const lru_cache_1 = require("lru-cache"); | |
| const api_roll_1 = require("../shared/db/api-roll"); | |
| const internal_1 = require("../services/serp/internal"); | |
// Lower-cased keys of the serper WORLD_COUNTRIES table; used to validate the `gl` parameter.
const WORLD_COUNTRY_CODES = Object.keys(serper_search_1.WORLD_COUNTRIES).map((x) => x.toLowerCase());
/**
 * Prototype for the index object returned by `SerpHost#getIndex`.
 *
 * `toString` renders each key/value pair of the object as one `[Key label] value` line,
 * where the label is the key passed through lodash `lowerCase` then `upperFirst`.
 * A pair whose key is the empty string renders as an empty line, which callers use as a
 * visual separator. The output always ends with a trailing newline.
 */
const indexProto = {
    toString: function () {
        return (0, lodash_1.default)(this)
            .toPairs()
            .map(([k, v]) => k ? `[${lodash_1.default.upperFirst(lodash_1.default.lowerCase(k))}] ${v}` : '')
            .value()
            .join('\n') + '\n';
    }
};
/**
 * RPC host serving the search (SERP) endpoint.
 *
 * Responsibilities visible in this class:
 *  - authenticating the caller, checking balance and applying per-UID rate limits;
 *  - running a search through one of several providers with provider failover;
 *  - caching provider responses (read on each query, written in periodic batches);
 *  - optionally retrying with shortened queries when `fallback` is requested;
 *  - computing and reporting the usage charge.
 *
 * `jinaSerp` is injected and stored but not used by any active code path in this class.
 */
let SerpHost = class SerpHost extends civ_rpc_1.RPCHost {
    /**
     * Builds the index (landing) object: usage hints plus either the caller's identity and
     * balance (when `auth.user` is set) or a note that authentication is required.
     *
     * @param ctx Request context; only `ctx.origin` is read.
     * @param auth Auth DTO; only `auth.user` is read.
     * @returns An object whose prototype is `indexProto`, so it stringifies to plain text.
     */
    async getIndex(ctx, auth) {
        const indexObject = Object.create(indexProto);
        Object.assign(indexObject, {
            usage1: 'https://r.jina.ai/YOUR_URL',
            usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY',
            usage3: `${ctx.origin}/?q=YOUR_SEARCH_QUERY`,
            homepage: 'https://jina.ai/reader',
        });
        if (auth && auth.user) {
            // The empty key renders as a blank separator line in indexProto.toString().
            indexObject[''] = undefined;
            indexObject.authenticatedAs = `${auth.user.user_id} (${auth.user.full_name})`;
            indexObject.balanceLeft = auth.user.wallet.total_balance;
        }
        else {
            indexObject.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';
        }
        return indexObject;
    }
    /**
     * Stores the injected services, sets cache/timing constants and starts the periodic
     * (roughly every 10-11 seconds, unref'd) flush of queued cache records.
     */
    constructor(globalLogger, rateLimitControl, threadLocal, googleSerp, serperGoogle, serperBing, jinaSerp) {
        super(...arguments);
        this.globalLogger = globalLogger;
        this.rateLimitControl = rateLimitControl;
        this.threadLocal = threadLocal;
        this.googleSerp = googleSerp;
        this.serperGoogle = serperGoogle;
        this.serperBing = serperBing;
        this.jinaSerp = jinaSerp;
        this.logger = this.globalLogger.child({ service: this.constructor.name });
        // How long a cached record is kept (written as `expireAt`): 7 days.
        this.cacheRetentionMs = 1000 * 3600 * 24 * 7;
        // How long a cached record is served without re-querying a provider: 1 hour.
        this.cacheValidMs = 1000 * 3600;
        this.pageCacheToleranceMs = 1000 * 3600 * 24;
        this.reasonableDelayMs = 15_000;
        this.targetResultCount = 5;
        // Bearer token -> { user, blockedUntil? } for keys whose rate limit is high enough
        // that the rate-limit check is done asynchronously (see search()).
        this.highFreqKeyCache = new lru_cache_1.LRUCache({
            max: 256,
            ttl: 60 * 60 * 1000,
            updateAgeOnGet: false,
            updateAgeOnHas: false,
        });
        // Cache records waiting to be written by the interval below.
        this.batchedCaches = [];
        setInterval(() => {
            // Swap the queue first so records pushed during the commit land in the next batch.
            const thisBatch = this.batchedCaches;
            this.batchedCaches = [];
            if (!thisBatch.length) {
                return;
            }
            const batch = searched_1.SERPResult.DB.batch();
            for (const x of thisBatch) {
                batch.set(searched_1.SERPResult.COLLECTION.doc(), x.degradeForFireStore());
            }
            batch.commit()
                .then(() => {
                    this.logger.debug(`Saved ${thisBatch.length} caches by batch`);
                })
                .catch((err) => {
                    this.logger.warn(`Failed to cache search result in batch`, { err });
                });
        }, 1000 * 10 + Math.round(1000 * Math.random())).unref();
    }
    /** Waits for dependencies, then signals readiness. */
    async init() {
        await this.dependencyReady();
        this.emit('ready');
    }
    /**
     * Search endpoint.
     *
     * Without `q`: returns the index on path `/`, otherwise throws ParamValidationError.
     * With `q`: requires an authenticated user with positive balance, applies rate limits,
     * runs the (cached) search, optionally retries with shortened queries, and returns the
     * mapped results with `query`/`fallback`/`usage` metadata attached.
     *
     * @param rpcReflect RPC reflection handle; used for rate limiting and the post-response hook.
     * @param ctx Request context.
     * @param crawlerOptions Crawler options forwarded to cachedSearch().
     * @param auth Auth DTO.
     * @param variant Search type (`type` request parameter).
     * @param q Search query.
     * @param searchEngine Preferred provider (`provider` request parameter).
     * @param num Requested number of results.
     * @param gl Country code.
     * @param _hl Language code; accepted but not forwarded to providers.
     * @param location Location hint.
     * @param page Result page.
     * @param fallback When truthy, retry with shortened queries if the first page is empty.
     * @throws ParamValidationError when `q` is missing on a non-root path.
     * @throws InsufficientBalanceError when the user's wallet balance is not positive.
     */
    async search(rpcReflect, ctx, crawlerOptions, auth, variant, q, searchEngine, num, gl, _hl, location, page, fallback) {
        const authToken = auth.bearerToken;
        // A single get() replaces has()+get(): the entry can expire between those two calls,
        // which would leave get() returning undefined and the property reads below throwing.
        const highFreqKey = authToken ? this.highFreqKeyCache.get(authToken) : undefined;
        if (highFreqKey) {
            auth.user = highFreqKey.user;
            auth.uid = highFreqKey.user?.user_id;
        }
        const uid = await auth.solveUID();
        if (!q) {
            if (ctx.path === '/') {
                const indexObject = await this.getIndex(ctx, auth);
                if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
                    return indexObject;
                }
                return (0, civ_rpc_1.assignTransferProtocolMeta)(`${indexObject}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
            }
            throw new civ_rpc_1.ParamValidationError({
                path: 'q',
                message: `Required but not provided`
            });
        }
        const user = await auth.assertUser();
        if (!(user.wallet.total_balance > 0)) {
            throw new errors_1.InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
        }
        if (highFreqKey?.blockedUntil) {
            const now = new Date();
            const blockedTimeRemaining = (highFreqKey.blockedUntil.valueOf() - now.valueOf());
            if (blockedTimeRemaining > 0) {
                // Enforcement is currently disabled: the request is only logged, not rejected.
                this.logger.warn(`Rate limit triggered for ${uid}, this request should have been blocked`);
                // throw RateLimitTriggeredError.from({
                //     message: `Per UID rate limit exceeded (async)`,
                //     retryAfter: Math.ceil(blockedTimeRemaining / 1000),
                // });
            }
        }
        const PREMIUM_KEY_LIMIT = 400;
        const rateLimitPolicy = auth.getRateLimits('SEARCH') || [
            parseInt(user.metadata?.speed_level, 10) >= 2 ?
                rate_limit_1.RateLimitDesc.from({
                    occurrence: PREMIUM_KEY_LIMIT,
                    periodSeconds: 60
                }) :
                rate_limit_1.RateLimitDesc.from({
                    occurrence: 40,
                    periodSeconds: 60
                })
        ];
        const apiRollPromise = this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['SEARCH'], ...rateLimitPolicy);
        if (!highFreqKey) {
            // Normal path: wait for the rate-limit check before doing any work.
            await apiRollPromise;
            // Keys allowed at least PREMIUM_KEY_LIMIT requests per minute are promoted to the
            // high-frequency cache so later requests skip the synchronous wait.
            if (rateLimitPolicy.some((x) => {
                const rpm = x.occurrence / (x.periodSeconds / 60);
                if (rpm >= PREMIUM_KEY_LIMIT) {
                    return true;
                }
                return false;
            })) {
                this.highFreqKeyCache.set(auth.bearerToken, {
                    user,
                });
            }
        }
        else {
            // High freq key path: do not wait; record the outcome on the cache entry instead.
            apiRollPromise.then(
                // Rate limit not triggered, make sure not blocking.
                () => {
                    delete highFreqKey.blockedUntil;
                },
                // Rate limit triggered
                (err) => {
                    if (!(err instanceof rate_limit_1.RateLimitTriggeredError)) {
                        return;
                    }
                    const now = Date.now();
                    let tgtDate;
                    if (err.retryAfterDate) {
                        tgtDate = err.retryAfterDate;
                    }
                    else if (err.retryAfter) {
                        tgtDate = new Date(now + err.retryAfter * 1000);
                    }
                    if (tgtDate) {
                        const dt = tgtDate.valueOf() - now;
                        highFreqKey.blockedUntil = tgtDate;
                        setTimeout(() => {
                            // Only clear the block this timer set; a newer one may have replaced it.
                            if (highFreqKey.blockedUntil === tgtDate) {
                                delete highFreqKey.blockedUntil;
                            }
                        }, dt).unref();
                    }
                }).finally(async () => {
                    // Always asynchronously update user(wallet);
                    const user = await auth.getBrief().catch(() => undefined);
                    if (user) {
                        highFreqKey.user = user;
                    }
                });
        }
        // Assigned once results are known; read by the post-response hook below.
        let chargeAmount = 0;
        rpcReflect.finally(async () => {
            if (chargeAmount) {
                auth.reportUsage(chargeAmount, `reader-search`).catch((err) => {
                    this.logger.warn(`Unable to report usage for ${uid}`, { err: (0, lang_1.marshalErrorLike)(err) });
                });
                try {
                    const apiRoll = await apiRollPromise;
                    apiRoll.chargeAmount = chargeAmount;
                }
                catch (err) {
                    // The rate-limit promise rejected, so there is no roll record to annotate;
                    // write a standalone record carrying the charge instead.
                    await this.rateLimitControl.record({
                        uid,
                        tags: [rpcReflect.name.toUpperCase()],
                        status: api_roll_1.API_CALL_STATUS.SUCCESS,
                        chargeAmount,
                    }).save().catch((err) => {
                        this.logger.warn(`Failed to save rate limit record`, { err: (0, lang_1.marshalErrorLike)(err) });
                    });
                }
            }
        });
        let chargeAmountScaler = 1;
        if (searchEngine === 'bing') {
            chargeAmountScaler = 3;
        }
        if (variant !== 'web') {
            chargeAmountScaler = 5;
        }
        let realQuery = q;
        const queryTerms = q.split(/\s+/g).filter((x) => !!x);
        let results = await this.cachedSearch(variant, {
            provider: searchEngine,
            q,
            num,
            gl,
            // hl,
            location,
            page,
        }, crawlerOptions);
        if (fallback && !results?.length && (!page || page === 1)) {
            // Counts searches performed (the initial one included); multiplies the charge.
            let tryTimes = 1;
            const containsRTL = /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\u0590-\u05FF\uFB1D-\uFB4F\u0700-\u074F\u0780-\u07BF\u07C0-\u07FF]/.test(q);
            const lastResort = (containsRTL ? queryTerms.slice(queryTerms.length - 2) : queryTerms.slice(0, 2)).join(' ');
            const n = 4;
            const step = Math.ceil(queryTerms.length / n);
            // Progressively drop `step` more terms per attempt (from the front for RTL text,
            // from the back otherwise). The loop counter is separate from tryTimes so a
            // skipped attempt can never stall the loop.
            for (let attempt = 1; attempt < n; attempt++) {
                const delta = step * attempt;
                // Clamp at 0: a negative slice end would count from the end of the array and
                // yield a longer query than the previous attempt.
                const terms = containsRTL ?
                    queryTerms.slice(delta) :
                    queryTerms.slice(0, Math.max(0, queryTerms.length - delta));
                const query = terms.join(' ');
                if (!query) {
                    break;
                }
                if (realQuery === query) {
                    continue;
                }
                tryTimes += 1;
                realQuery = query;
                this.logger.info(`Retrying search with fallback query: "${realQuery}"`);
                results = await this.cachedSearch(variant, {
                    provider: searchEngine,
                    q: realQuery,
                    num,
                    gl,
                    // hl,
                    location,
                }, crawlerOptions);
                if (results?.length) {
                    break;
                }
            }
            if (!results?.length && realQuery.length > lastResort.length) {
                realQuery = lastResort;
                this.logger.info(`Retrying search with fallback query: "${realQuery}"`);
                tryTimes += 1;
                results = await this.cachedSearch(variant, {
                    provider: searchEngine,
                    q: realQuery,
                    num,
                    gl,
                    // hl,
                    location,
                }, crawlerOptions);
            }
            chargeAmountScaler *= tryTimes;
        }
        if (!results?.length) {
            results = [];
        }
        const finalResults = results.map((x) => this.mapToFinalResults(x));
        await Promise.all(finalResults.map((x) => this.assignGeneralMixin(x)));
        chargeAmount = this.assignChargeAmount(finalResults, chargeAmountScaler);
        (0, civ_rpc_1.assignMeta)(finalResults, {
            query: realQuery,
            fallback: realQuery === q ? undefined : realQuery,
        });
        return finalResults;
    }
    /**
     * Computes the charge as 10000 tokens per started group of 10 items, times `scaler`,
     * attaches it to `items` as `usage.tokens` metadata and returns it.
     *
     * @param {Array} items Result list (an empty list is charged 0).
     * @param {number} scaler Charge multiplier.
     * @returns {number} The charged token amount.
     */
    assignChargeAmount(items, scaler) {
        const numCharge = Math.ceil(items.length / 10) * 10000 * scaler;
        (0, civ_rpc_1.assignMeta)(items, { usage: { tokens: numCharge } });
        return numCharge;
    }
    /**
     * Fetches a 32px favicon for `domain` from Google's favicon service.
     *
     * @param {string} domain Value passed as the `domain_url` query parameter.
     * @returns {Promise<string>} A `data:image/png;base64,...` URI, or `''` when the
     *     response is not OK or the request fails (failures are logged, not thrown).
     */
    async getFavicon(domain) {
        const url = `https://www.google.com/s2/favicons?sz=32&domain_url=${domain}`;
        try {
            const response = await fetch(url);
            if (!response.ok) {
                return '';
            }
            const ab = await response.arrayBuffer();
            const buffer = Buffer.from(ab);
            const base64 = buffer.toString('base64');
            return `data:image/png;base64,${base64}`;
        }
        catch (error) {
            this.logger.warn(`Failed to get favicon base64 string`, { err: (0, lang_1.marshalErrorLike)(error) });
            return '';
        }
    }
    /**
     * Translates crawler options into the options object handed to the search providers.
     *
     * A `proxy` value ending in `+` means: allocate that proxy (without the `+`) and also
     * proxy resources. `timeout` is converted from seconds to milliseconds.
     *
     * @param opts Crawler options.
     * @returns The provider-facing options object.
     */
    async configure(opts) {
        const crawlOpts = {
            proxyUrl: opts.proxyUrl,
            cookies: opts.setCookies,
            overrideUserAgent: opts.userAgent,
            timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
            locale: opts.locale,
            referer: opts.referer,
            viewport: opts.viewport,
            proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false,
            allocProxy: opts.proxy?.endsWith('+') ? opts.proxy.slice(0, -1) : opts.proxy,
        };
        if (opts.locale) {
            crawlOpts.extraHeaders ??= {};
            crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
        }
        return crawlOpts;
    }
    /**
     * Maps a provider result entry to the public result shape: `link` becomes `url`,
     * `snippet` becomes `description`, and only whitelisted extra properties are copied.
     */
    mapToFinalResults(input) {
        const whitelistedProps = [
            'imageUrl', 'imageWidth', 'imageHeight', 'source', 'date', 'siteLinks'
        ];
        const result = {
            title: input.title,
            url: input.link,
            description: Reflect.get(input, 'snippet'),
            ...lodash_1.default.pick(input, whitelistedProps),
        };
        return result;
    }
    /**
     * Yields provider clients in the order they should be tried. A client listed twice is
     * tried twice before moving on.
     *
     * @param preference `'bing'`, `'google'`, or anything else for the default order.
     * @param variant Currently unused.
     */
    *iterProviders(preference, variant) {
        if (preference === 'bing') {
            yield this.serperBing;
            yield this.serperGoogle;
            yield this.googleSerp;
            return;
        }
        if (preference === 'google') {
            yield this.googleSerp;
            yield this.googleSerp;
            yield this.serperGoogle;
            return;
        }
        // yield variant === 'web' ? this.jinaSerp : this.serperGoogle;
        yield this.serperGoogle;
        yield this.serperGoogle;
        yield this.googleSerp;
    }
    /**
     * Runs a search with caching and provider failover.
     *
     * Flow: unless `opts.noCache` is set, look up the newest cache record for the query
     * digest and return it if younger than `cacheValidMs`. Otherwise try providers in
     * `iterProviders` order until one succeeds. A non-empty response is queued for batched
     * cache write. If every attempt fails (or the response is empty after a failure) the
     * last error is raised, unless a stale cache record exists, which is returned instead.
     *
     * Note: `provider` is removed from the passed-in `query` object (after being included
     * in the digest), so the caller's object is mutated.
     *
     * @param variant `'images'`, `'news'`, or anything else for web search.
     * @param query Provider query; `query.provider` selects the provider order.
     * @param opts Crawler options; `noCache` is read here, the rest goes through configure().
     * @returns The provider (or cached) response.
     */
    async cachedSearch(variant, query, opts) {
        const queryDigest = (0, hash_1.objHashMd5B64Of)({ ...query, variant });
        const provider = query.provider;
        Reflect.deleteProperty(query, 'provider');
        const noCache = opts.noCache;
        let cache;
        if (!noCache) {
            cache = (await searched_1.SERPResult.fromFirestoreQuery(searched_1.SERPResult.COLLECTION.where('queryDigest', '==', queryDigest)
                .orderBy('createdAt', 'desc')
                .limit(1)))[0];
            if (cache) {
                const age = Date.now() - cache.createdAt.valueOf();
                const stale = cache.createdAt.valueOf() < (Date.now() - this.cacheValidMs);
                this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for search query "${query.q}", normalized digest: ${queryDigest}, ${age}ms old`, {
                    query, digest: queryDigest, age, stale
                });
                if (!stale) {
                    return cache.response;
                }
            }
        }
        const scrappingOptions = await this.configure(opts);
        try {
            let r;
            let lastError;
            outerLoop: for (const client of this.iterProviders(provider, variant)) {
                const t0 = Date.now();
                try {
                    switch (variant) {
                        case 'images': {
                            r = await Reflect.apply(client.imageSearch, client, [query, scrappingOptions]);
                            break;
                        }
                        case 'news': {
                            r = await Reflect.apply(client.newsSearch, client, [query, scrappingOptions]);
                            break;
                        }
                        case 'web':
                        default: {
                            r = await Reflect.apply(client.webSearch, client, [query, scrappingOptions]);
                            break;
                        }
                    }
                    const dt = Date.now() - t0;
                    this.logger.info(`Search took ${dt}ms, ${client.constructor.name}(${variant})`, { searchDt: dt, variant, client: client.constructor.name });
                    // First provider that answers without throwing wins.
                    break outerLoop;
                }
                catch (err) {
                    lastError = err;
                    const dt = Date.now() - t0;
                    this.logger.warn(`Failed to do ${variant} search using ${client.constructor.name}`, { err, variant, searchDt: dt, });
                }
            }
            if (r?.length) {
                const nowDate = new Date();
                const record = searched_1.SERPResult.from({
                    query,
                    queryDigest,
                    response: r,
                    createdAt: nowDate,
                    expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
                });
                // Written later by the interval started in the constructor.
                this.batchedCaches.push(record);
            }
            else if (lastError) {
                throw lastError;
            }
            return r;
        }
        catch (err) {
            if (cache) {
                this.logger.warn(`Failed to fetch search result, but a stale cache is available. falling back to stale cache`, { err: (0, lang_1.marshalErrorLike)(err) });
                return cache.response;
            }
            throw err;
        }
    }
    /**
     * Adds a `favicon` data URI to a result when the `collect-favicon` thread-local flag is set.
     *
     * Results produced by mapToFinalResults() carry the address in `url`; `link` is still
     * accepted for raw provider entries. An address that cannot be parsed as a URL is
     * logged and skipped so one bad entry does not fail the whole response.
     *
     * @param result Result entry; mutated in place.
     */
    async assignGeneralMixin(result) {
        const collectFavicon = this.threadLocal.get('collect-favicon');
        const link = result.url ?? result.link;
        if (!collectFavicon || !link) {
            return;
        }
        let origin;
        try {
            origin = new URL(link).origin;
        }
        catch (err) {
            this.logger.warn(`Unable to resolve favicon origin for search result`, { err: (0, lang_1.marshalErrorLike)(err) });
            return;
        }
        Reflect.set(result, 'favicon', await this.getFavicon(origin));
    }
};
exports.SerpHost = SerpHost;
// Decorates SerpHost.prototype.search.
// - Two Method decorators are applied: one named 'searchIndex' bound to path '/', and one
//   with no explicit name or path. Both accept GET and POST.
// - __param entries bind request inputs to parameter positions: 0 = RPC reflection,
//   1 = context, 4..12 = named request parameters. Positions 2 and 3 (crawlerOptions, auth)
//   have no __param entry; presumably the registry resolves them from the
//   design:paramtypes metadata below - confirm against the registry implementation.
__decorate([
    (0, registry_1.Method)({
        name: 'searchIndex',
        ext: {
            http: {
                action: ['get', 'post'],
                path: '/'
            }
        },
        tags: ['search'],
        returnType: [String, transform_server_event_stream_1.OutputServerEventStream, civ_rpc_1.RawString],
    }),
    (0, registry_1.Method)({
        ext: {
            http: {
                action: ['get', 'post'],
            }
        },
        tags: ['search'],
        returnType: [String, transform_server_event_stream_1.OutputServerEventStream, civ_rpc_1.RawString],
    }),
    __param(0, (0, registry_1.RPCReflect)()),
    __param(1, (0, registry_1.Ctx)()),
    // variant
    __param(4, (0, registry_1.Param)('type', { type: new Set(['web', 'images', 'news']), default: 'web' })),
    // q
    __param(5, (0, registry_1.Param)('q')),
    // searchEngine
    __param(6, (0, registry_1.Param)('provider', { type: new Set(['google', 'bing']) })),
    // num: 0..20 inclusive
    __param(7, (0, registry_1.Param)('num', { validate: (v) => v >= 0 && v <= 20 })),
    // gl: must be a known country code (case-insensitive)
    __param(8, (0, registry_1.Param)('gl', { validate: (v) => WORLD_COUNTRY_CODES.includes(v?.toLowerCase()) })),
    // hl: must match a known language code exactly
    __param(9, (0, registry_1.Param)('hl', { validate: (v) => serper_search_1.WORLD_LANGUAGES.some(l => l.code === v) })),
    __param(10, (0, registry_1.Param)('location')),
    __param(11, (0, registry_1.Param)('page')),
    __param(12, (0, registry_1.Param)('fallback')),
    __metadata("design:type", Function),
    // One entry per parameter of search(), in declaration order. Types that may be absent
    // at runtime fall back to Object.
    __metadata("design:paramtypes", [typeof (_d = typeof civ_rpc_1.RPCReflection !== "undefined" && civ_rpc_1.RPCReflection) === "function" ? _d : Object, typeof (_e = typeof registry_1.Context !== "undefined" && registry_1.Context) === "function" ? _e : Object, crawler_options_1.CrawlerOptions,
        jina_embeddings_auth_1.JinaEmbeddingsAuthDTO, String, String, String, Number, String, String, String, Number, Boolean]),
    __metadata("design:returntype", Promise)
], SerpHost.prototype, "search", null);
| exports.SerpHost = SerpHost = __decorate([ | |
| (0, tsyringe_1.singleton)(), | |
| __metadata("design:paramtypes", [logger_1.GlobalLogger, typeof (_a = typeof rate_limit_1.RateLimitControl !== "undefined" && rate_limit_1.RateLimitControl) === "function" ? _a : Object, async_context_1.AsyncLocalContext, | |
| google_1.GoogleSERP, typeof (_b = typeof serper_1.SerperGoogleSearchService !== "undefined" && serper_1.SerperGoogleSearchService) === "function" ? _b : Object, typeof (_c = typeof serper_1.SerperBingSearchService !== "undefined" && serper_1.SerperBingSearchService) === "function" ? _c : Object, internal_1.InternalJinaSerpService]) | |
| ], SerpHost); | |
| //# sourceMappingURL=serp.js.map |