web_reader / build /api /serp.js
Mohammad Shahid
Include pre-built files for HF deployment
f316cce
"use strict";
/**
 * TypeScript decorator helper: applies `decorators` (last to first) to a
 * class (2 arguments), a property (3 arguments) or a member with a property
 * descriptor (4 arguments). Delegates to `Reflect.decorate` when that exists.
 *
 * Returns the (possibly replaced) class or descriptor; in the 4-argument form
 * a truthy result is also installed on `target[key]`.
 */
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
    const argCount = arguments.length;
    let result;
    if (argCount < 3) {
        // Class decoration: the thing being decorated is the target itself.
        result = target;
    }
    else {
        if (desc === null) {
            // A null descriptor means "look up the current one".
            desc = Object.getOwnPropertyDescriptor(target, key);
        }
        result = desc;
    }
    if (typeof Reflect === "object" && typeof Reflect.decorate === "function") {
        result = Reflect.decorate(decorators, target, key, desc);
    }
    else {
        for (let index = decorators.length - 1; index >= 0; index--) {
            const decorator = decorators[index];
            if (!decorator) {
                continue;
            }
            let produced;
            if (argCount < 3) {
                produced = decorator(result);
            }
            else if (argCount > 3) {
                produced = decorator(target, key, result);
            }
            else {
                produced = decorator(target, key);
            }
            // A decorator that returns nothing leaves the previous result in place.
            result = produced || result;
        }
    }
    if (argCount > 3 && result) {
        Object.defineProperty(target, key, result);
    }
    return result;
};
/**
 * TypeScript metadata helper: returns `Reflect.metadata(k, v)` when a
 * `Reflect.metadata` function is available, otherwise returns `undefined`.
 */
var __metadata = (this && this.__metadata) || function (k, v) {
    const hasReflectMetadata = typeof Reflect === "object" && typeof Reflect.metadata === "function";
    return hasReflectMetadata ? Reflect.metadata(k, v) : undefined;
};
/**
 * TypeScript parameter-decorator helper: binds `paramIndex` so the returned
 * function can be applied like a member decorator, forwarding
 * `(target, key, paramIndex)` to `decorator`.
 */
var __param = (this && this.__param) || function (paramIndex, decorator) {
    function applyToParameter(target, key) {
        decorator(target, key, paramIndex);
    }
    return applyToParameter;
};
/**
 * TypeScript interop helper: returns `mod` unchanged when it is flagged as an
 * ES module (`__esModule`), otherwise wraps it as `{ default: mod }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { default: mod };
};
var _a, _b, _c, _d, _e;
Object.defineProperty(exports, "__esModule", { value: true });
exports.SerpHost = void 0;
const tsyringe_1 = require("tsyringe");
const civ_rpc_1 = require("civkit/civ-rpc");
const lang_1 = require("civkit/lang");
const lodash_1 = __importDefault(require("lodash"));
const rate_limit_1 = require("../shared/services/rate-limit");
const logger_1 = require("../services/logger");
const async_context_1 = require("../services/async-context");
const registry_1 = require("../services/registry");
const transform_server_event_stream_1 = require("../lib/transform-server-event-stream");
const jina_embeddings_auth_1 = require("../dto/jina-embeddings-auth");
const errors_1 = require("../services/errors");
const serper_search_1 = require("../shared/3rd-party/serper-search");
const google_1 = require("../services/serp/google");
const crawler_options_1 = require("../dto/crawler-options");
const hash_1 = require("civkit/hash");
const searched_1 = require("../db/searched");
const serper_1 = require("../services/serp/serper");
const lru_cache_1 = require("lru-cache");
const api_roll_1 = require("../shared/db/api-roll");
const internal_1 = require("../services/serp/internal");
// Lower-cased keys of WORLD_COUNTRIES; the `gl` parameter validator compares against these case-insensitively.
const WORLD_COUNTRY_CODES = Array.from(Object.keys(serper_search_1.WORLD_COUNTRIES), (countryCode) => countryCode.toLowerCase());
// Prototype for index objects: provides a plain-text rendering of the object's own entries.
const indexProto = {
    /**
     * Render each own key/value pair as `[Label] value`, one per line, with a
     * trailing newline. An entry with an empty key renders as an empty line.
     */
    toString() {
        const renderEntry = ([label, text]) => {
            if (!label) {
                return '';
            }
            const heading = lodash_1.default.upperFirst(lodash_1.default.lowerCase(label));
            return `[${heading}] ${text}`;
        };
        const lines = (0, lodash_1.default)(this)
            .toPairs()
            .map(renderEntry)
            .value();
        return `${lines.join('\n')}\n`;
    }
};
/**
 * RPC host for the search (SERP) endpoint.
 *
 * Handles authentication and balance checks, per-user rate limiting (with an
 * in-memory fast path for high-frequency keys), provider fail-over, a
 * Firestore-backed result cache written in periodic batches, optional
 * fallback to shortened queries, and usage charging.
 */
let SerpHost = class SerpHost extends civ_rpc_1.RPCHost {
    /**
     * Build the index object returned for requests to `/` that carry no query.
     *
     * The object inherits `toString` from `indexProto`, so the caller can
     * return it as-is or stringify it to plain text.
     *
     * @param ctx - Request context; only `ctx.origin` is read here.
     * @param auth - Auth DTO; when `auth.user` is set, identity and balance are included.
     * @returns The populated index object.
     */
    async getIndex(ctx, auth) {
        const indexObject = Object.create(indexProto);
        Object.assign(indexObject, {
            usage1: 'https://r.jina.ai/YOUR_URL',
            usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY',
            usage3: `${ctx.origin}/?q=YOUR_SEARCH_QUERY`,
            homepage: 'https://jina.ai/reader',
        });
        if (auth && auth.user) {
            // The empty key renders as a blank separator line in indexProto.toString().
            indexObject[''] = undefined;
            indexObject.authenticatedAs = `${auth.user.user_id} (${auth.user.full_name})`;
            indexObject.balanceLeft = auth.user.wallet.total_balance;
        }
        else {
            indexObject.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';
        }
        return indexObject;
    }
    /**
     * Stores the injected services, sets cache/timing constants and starts the
     * periodic flush of queued cache records to Firestore.
     */
    constructor(globalLogger, rateLimitControl, threadLocal, googleSerp, serperGoogle, serperBing, jinaSerp) {
        super(...arguments);
        this.globalLogger = globalLogger;
        this.rateLimitControl = rateLimitControl;
        this.threadLocal = threadLocal;
        this.googleSerp = googleSerp;
        this.serperGoogle = serperGoogle;
        this.serperBing = serperBing;
        this.jinaSerp = jinaSerp;
        this.logger = this.globalLogger.child({ service: this.constructor.name });
        // 7 days: used as the expireAt offset of stored cache records.
        this.cacheRetentionMs = 1000 * 3600 * 24 * 7;
        // 1 hour: cache records older than this are treated as stale.
        this.cacheValidMs = 1000 * 3600;
        // 1 day.
        this.pageCacheToleranceMs = 1000 * 3600 * 24;
        this.reasonableDelayMs = 15_000;
        this.targetResultCount = 5;
        // Bearer token -> { user, blockedUntil? } for keys on the high-frequency path.
        this.highFreqKeyCache = new lru_cache_1.LRUCache({
            max: 256,
            ttl: 60 * 60 * 1000,
            updateAgeOnGet: false,
            updateAgeOnHas: false,
        });
        // Cache records waiting to be written by the interval below.
        this.batchedCaches = [];
        // Flush roughly every 10-11s; the random offset is fixed once per instance.
        // unref() keeps this timer from holding the process open.
        setInterval(() => {
            const thisBatch = this.batchedCaches;
            this.batchedCaches = [];
            if (!thisBatch.length) {
                return;
            }
            const batch = searched_1.SERPResult.DB.batch();
            for (const x of thisBatch) {
                batch.set(searched_1.SERPResult.COLLECTION.doc(), x.degradeForFireStore());
            }
            batch.commit()
                .then(() => {
                    this.logger.debug(`Saved ${thisBatch.length} caches by batch`);
                })
                .catch((err) => {
                    this.logger.warn(`Failed to cache search result in batch`, { err });
                });
        }, 1000 * 10 + Math.round(1000 * Math.random())).unref();
    }
    /** Waits for dependencies, then emits `ready`. */
    async init() {
        await this.dependencyReady();
        this.emit('ready');
    }
    /**
     * Search entry point (bound to the HTTP routes by the decorators below the class).
     *
     * @param rpcReflect - RPC reflection; used for rate limiting and the post-request usage hook.
     * @param ctx - Request context (`path`, `accepts`, `origin` are read).
     * @param crawlerOptions - Crawler options forwarded to `cachedSearch`.
     * @param auth - Auth DTO for the caller.
     * @param variant - Search type: 'web', 'images' or 'news'.
     * @param q - Query string; when empty on path `/` the index is returned instead.
     * @param searchEngine - Preferred provider ('google' or 'bing'), optional.
     * @param num - Requested number of results.
     * @param gl - Country code.
     * @param _hl - Language code; accepted but not forwarded to providers.
     * @param location - Location hint.
     * @param page - Result page; fallback queries only run for the first page.
     * @param fallback - When truthy, retry with progressively shortened queries on empty results.
     * @returns Array of mapped results carrying `query`, `fallback` and `usage` meta.
     * @throws ParamValidationError when `q` is missing on a non-index path.
     * @throws InsufficientBalanceError when the wallet balance is not positive.
     */
    async search(rpcReflect, ctx, crawlerOptions, auth, variant, q, searchEngine, num, gl, _hl, location, page, fallback) {
        const authToken = auth.bearerToken;
        // A single get() instead of has()+get(): the entry could expire between the two calls.
        const highFreqKey = authToken ? this.highFreqKeyCache.get(authToken) : undefined;
        if (highFreqKey) {
            auth.user = highFreqKey.user;
            auth.uid = highFreqKey.user?.user_id;
        }
        const uid = await auth.solveUID();
        if (!q) {
            if (ctx.path === '/') {
                const indexObject = await this.getIndex(ctx, auth);
                if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
                    return indexObject;
                }
                return (0, civ_rpc_1.assignTransferProtocolMeta)(`${indexObject}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
            }
            throw new civ_rpc_1.ParamValidationError({
                path: 'q',
                message: `Required but not provided`
            });
        }
        // Return content by default
        const user = await auth.assertUser();
        if (!(user.wallet.total_balance > 0)) {
            throw new errors_1.InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
        }
        if (highFreqKey?.blockedUntil) {
            const now = new Date();
            const blockedTimeRemaining = (highFreqKey.blockedUntil.valueOf() - now.valueOf());
            if (blockedTimeRemaining > 0) {
                // Enforcement is currently disabled: the request is only logged, not rejected.
                this.logger.warn(`Rate limit triggered for ${uid}, this request should have been blocked`);
                // throw RateLimitTriggeredError.from({
                //     message: `Per UID rate limit exceeded (async)`,
                //     retryAfter: Math.ceil(blockedTimeRemaining / 1000),
                // });
            }
        }
        const PREMIUM_KEY_LIMIT = 400;
        const rateLimitPolicy = auth.getRateLimits('SEARCH') || [
            Number.parseInt(user.metadata?.speed_level, 10) >= 2 ?
                rate_limit_1.RateLimitDesc.from({
                    occurrence: PREMIUM_KEY_LIMIT,
                    periodSeconds: 60
                }) :
                rate_limit_1.RateLimitDesc.from({
                    occurrence: 40,
                    periodSeconds: 60
                })
        ];
        const apiRollPromise = this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['SEARCH'], ...rateLimitPolicy);
        if (!highFreqKey) {
            // Normal path: wait for the rate-limit check before doing any work.
            await apiRollPromise;
            const isHighFrequency = rateLimitPolicy.some((x) => {
                const rpm = x.occurrence / (x.periodSeconds / 60);
                return rpm >= PREMIUM_KEY_LIMIT;
            });
            if (isHighFrequency && authToken) {
                // Subsequent requests with this token take the high freq key path below.
                this.highFreqKeyCache.set(authToken, {
                    user,
                });
            }
        }
        else {
            // High freq key path: the rate-limit check is not awaited; its outcome
            // only updates the cached entry for later requests.
            apiRollPromise.then(
            // Rate limit not triggered, make sure not blocking.
            () => {
                delete highFreqKey.blockedUntil;
            },
            // Rate limit triggered
            (err) => {
                if (!(err instanceof rate_limit_1.RateLimitTriggeredError)) {
                    return;
                }
                const now = Date.now();
                let tgtDate;
                if (err.retryAfterDate) {
                    tgtDate = err.retryAfterDate;
                }
                else if (err.retryAfter) {
                    tgtDate = new Date(now + err.retryAfter * 1000);
                }
                if (tgtDate) {
                    const dt = tgtDate.valueOf() - now;
                    highFreqKey.blockedUntil = tgtDate;
                    // Clear the block when it elapses, unless a newer block replaced it.
                    setTimeout(() => {
                        if (highFreqKey.blockedUntil === tgtDate) {
                            delete highFreqKey.blockedUntil;
                        }
                    }, dt).unref();
                }
            }).finally(async () => {
                // Always asynchronously update user(wallet);
                const user = await auth.getBrief().catch(() => undefined);
                if (user) {
                    highFreqKey.user = user;
                }
            });
        }
        let chargeAmount = 0;
        // Runs when the RPC finishes; chargeAmount is assigned further down, once results are known.
        rpcReflect.finally(async () => {
            if (chargeAmount) {
                auth.reportUsage(chargeAmount, `reader-search`).catch((err) => {
                    this.logger.warn(`Unable to report usage for ${uid}`, { err: (0, lang_1.marshalErrorLike)(err) });
                });
                try {
                    const apiRoll = await apiRollPromise;
                    apiRoll.chargeAmount = chargeAmount;
                }
                catch (err) {
                    // The roll promise rejected (possible on the high freq key path):
                    // save a standalone record so the charge is still tracked.
                    await this.rateLimitControl.record({
                        uid,
                        tags: [rpcReflect.name.toUpperCase()],
                        status: api_roll_1.API_CALL_STATUS.SUCCESS,
                        chargeAmount,
                    }).save().catch((err) => {
                        this.logger.warn(`Failed to save rate limit record`, { err: (0, lang_1.marshalErrorLike)(err) });
                    });
                }
            }
        });
        // Charge multiplier: 3 for bing, 5 for any non-web variant (the latter wins).
        let chargeAmountScaler = 1;
        if (searchEngine === 'bing') {
            chargeAmountScaler = 3;
        }
        if (variant !== 'web') {
            chargeAmountScaler = 5;
        }
        let realQuery = q;
        const queryTerms = q.split(/\s+/g).filter((x) => !!x);
        let results = await this.cachedSearch(variant, {
            provider: searchEngine,
            q,
            num,
            gl,
            // hl,
            location,
            page,
        }, crawlerOptions);
        if (fallback && !results?.length && (!page || page === 1)) {
            let tryTimes = 1;
            const containsRTL = /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\u0590-\u05FF\uFB1D-\uFB4F\u0700-\u074F\u0780-\u07BF\u07C0-\u07FF]/.test(q);
            // Final attempt: two terms, taken from the end for RTL scripts, otherwise from the start.
            const lastResort = (containsRTL ? queryTerms.slice(queryTerms.length - 2) : queryTerms.slice(0, 2)).join(' ');
            const n = 4;
            const termsPerStep = Math.ceil(queryTerms.length / n);
            while (tryTimes < n) {
                const delta = termsPerStep * tryTimes;
                // Clamp at 0: a negative end index would make slice() drop terms from
                // the end instead of yielding an empty list, producing a longer query.
                const keepCount = Math.max(0, queryTerms.length - delta);
                const terms = containsRTL ? queryTerms.slice(delta) : queryTerms.slice(0, keepCount);
                const query = terms.join(' ');
                // Stop when nothing is left, or when the query did not change: delta
                // depends only on tryTimes, so retrying without progress would never end.
                if (!query || realQuery === query) {
                    break;
                }
                tryTimes += 1;
                realQuery = query;
                this.logger.info(`Retrying search with fallback query: "${realQuery}"`);
                results = await this.cachedSearch(variant, {
                    provider: searchEngine,
                    q: realQuery,
                    num,
                    gl,
                    // hl,
                    location,
                }, crawlerOptions);
                if (results?.length) {
                    break;
                }
            }
            if (!results?.length && realQuery.length > lastResort.length) {
                realQuery = lastResort;
                this.logger.info(`Retrying search with fallback query: "${realQuery}"`);
                tryTimes += 1;
                results = await this.cachedSearch(variant, {
                    provider: searchEngine,
                    q: realQuery,
                    num,
                    gl,
                    // hl,
                    location,
                }, crawlerOptions);
            }
            // Every attempted search is charged.
            chargeAmountScaler *= tryTimes;
        }
        if (!results?.length) {
            results = [];
        }
        const finalResults = results.map((x) => this.mapToFinalResults(x));
        await Promise.all(finalResults.map((x) => this.assignGeneralMixin(x)));
        chargeAmount = this.assignChargeAmount(finalResults, chargeAmountScaler);
        (0, civ_rpc_1.assignMeta)(finalResults, {
            query: realQuery,
            fallback: realQuery === q ? undefined : realQuery,
        });
        return finalResults;
    }
    /**
     * Compute the charge for a result list and attach it as `usage.tokens` meta.
     *
     * @param items - Final result array.
     * @param scaler - Charge multiplier.
     * @returns 10000 tokens per started group of 10 results, times `scaler` (0 for an empty list).
     */
    assignChargeAmount(items, scaler) {
        const numCharge = Math.ceil(items.length / 10) * 10000 * scaler;
        (0, civ_rpc_1.assignMeta)(items, { usage: { tokens: numCharge } });
        return numCharge;
    }
    /**
     * Fetch a 32px favicon from Google's s2 favicon endpoint.
     *
     * @param domain - Value sent as the `domain_url` query parameter.
     * @returns A `data:image/png;base64,...` URI, or '' when the request fails or is not OK.
     */
    async getFavicon(domain) {
        const url = `https://www.google.com/s2/favicons?sz=32&domain_url=${domain}`;
        try {
            const response = await fetch(url);
            if (!response.ok) {
                return '';
            }
            const ab = await response.arrayBuffer();
            const buffer = Buffer.from(ab);
            const base64 = buffer.toString('base64');
            return `data:image/png;base64,${base64}`;
        }
        catch (error) {
            // Best effort: a missing favicon must not fail the search.
            this.logger.warn(`Failed to get favicon base64 string`, { err: (0, lang_1.marshalErrorLike)(error) });
            return '';
        }
    }
    /**
     * Translate crawler options into the options object passed to search providers.
     *
     * @param opts - Crawler options.
     * @returns Provider options; `Accept-Language` is added when a locale is set.
     */
    async configure(opts) {
        // A trailing '+' on opts.proxy turns on proxyResources and is stripped from allocProxy.
        const proxyHasPlusSuffix = opts.proxy?.endsWith('+');
        const crawlOpts = {
            proxyUrl: opts.proxyUrl,
            cookies: opts.setCookies,
            overrideUserAgent: opts.userAgent,
            timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
            locale: opts.locale,
            referer: opts.referer,
            viewport: opts.viewport,
            proxyResources: (opts.proxyUrl || proxyHasPlusSuffix) ? true : false,
            allocProxy: proxyHasPlusSuffix ? opts.proxy.slice(0, -1) : opts.proxy,
        };
        if (opts.locale) {
            crawlOpts.extraHeaders ??= {};
            crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
        }
        return crawlOpts;
    }
    /**
     * Map a provider result entry to the public result shape.
     *
     * @param input - Provider entry (`title`, `link`, `snippet` plus optional extras).
     * @returns `{ title, url, description }` plus any whitelisted extra properties present.
     */
    mapToFinalResults(input) {
        const whitelistedProps = [
            'imageUrl', 'imageWidth', 'imageHeight', 'source', 'date', 'siteLinks'
        ];
        const result = {
            title: input.title,
            url: input.link,
            description: Reflect.get(input, 'snippet'),
            ...lodash_1.default.pick(input, whitelistedProps),
        };
        return result;
    }
    /**
     * Yield provider clients in the order they should be tried.
     * A client yielded twice in a row is tried again after its first failure.
     *
     * @param preference - 'bing', 'google' or anything else for the default order.
     * @param variant - Search variant; currently not used for ordering.
     */
    *iterProviders(preference, variant) {
        if (preference === 'bing') {
            yield this.serperBing;
            yield this.serperGoogle;
            yield this.googleSerp;
            return;
        }
        if (preference === 'google') {
            yield this.googleSerp;
            yield this.googleSerp;
            yield this.serperGoogle;
            return;
        }
        // yield variant === 'web' ? this.jinaSerp : this.serperGoogle;
        yield this.serperGoogle;
        yield this.serperGoogle;
        yield this.googleSerp;
    }
    /**
     * Run a search through the cache and the provider chain.
     *
     * A cache record newer than `cacheValidMs` is returned directly. Otherwise
     * the providers from `iterProviders` are tried in order until one returns
     * without throwing; a non-empty result is queued for the batched cache
     * write. If the search fails and a stale cache record exists, the stale
     * response is returned instead.
     *
     * @param variant - 'images', 'news' or anything else for web search.
     * @param query - Search parameters; `provider` selects the provider order and is not forwarded.
     * @param opts - Crawler options (`noCache` skips the cache lookup).
     * @returns The provider (or cached) result list; may be empty or undefined.
     * @throws The last provider error when no results were obtained and no stale cache exists.
     */
    async cachedSearch(variant, query, opts) {
        // The digest covers the full query (provider included) plus the variant.
        const queryDigest = (0, hash_1.objHashMd5B64Of)({ ...query, variant });
        // Split the provider off a copy rather than deleting it from the caller's object.
        const { provider, ...providerQuery } = query;
        const noCache = opts.noCache;
        let cache;
        if (!noCache) {
            cache = (await searched_1.SERPResult.fromFirestoreQuery(searched_1.SERPResult.COLLECTION.where('queryDigest', '==', queryDigest)
                .orderBy('createdAt', 'desc')
                .limit(1)))[0];
            if (cache) {
                const age = Date.now() - cache.createdAt.valueOf();
                const stale = cache.createdAt.valueOf() < (Date.now() - this.cacheValidMs);
                this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for search query "${providerQuery.q}", normalized digest: ${queryDigest}, ${age}ms old`, {
                    query: providerQuery, digest: queryDigest, age, stale
                });
                if (!stale) {
                    return cache.response;
                }
            }
        }
        const scrappingOptions = await this.configure(opts);
        try {
            let r;
            let lastError;
            outerLoop: for (const client of this.iterProviders(provider, variant)) {
                const t0 = Date.now();
                try {
                    switch (variant) {
                        case 'images': {
                            r = await Reflect.apply(client.imageSearch, client, [providerQuery, scrappingOptions]);
                            break;
                        }
                        case 'news': {
                            r = await Reflect.apply(client.newsSearch, client, [providerQuery, scrappingOptions]);
                            break;
                        }
                        case 'web':
                        default: {
                            r = await Reflect.apply(client.webSearch, client, [providerQuery, scrappingOptions]);
                            break;
                        }
                    }
                    const dt = Date.now() - t0;
                    this.logger.info(`Search took ${dt}ms, ${client.constructor.name}(${variant})`, { searchDt: dt, variant, client: client.constructor.name });
                    // First provider that answers without throwing wins, even if its result is empty.
                    break outerLoop;
                }
                catch (err) {
                    lastError = err;
                    const dt = Date.now() - t0;
                    this.logger.warn(`Failed to do ${variant} search using ${client.constructor.name}`, { err, variant, searchDt: dt, });
                }
            }
            if (r?.length) {
                const nowDate = new Date();
                const record = searched_1.SERPResult.from({
                    query: providerQuery,
                    queryDigest,
                    response: r,
                    createdAt: nowDate,
                    expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
                });
                // Written later by the interval set up in the constructor.
                this.batchedCaches.push(record);
            }
            else if (lastError) {
                // NOTE(review): this also throws when an earlier provider failed and a later
                // one succeeded with an empty result - confirm that is intended.
                throw lastError;
            }
            return r;
        }
        catch (err) {
            if (cache) {
                this.logger.warn(`Failed to fetch search result, but a stale cache is available. falling back to stale cache`, { err: (0, lang_1.marshalErrorLike)(err) });
                return cache.response;
            }
            throw err;
        }
    }
    /**
     * Attach a `favicon` data URI to a result when the `collect-favicon`
     * context flag is set.
     *
     * Accepts both raw provider entries (`link`) and mapped results (`url`);
     * `search` passes mapped results, which only carry `url`. A link that
     * cannot be parsed is skipped so one bad entry does not fail the request.
     *
     * @param result - Result entry, mutated in place.
     */
    async assignGeneralMixin(result) {
        const collectFavicon = this.threadLocal.get('collect-favicon');
        if (!collectFavicon) {
            return;
        }
        const link = result.link || result.url;
        if (!link) {
            return;
        }
        let origin;
        try {
            origin = new URL(link).origin;
        }
        catch (err) {
            this.logger.warn(`Unable to parse result link for favicon`, { err: (0, lang_1.marshalErrorLike)(err) });
            return;
        }
        Reflect.set(result, 'favicon', await this.getFavicon(origin));
    }
};
exports.SerpHost = SerpHost;
// Decorate SerpHost.prototype.search. Two Method registrations are applied to the
// same method: one named 'searchIndex' bound to HTTP path '/', and one without an
// explicit name or path. Both accept GET and POST and carry the 'search' tag.
__decorate([
(0, registry_1.Method)({
name: 'searchIndex',
ext: {
http: {
action: ['get', 'post'],
path: '/'
}
},
tags: ['search'],
returnType: [String, transform_server_event_stream_1.OutputServerEventStream, civ_rpc_1.RawString],
}),
(0, registry_1.Method)({
ext: {
http: {
action: ['get', 'post'],
}
},
tags: ['search'],
returnType: [String, transform_server_event_stream_1.OutputServerEventStream, civ_rpc_1.RawString],
}),
// Parameter bindings by positional index of search(). Indexes 2 and 3
// (crawlerOptions, auth) have no explicit decorator here; their types appear
// in the design:paramtypes metadata below.
__param(0, (0, registry_1.RPCReflect)()),
__param(1, (0, registry_1.Ctx)()),
// variant <- 'type' (web | images | news, default 'web')
__param(4, (0, registry_1.Param)('type', { type: new Set(['web', 'images', 'news']), default: 'web' })),
__param(5, (0, registry_1.Param)('q')),
// searchEngine <- 'provider' (google | bing)
__param(6, (0, registry_1.Param)('provider', { type: new Set(['google', 'bing']) })),
// num must be within 0..20
__param(7, (0, registry_1.Param)('num', { validate: (v) => v >= 0 && v <= 20 })),
// gl is matched case-insensitively against WORLD_COUNTRY_CODES
__param(8, (0, registry_1.Param)('gl', { validate: (v) => WORLD_COUNTRY_CODES.includes(v?.toLowerCase()) })),
// hl must equal the code of one of WORLD_LANGUAGES (exact match)
__param(9, (0, registry_1.Param)('hl', { validate: (v) => serper_search_1.WORLD_LANGUAGES.some(l => l.code === v) })),
__param(10, (0, registry_1.Param)('location')),
__param(11, (0, registry_1.Param)('page')),
__param(12, (0, registry_1.Param)('fallback')),
// Compiler-emitted design-time type metadata for search(); types that are not
// functions at runtime fall back to Object.
__metadata("design:type", Function),
__metadata("design:paramtypes", [typeof (_d = typeof civ_rpc_1.RPCReflection !== "undefined" && civ_rpc_1.RPCReflection) === "function" ? _d : Object, typeof (_e = typeof registry_1.Context !== "undefined" && registry_1.Context) === "function" ? _e : Object, crawler_options_1.CrawlerOptions,
jina_embeddings_auth_1.JinaEmbeddingsAuthDTO, String, String, String, Number, String, String, String, Number, Boolean]),
__metadata("design:returntype", Promise)
], SerpHost.prototype, "search", null);
// Decorate the class itself: tsyringe singleton, plus constructor parameter
// types in the same order as the constructor's arguments. The export is
// reassigned to the decorated class.
exports.SerpHost = SerpHost = __decorate([
(0, tsyringe_1.singleton)(),
__metadata("design:paramtypes", [logger_1.GlobalLogger, typeof (_a = typeof rate_limit_1.RateLimitControl !== "undefined" && rate_limit_1.RateLimitControl) === "function" ? _a : Object, async_context_1.AsyncLocalContext,
google_1.GoogleSERP, typeof (_b = typeof serper_1.SerperGoogleSearchService !== "undefined" && serper_1.SerperGoogleSearchService) === "function" ? _b : Object, typeof (_c = typeof serper_1.SerperBingSearchService !== "undefined" && serper_1.SerperBingSearchService) === "function" ? _c : Object, internal_1.InternalJinaSerpService])
], SerpHost);
//# sourceMappingURL=serp.js.map