Spaces:
Build error
Build error
fix
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -776,7 +776,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 776 |
|
| 777 |
getUrlDigest(urlToCrawl: URL) {
|
| 778 |
const normalizedURL = new URL(urlToCrawl);
|
| 779 |
-
normalizedURL.hash = '';
|
|
|
|
|
|
|
| 780 |
const normalizedUrl = normalizedURL.toString().toLowerCase();
|
| 781 |
const digest = md5Hasher.hash(normalizedUrl.toString());
|
| 782 |
|
|
|
|
| 776 |
|
| 777 |
getUrlDigest(urlToCrawl: URL) {
|
| 778 |
const normalizedURL = new URL(urlToCrawl);
|
| 779 |
+
if (!normalizedURL.hash.startsWith('/')) {
|
| 780 |
+
normalizedURL.hash = '';
|
| 781 |
+
}
|
| 782 |
const normalizedUrl = normalizedURL.toString().toLowerCase();
|
| 783 |
const digest = md5Hasher.hash(normalizedUrl.toString());
|
| 784 |
|
backend/functions/src/cloud-functions/searcher.ts
CHANGED
|
@@ -5,7 +5,7 @@ import {
|
|
| 5 |
objHashMd5B64Of,
|
| 6 |
} from 'civkit';
|
| 7 |
import { singleton } from 'tsyringe';
|
| 8 |
-
import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
| 9 |
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
| 10 |
import _ from 'lodash';
|
| 11 |
import { Request, Response } from 'express';
|
|
@@ -83,6 +83,8 @@ export class SearcherHost extends RPCHost {
|
|
| 83 |
res: Response,
|
| 84 |
},
|
| 85 |
auth: JinaEmbeddingsAuthDTO,
|
|
|
|
|
|
|
| 86 |
crawlerOptions: CrawlerOptions,
|
| 87 |
braveSearchExplicitOperators: BraveSearchExplicitOperatorsDto,
|
| 88 |
) {
|
|
@@ -157,7 +159,7 @@ export class SearcherHost extends RPCHost {
|
|
| 157 |
const searchQuery = braveSearchExplicitOperators.addTo(ctx.req.path.slice(1));
|
| 158 |
const r = await this.cachedWebSearch({
|
| 159 |
q: searchQuery,
|
| 160 |
-
count:
|
| 161 |
}, crawlerOptions.noCache);
|
| 162 |
|
| 163 |
if (!r.web?.results.length) {
|
|
@@ -226,7 +228,7 @@ export class SearcherHost extends RPCHost {
|
|
| 226 |
if (_.some(scrapped, (x) => this.pageQualified(x))) {
|
| 227 |
setEarlyReturnTimer();
|
| 228 |
}
|
| 229 |
-
if (!this.searchResultsQualified(scrapped)) {
|
| 230 |
continue;
|
| 231 |
}
|
| 232 |
if (earlyReturnTimer) {
|
|
@@ -274,7 +276,7 @@ export class SearcherHost extends RPCHost {
|
|
| 274 |
setEarlyReturnTimer();
|
| 275 |
}
|
| 276 |
|
| 277 |
-
if (!this.searchResultsQualified(scrapped)) {
|
| 278 |
continue;
|
| 279 |
}
|
| 280 |
|
|
@@ -425,8 +427,8 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
|
|
| 425 |
formattedPage.html;
|
| 426 |
}
|
| 427 |
|
| 428 |
-
searchResultsQualified(results: FormattedPage[]) {
|
| 429 |
-
return _.every(results, (x) => this.pageQualified(x)) && results.length >= this.targetResultCount;
|
| 430 |
}
|
| 431 |
|
| 432 |
async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) {
|
|
|
|
| 5 |
objHashMd5B64Of,
|
| 6 |
} from 'civkit';
|
| 7 |
import { singleton } from 'tsyringe';
|
| 8 |
+
import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, Param, RPCReflect } from '../shared';
|
| 9 |
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
| 10 |
import _ from 'lodash';
|
| 11 |
import { Request, Response } from 'express';
|
|
|
|
| 83 |
res: Response,
|
| 84 |
},
|
| 85 |
auth: JinaEmbeddingsAuthDTO,
|
| 86 |
+
@Param('count', { default: 5, validate: (v) => v >= 3 && v <= 10 })
|
| 87 |
+
count: number,
|
| 88 |
crawlerOptions: CrawlerOptions,
|
| 89 |
braveSearchExplicitOperators: BraveSearchExplicitOperatorsDto,
|
| 90 |
) {
|
|
|
|
| 159 |
const searchQuery = braveSearchExplicitOperators.addTo(ctx.req.path.slice(1));
|
| 160 |
const r = await this.cachedWebSearch({
|
| 161 |
q: searchQuery,
|
| 162 |
+
count: Math.floor(count * 2)
|
| 163 |
}, crawlerOptions.noCache);
|
| 164 |
|
| 165 |
if (!r.web?.results.length) {
|
|
|
|
| 228 |
if (_.some(scrapped, (x) => this.pageQualified(x))) {
|
| 229 |
setEarlyReturnTimer();
|
| 230 |
}
|
| 231 |
+
if (!this.searchResultsQualified(scrapped, count)) {
|
| 232 |
continue;
|
| 233 |
}
|
| 234 |
if (earlyReturnTimer) {
|
|
|
|
| 276 |
setEarlyReturnTimer();
|
| 277 |
}
|
| 278 |
|
| 279 |
+
if (!this.searchResultsQualified(scrapped, count)) {
|
| 280 |
continue;
|
| 281 |
}
|
| 282 |
|
|
|
|
| 427 |
formattedPage.html;
|
| 428 |
}
|
| 429 |
|
| 430 |
+
searchResultsQualified(results: FormattedPage[], targetResultCount = this.targetResultCount) {
|
| 431 |
+
return _.every(results, (x) => this.pageQualified(x)) && results.length >= targetResultCount;
|
| 432 |
}
|
| 433 |
|
| 434 |
async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) {
|
backend/functions/src/services/brave-search.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, retry } from 'civkit';
|
| 2 |
import { singleton } from 'tsyringe';
|
| 3 |
import { Logger } from '../shared/services/logger';
|
| 4 |
import { SecretExposer } from '../shared/services/secrets';
|
|
@@ -31,7 +31,6 @@ export class BraveSearchService extends AsyncService {
|
|
| 31 |
this.braveSearchHTTP = new BraveSearchHTTP(this.secretExposer.BRAVE_SEARCH_API_KEY);
|
| 32 |
}
|
| 33 |
|
| 34 |
-
@retry(3, Math.ceil(500 + 500 * Math.random()))
|
| 35 |
async webSearch(query: WebSearchQueryParams) {
|
| 36 |
const ip = this.threadLocal.get('ip');
|
| 37 |
const extraHeaders: WebSearchOptionalHeaderOptions = {};
|
|
@@ -65,16 +64,25 @@ export class BraveSearchService extends AsyncService {
|
|
| 65 |
encoded.q = (Buffer.from(encoded.q).toString('ascii') === encoded.q) ? encoded.q : encodeURIComponent(encoded.q);
|
| 66 |
}
|
| 67 |
|
| 68 |
-
|
| 69 |
-
const r = await this.braveSearchHTTP.webSearch(encoded, { headers: extraHeaders as Record<string, string> });
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
}
|
| 77 |
|
|
|
|
| 78 |
}
|
| 79 |
|
| 80 |
}
|
|
|
|
| 1 |
+
import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, delay, marshalErrorLike } from 'civkit';
|
| 2 |
import { singleton } from 'tsyringe';
|
| 3 |
import { Logger } from '../shared/services/logger';
|
| 4 |
import { SecretExposer } from '../shared/services/secrets';
|
|
|
|
| 31 |
this.braveSearchHTTP = new BraveSearchHTTP(this.secretExposer.BRAVE_SEARCH_API_KEY);
|
| 32 |
}
|
| 33 |
|
|
|
|
| 34 |
async webSearch(query: WebSearchQueryParams) {
|
| 35 |
const ip = this.threadLocal.get('ip');
|
| 36 |
const extraHeaders: WebSearchOptionalHeaderOptions = {};
|
|
|
|
| 64 |
encoded.q = (Buffer.from(encoded.q).toString('ascii') === encoded.q) ? encoded.q : encodeURIComponent(encoded.q);
|
| 65 |
}
|
| 66 |
|
| 67 |
+
let maxTries = 11;
|
|
|
|
| 68 |
|
| 69 |
+
while (maxTries--) {
|
| 70 |
+
try {
|
| 71 |
+
const r = await this.braveSearchHTTP.webSearch(encoded, { headers: extraHeaders as Record<string, string> });
|
| 72 |
|
| 73 |
+
return r.parsed;
|
| 74 |
+
} catch (err: any) {
|
| 75 |
+
this.logger.error(`Web search failed: ${err?.message}`, { err: marshalErrorLike(err) });
|
| 76 |
+
if (err?.status === 429) {
|
| 77 |
+
await delay(500 + 1000 * Math.random());
|
| 78 |
+
continue;
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
throw new DownstreamServiceFailureError({ message: `Search failed` });
|
| 82 |
+
}
|
| 83 |
}
|
| 84 |
|
| 85 |
+
throw new DownstreamServiceFailureError({ message: `Search failed` });
|
| 86 |
}
|
| 87 |
|
| 88 |
}
|