nomagick committed on
Commit
9bcde30
·
unverified ·
1 Parent(s): 09dbbd3
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -776,7 +776,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
776
 
777
  getUrlDigest(urlToCrawl: URL) {
778
  const normalizedURL = new URL(urlToCrawl);
779
- normalizedURL.hash = '';
 
 
780
  const normalizedUrl = normalizedURL.toString().toLowerCase();
781
  const digest = md5Hasher.hash(normalizedUrl.toString());
782
 
 
776
 
777
  getUrlDigest(urlToCrawl: URL) {
778
  const normalizedURL = new URL(urlToCrawl);
779
+ if (!normalizedURL.hash.startsWith('/')) {
780
+ normalizedURL.hash = '';
781
+ }
782
  const normalizedUrl = normalizedURL.toString().toLowerCase();
783
  const digest = md5Hasher.hash(normalizedUrl.toString());
784
 
backend/functions/src/cloud-functions/searcher.ts CHANGED
@@ -5,7 +5,7 @@ import {
5
  objHashMd5B64Of,
6
  } from 'civkit';
7
  import { singleton } from 'tsyringe';
8
- import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
9
  import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
10
  import _ from 'lodash';
11
  import { Request, Response } from 'express';
@@ -83,6 +83,8 @@ export class SearcherHost extends RPCHost {
83
  res: Response,
84
  },
85
  auth: JinaEmbeddingsAuthDTO,
 
 
86
  crawlerOptions: CrawlerOptions,
87
  braveSearchExplicitOperators: BraveSearchExplicitOperatorsDto,
88
  ) {
@@ -157,7 +159,7 @@ export class SearcherHost extends RPCHost {
157
  const searchQuery = braveSearchExplicitOperators.addTo(ctx.req.path.slice(1));
158
  const r = await this.cachedWebSearch({
159
  q: searchQuery,
160
- count: 10
161
  }, crawlerOptions.noCache);
162
 
163
  if (!r.web?.results.length) {
@@ -226,7 +228,7 @@ export class SearcherHost extends RPCHost {
226
  if (_.some(scrapped, (x) => this.pageQualified(x))) {
227
  setEarlyReturnTimer();
228
  }
229
- if (!this.searchResultsQualified(scrapped)) {
230
  continue;
231
  }
232
  if (earlyReturnTimer) {
@@ -274,7 +276,7 @@ export class SearcherHost extends RPCHost {
274
  setEarlyReturnTimer();
275
  }
276
 
277
- if (!this.searchResultsQualified(scrapped)) {
278
  continue;
279
  }
280
 
@@ -425,8 +427,8 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
425
  formattedPage.html;
426
  }
427
 
428
- searchResultsQualified(results: FormattedPage[]) {
429
- return _.every(results, (x) => this.pageQualified(x)) && results.length >= this.targetResultCount;
430
  }
431
 
432
  async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) {
 
5
  objHashMd5B64Of,
6
  } from 'civkit';
7
  import { singleton } from 'tsyringe';
8
+ import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, Param, RPCReflect } from '../shared';
9
  import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
10
  import _ from 'lodash';
11
  import { Request, Response } from 'express';
 
83
  res: Response,
84
  },
85
  auth: JinaEmbeddingsAuthDTO,
86
+ @Param('count', { default: 5, validate: (v) => v >= 3 && v <= 10 })
87
+ count: number,
88
  crawlerOptions: CrawlerOptions,
89
  braveSearchExplicitOperators: BraveSearchExplicitOperatorsDto,
90
  ) {
 
159
  const searchQuery = braveSearchExplicitOperators.addTo(ctx.req.path.slice(1));
160
  const r = await this.cachedWebSearch({
161
  q: searchQuery,
162
+ count: Math.floor(count * 2)
163
  }, crawlerOptions.noCache);
164
 
165
  if (!r.web?.results.length) {
 
228
  if (_.some(scrapped, (x) => this.pageQualified(x))) {
229
  setEarlyReturnTimer();
230
  }
231
+ if (!this.searchResultsQualified(scrapped, count)) {
232
  continue;
233
  }
234
  if (earlyReturnTimer) {
 
276
  setEarlyReturnTimer();
277
  }
278
 
279
+ if (!this.searchResultsQualified(scrapped, count)) {
280
  continue;
281
  }
282
 
 
427
  formattedPage.html;
428
  }
429
 
430
+ searchResultsQualified(results: FormattedPage[], targetResultCount = this.targetResultCount) {
431
+ return _.every(results, (x) => this.pageQualified(x)) && results.length >= targetResultCount;
432
  }
433
 
434
  async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) {
backend/functions/src/services/brave-search.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, marshalErrorLike, retry } from 'civkit';
2
  import { singleton } from 'tsyringe';
3
  import { Logger } from '../shared/services/logger';
4
  import { SecretExposer } from '../shared/services/secrets';
@@ -31,7 +31,6 @@ export class BraveSearchService extends AsyncService {
31
  this.braveSearchHTTP = new BraveSearchHTTP(this.secretExposer.BRAVE_SEARCH_API_KEY);
32
  }
33
 
34
- @retry(3, Math.ceil(500 + 500 * Math.random()))
35
  async webSearch(query: WebSearchQueryParams) {
36
  const ip = this.threadLocal.get('ip');
37
  const extraHeaders: WebSearchOptionalHeaderOptions = {};
@@ -65,16 +64,25 @@ export class BraveSearchService extends AsyncService {
65
  encoded.q = (Buffer.from(encoded.q).toString('ascii') === encoded.q) ? encoded.q : encodeURIComponent(encoded.q);
66
  }
67
 
68
- try {
69
- const r = await this.braveSearchHTTP.webSearch(encoded, { headers: extraHeaders as Record<string, string> });
70
 
71
- return r.parsed;
72
- } catch (err: any) {
73
- this.logger.error(`Web search failed: ${err?.message}`, { err: marshalErrorLike(err) });
74
 
75
- throw new DownstreamServiceFailureError({ message: `Search failed` });
 
 
 
 
 
 
 
 
 
76
  }
77
 
 
78
  }
79
 
80
  }
 
1
+ import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, delay, marshalErrorLike } from 'civkit';
2
  import { singleton } from 'tsyringe';
3
  import { Logger } from '../shared/services/logger';
4
  import { SecretExposer } from '../shared/services/secrets';
 
31
  this.braveSearchHTTP = new BraveSearchHTTP(this.secretExposer.BRAVE_SEARCH_API_KEY);
32
  }
33
 
 
34
  async webSearch(query: WebSearchQueryParams) {
35
  const ip = this.threadLocal.get('ip');
36
  const extraHeaders: WebSearchOptionalHeaderOptions = {};
 
64
  encoded.q = (Buffer.from(encoded.q).toString('ascii') === encoded.q) ? encoded.q : encodeURIComponent(encoded.q);
65
  }
66
 
67
+ let maxTries = 11;
 
68
 
69
+ while (maxTries--) {
70
+ try {
71
+ const r = await this.braveSearchHTTP.webSearch(encoded, { headers: extraHeaders as Record<string, string> });
72
 
73
+ return r.parsed;
74
+ } catch (err: any) {
75
+ this.logger.error(`Web search failed: ${err?.message}`, { err: marshalErrorLike(err) });
76
+ if (err?.status === 429) {
77
+ await delay(500 + 1000 * Math.random());
78
+ continue;
79
+ }
80
+
81
+ throw new DownstreamServiceFailureError({ message: `Search failed` });
82
+ }
83
  }
84
 
85
+ throw new DownstreamServiceFailureError({ message: `Search failed` });
86
  }
87
 
88
  }