nomagick commited on
Commit
1e5e94f
·
unverified ·
1 Parent(s): 0cf8371

saas(search): switch to internal serp

Browse files
src/api/{searcher-serper.ts → searcher.ts} RENAMED
@@ -9,10 +9,9 @@ import _ from 'lodash';
9
  import { RateLimitControl, RateLimitDesc, RateLimitTriggeredError } from '../shared/services/rate-limit';
10
 
11
  import { CrawlerHost, ExtraScrappingOptions } from './crawler';
12
- import { SerperSearchResult } from '../db/searched';
13
  import { CrawlerOptions, RESPOND_TIMING } from '../dto/crawler-options';
14
  import { SnapshotFormatter, FormattedPage as RealFormattedPage } from '../services/snapshot-formatter';
15
- import { GoogleSearchExplicitOperatorsDto, SerperSearchService } from '../services/serper-search';
16
 
17
  import { GlobalLogger } from '../services/logger';
18
  import { AsyncLocalContext } from '../services/async-context';
@@ -20,19 +19,16 @@ import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
20
  import { OutputServerEventStream } from '../lib/transform-server-event-stream';
21
  import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
22
  import { InsufficientBalanceError } from '../services/errors';
23
- import {
24
- SerperImageSearchResponse,
25
- SerperNewsSearchResponse,
26
- SerperSearchQueryParams,
27
- SerperSearchResponse,
28
- SerperWebSearchResponse,
29
- WORLD_COUNTRIES,
30
- WORLD_LANGUAGES
31
- } from '../shared/3rd-party/serper-search';
32
  import { toAsyncGenerator } from '../utils/misc';
33
  import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
34
  import { LRUCache } from 'lru-cache';
35
  import { API_CALL_STATUS } from '../shared/db/api-roll';
 
 
 
 
36
 
37
  const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase());
38
 
@@ -69,9 +65,11 @@ export class SearcherHost extends RPCHost {
69
  protected globalLogger: GlobalLogger,
70
  protected rateLimitControl: RateLimitControl,
71
  protected threadLocal: AsyncLocalContext,
72
- protected serperSearchService: SerperSearchService,
73
  protected crawler: CrawlerHost,
74
  protected snapshotFormatter: SnapshotFormatter,
 
 
 
75
  ) {
76
  super(...arguments);
77
  }
@@ -318,9 +316,14 @@ export class SearcherHost extends RPCHost {
318
  throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
319
  }
320
 
 
 
 
 
 
321
  let lastScrapped: any[] | undefined;
322
  const targetResultCount = crawlWithoutContent ? count : count + 2;
323
- const trimmedResults = results.filter((x) => Boolean(x.link)).slice(0, targetResultCount).map((x) => this.mapToFinalResults(x));
324
  trimmedResults.toString = function () {
325
  let r = this.map((x, i) => x ? Reflect.apply(x.toString, x, [i]) : '').join('\n\n').trimEnd() + '\n';
326
  if (fallbackQuery) {
@@ -521,7 +524,7 @@ export class SearcherHost extends RPCHost {
521
 
522
  // Extract results based on variant
523
  let tryTimes = 1;
524
- const results = await this.doSearch(params, noCache);
525
  if (results.length || !useFallback) {
526
  return { results, query: params.q, tryTimes };
527
  }
@@ -545,7 +548,7 @@ export class SearcherHost extends RPCHost {
545
  tryTimes += 1;
546
  this.logger.info(`Retrying search with fallback query: "${query}"`);
547
  const fallbackParams = { ...params, q: query };
548
- const fallbackResults = await this.doSearch(fallbackParams, noCache);
549
  if (fallbackResults.length > 0) {
550
  return { results: fallbackResults, query: fallbackParams.q, tryTimes };
551
  }
@@ -556,7 +559,7 @@ export class SearcherHost extends RPCHost {
556
  this.logger.info(`Retrying search with fallback query: "${query}"`);
557
  const fallbackParams = { ...params, q: query };
558
  tryTimes += 1;
559
- const fallbackResults = await this.doSearch(fallbackParams, noCache);
560
 
561
  if (fallbackResults.length > 0) {
562
  return { results: fallbackResults, query, tryTimes };
@@ -566,22 +569,6 @@ export class SearcherHost extends RPCHost {
566
  return { results, query: originalQuery, tryTimes };
567
  }
568
 
569
- async doSearch(
570
- params: SerperSearchQueryParams & { variant: 'web' | 'images' | 'news'; provider?: string; },
571
- noCache: boolean = false,
572
- ) {
573
- const response = await this.cachedSearch(params, noCache);
574
-
575
- let results = [];
576
- switch (params.variant) {
577
- case 'images': results = (response as SerperImageSearchResponse).images; break;
578
- case 'news': results = (response as SerperNewsSearchResponse).news; break;
579
- case 'web': default: results = (response as SerperWebSearchResponse).organic; break;
580
- }
581
-
582
- return results;
583
- }
584
-
585
  async *fetchSearchResults(
586
  mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'favicon' | 'content',
587
  searchResults?: FormattedPage[],
@@ -706,13 +693,36 @@ export class SearcherHost extends RPCHost {
706
  }
707
  }
708
 
709
- async cachedSearch(query: SerperSearchQueryParams & { variant: 'web' | 'images' | 'news'; provider?: string; }, noCache: boolean = false) {
710
- const queryDigest = objHashMd5B64Of(query);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
711
  Reflect.deleteProperty(query, 'provider');
712
  let cache;
713
  if (!noCache) {
714
- cache = (await SerperSearchResult.fromFirestoreQuery(
715
- SerperSearchResult.COLLECTION.where('queryDigest', '==', queryDigest)
716
  .orderBy('createdAt', 'desc')
717
  .limit(1)
718
  ))[0];
@@ -724,70 +734,81 @@ export class SearcherHost extends RPCHost {
724
  });
725
 
726
  if (!stale) {
727
- return cache.response as SerperSearchResponse;
728
  }
729
  }
730
  }
731
 
732
  try {
733
- let r;
734
- const variant = query.variant;
735
- Reflect.deleteProperty(query, 'variant');
736
- switch (variant) {
737
- case 'images': {
738
- r = await this.serperSearchService.imageSearch(query);
739
- break;
740
- }
741
- case 'news': {
742
- r = await this.serperSearchService.newsSearch(query);
743
- break;
744
- }
745
- case 'web':
746
- default: {
747
- r = await this.serperSearchService.webSearch(query);
748
- break;
 
 
 
 
 
 
 
 
 
 
 
 
749
  }
750
  }
751
 
752
- const nowDate = new Date();
753
- const record = SerperSearchResult.from({
754
- query,
755
- queryDigest,
756
- response: r,
757
- createdAt: nowDate,
758
- expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
759
- });
760
- SerperSearchResult.save(record.degradeForFireStore()).catch((err) => {
761
- this.logger.warn(`Failed to cache search result`, { err });
762
- });
 
 
 
 
763
 
764
- return r;
765
  } catch (err: any) {
766
  if (cache) {
767
  this.logger.warn(`Failed to fetch search result, but a stale cache is available. falling back to stale cache`, { err: marshalErrorLike(err) });
768
 
769
- return cache.response as SerperSearchResponse;
770
  }
771
 
772
  throw err;
773
  }
774
-
775
  }
776
 
777
- mapToFinalResults(input:
778
- | SerperImageSearchResponse['images'][0]
779
- | SerperWebSearchResponse['organic'][0]
780
- | SerperNewsSearchResponse['news'][0],
781
- ) {
782
  const whitelistedProps = [
783
- 'imageUrl', 'imageWidth', 'imageHeight', 'source', 'date'
784
  ];
785
  const result = {
786
  title: input.title,
787
  url: input.link,
788
  description: Reflect.get(input, 'snippet'),
789
  ..._.pick(input, whitelistedProps),
790
- } as FormattedPage;
791
 
792
  return result;
793
  }
 
9
  import { RateLimitControl, RateLimitDesc, RateLimitTriggeredError } from '../shared/services/rate-limit';
10
 
11
  import { CrawlerHost, ExtraScrappingOptions } from './crawler';
 
12
  import { CrawlerOptions, RESPOND_TIMING } from '../dto/crawler-options';
13
  import { SnapshotFormatter, FormattedPage as RealFormattedPage } from '../services/snapshot-formatter';
14
+ import { GoogleSearchExplicitOperatorsDto } from '../services/serper-search';
15
 
16
  import { GlobalLogger } from '../services/logger';
17
  import { AsyncLocalContext } from '../services/async-context';
 
19
  import { OutputServerEventStream } from '../lib/transform-server-event-stream';
20
  import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
21
  import { InsufficientBalanceError } from '../services/errors';
22
+
23
+ import { SerperBingSearchService, SerperGoogleSearchService } from '../services/serp/serper';
 
 
 
 
 
 
 
24
  import { toAsyncGenerator } from '../utils/misc';
25
  import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
26
  import { LRUCache } from 'lru-cache';
27
  import { API_CALL_STATUS } from '../shared/db/api-roll';
28
+ import { SERPResult } from '../db/searched';
29
+ import { SerperSearchQueryParams, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
30
+ import { InternalJinaSerpService } from '../services/serp/internal';
31
+ import { WebSearchEntry } from '../services/serp/compat';
32
 
33
  const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase());
34
 
 
65
  protected globalLogger: GlobalLogger,
66
  protected rateLimitControl: RateLimitControl,
67
  protected threadLocal: AsyncLocalContext,
 
68
  protected crawler: CrawlerHost,
69
  protected snapshotFormatter: SnapshotFormatter,
70
+ protected serperGoogle: SerperGoogleSearchService,
71
+ protected serperBing: SerperBingSearchService,
72
+ protected jinaSerp: InternalJinaSerpService,
73
  ) {
74
  super(...arguments);
75
  }
 
316
  throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
317
  }
318
 
319
+ if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) {
320
+ delete crawlOpts.timeoutMs;
321
+ }
322
+
323
+
324
  let lastScrapped: any[] | undefined;
325
  const targetResultCount = crawlWithoutContent ? count : count + 2;
326
+ const trimmedResults: any[] = results.filter((x) => Boolean(x.link)).slice(0, targetResultCount).map((x) => this.mapToFinalResults(x));
327
  trimmedResults.toString = function () {
328
  let r = this.map((x, i) => x ? Reflect.apply(x.toString, x, [i]) : '').join('\n\n').trimEnd() + '\n';
329
  if (fallbackQuery) {
 
524
 
525
  // Extract results based on variant
526
  let tryTimes = 1;
527
+ const results = await this.cachedSearch(params.variant, params, noCache);
528
  if (results.length || !useFallback) {
529
  return { results, query: params.q, tryTimes };
530
  }
 
548
  tryTimes += 1;
549
  this.logger.info(`Retrying search with fallback query: "${query}"`);
550
  const fallbackParams = { ...params, q: query };
551
+ const fallbackResults = await this.cachedSearch(params.variant, fallbackParams, noCache);
552
  if (fallbackResults.length > 0) {
553
  return { results: fallbackResults, query: fallbackParams.q, tryTimes };
554
  }
 
559
  this.logger.info(`Retrying search with fallback query: "${query}"`);
560
  const fallbackParams = { ...params, q: query };
561
  tryTimes += 1;
562
+ const fallbackResults = await this.cachedSearch(params.variant, fallbackParams, noCache);
563
 
564
  if (fallbackResults.length > 0) {
565
  return { results: fallbackResults, query, tryTimes };
 
569
  return { results, query: originalQuery, tryTimes };
570
  }
571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
572
  async *fetchSearchResults(
573
  mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'favicon' | 'content',
574
  searchResults?: FormattedPage[],
 
693
  }
694
  }
695
 
696
+ *iterProviders(preference?: string) {
697
+ if (preference === 'bing') {
698
+ yield this.serperBing;
699
+ yield this.jinaSerp;
700
+ yield this.serperGoogle;
701
+
702
+ return;
703
+ }
704
+
705
+ if (preference === 'google') {
706
+ yield this.jinaSerp;
707
+ yield this.serperGoogle;
708
+ yield this.serperGoogle;
709
+
710
+ return;
711
+ }
712
+
713
+ yield this.jinaSerp;
714
+ yield this.serperGoogle;
715
+ yield this.serperGoogle;
716
+ }
717
+
718
+ async cachedSearch(variant: 'web' | 'news' | 'images', query: Record<string, any>, noCache?: boolean): Promise<WebSearchEntry[]> {
719
+ const queryDigest = objHashMd5B64Of({ ...query, variant });
720
+ const provider = query.provider;
721
  Reflect.deleteProperty(query, 'provider');
722
  let cache;
723
  if (!noCache) {
724
+ cache = (await SERPResult.fromFirestoreQuery(
725
+ SERPResult.COLLECTION.where('queryDigest', '==', queryDigest)
726
  .orderBy('createdAt', 'desc')
727
  .limit(1)
728
  ))[0];
 
734
  });
735
 
736
  if (!stale) {
737
+ return cache.response as any;
738
  }
739
  }
740
  }
741
 
742
  try {
743
+ let r: any[] | undefined;
744
+ let lastError;
745
+ outerLoop:
746
+ for (const client of this.iterProviders(provider)) {
747
+ const t0 = Date.now();
748
+ try {
749
+ switch (variant) {
750
+ case 'images': {
751
+ r = await Reflect.apply(client.imageSearch, client, [query]);
752
+ break;
753
+ }
754
+ case 'news': {
755
+ r = await Reflect.apply(client.newsSearch, client, [query]);
756
+ break;
757
+ }
758
+ case 'web':
759
+ default: {
760
+ r = await Reflect.apply(client.webSearch, client, [query]);
761
+ break;
762
+ }
763
+ }
764
+ const dt = Date.now() - t0;
765
+ this.logger.info(`Search took ${dt}ms, ${client.constructor.name}(${variant})`, { searchDt: dt, variant, client: client.constructor.name });
766
+ break outerLoop;
767
+ } catch (err) {
768
+ lastError = err;
769
+ const dt = Date.now() - t0;
770
+ this.logger.warn(`Failed to do ${variant} search using ${client.constructor.name}`, { err, variant, searchDt: dt, });
771
  }
772
  }
773
 
774
+ if (r?.length) {
775
+ const nowDate = new Date();
776
+ const record = SERPResult.from({
777
+ query,
778
+ queryDigest,
779
+ response: r,
780
+ createdAt: nowDate,
781
+ expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
782
+ });
783
+ SERPResult.save(record.degradeForFireStore()).catch((err) => {
784
+ this.logger.warn(`Failed to cache search result`, { err });
785
+ });
786
+ } else if (lastError) {
787
+ throw lastError;
788
+ }
789
 
790
+ return r as WebSearchEntry[];
791
  } catch (err: any) {
792
  if (cache) {
793
  this.logger.warn(`Failed to fetch search result, but a stale cache is available. falling back to stale cache`, { err: marshalErrorLike(err) });
794
 
795
+ return cache.response as any;
796
  }
797
 
798
  throw err;
799
  }
 
800
  }
801
 
802
+ mapToFinalResults(input: WebSearchEntry) {
 
 
 
 
803
  const whitelistedProps = [
804
+ 'imageUrl', 'imageWidth', 'imageHeight', 'source', 'date', 'siteLinks'
805
  ];
806
  const result = {
807
  title: input.title,
808
  url: input.link,
809
  description: Reflect.get(input, 'snippet'),
810
  ..._.pick(input, whitelistedProps),
811
+ };
812
 
813
  return result;
814
  }
src/services/serp/internal.ts ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import { singleton } from 'tsyringe';
3
+ import { GlobalLogger } from '../logger';
4
+ import { SecretExposer } from '../../shared/services/secrets';
5
+ import { AsyncLocalContext } from '../async-context';
6
+ import { SerperSearchQueryParams } from '../../shared/3rd-party/serper-search';
7
+ import { BlackHoleDetector } from '../blackhole-detector';
8
+ import { AsyncService } from 'civkit/async-service';
9
+ import { JinaSerpApiHTTP } from '../../shared/3rd-party/internal-serp';
10
+ import { WebSearchEntry } from './compat';
11
+
12
+ @singleton()
13
+ export class InternalJinaSerpService extends AsyncService {
14
+
15
+ logger = this.globalLogger.child({ service: this.constructor.name });
16
+
17
+ client!: JinaSerpApiHTTP;
18
+
19
+ constructor(
20
+ protected globalLogger: GlobalLogger,
21
+ protected secretExposer: SecretExposer,
22
+ protected threadLocal: AsyncLocalContext,
23
+ protected blackHoleDetector: BlackHoleDetector,
24
+ ) {
25
+ super(...arguments);
26
+ }
27
+
28
+ override async init() {
29
+ await this.dependencyReady();
30
+ this.emit('ready');
31
+
32
+ this.client = new JinaSerpApiHTTP(this.secretExposer.JINA_SERP_API_KEY);
33
+ }
34
+
35
+
36
+ async doSearch(variant: 'web' | 'images' | 'news', query: SerperSearchQueryParams) {
37
+ this.logger.debug(`Doing external search`, query);
38
+ let results;
39
+ switch (variant) {
40
+ // case 'images': {
41
+ // const r = await this.client.imageSearch(query);
42
+
43
+ // results = r.parsed.images;
44
+ // break;
45
+ // }
46
+ // case 'news': {
47
+ // const r = await this.client.newsSearch(query);
48
+
49
+ // results = r.parsed.news;
50
+ // break;
51
+ // }
52
+ case 'web':
53
+ default: {
54
+ const r = await this.client.webSearch(query);
55
+
56
+ results = r.parsed.results?.map((x) => ({ ...x, link: x.url }));
57
+ break;
58
+ }
59
+ }
60
+
61
+ this.blackHoleDetector.itWorked();
62
+
63
+ return results as WebSearchEntry[];
64
+ }
65
+
66
+
67
+ async webSearch(query: SerperSearchQueryParams) {
68
+ return this.doSearch('web', query);
69
+ }
70
+ async imageSearch(query: SerperSearchQueryParams) {
71
+ return this.doSearch('images', query);
72
+ }
73
+ async newsSearch(query: SerperSearchQueryParams) {
74
+ return this.doSearch('news', query);
75
+ }
76
+
77
+ }
src/stand-alone/search.ts CHANGED
@@ -4,7 +4,7 @@ import { container, singleton } from 'tsyringe';
4
  import { KoaServer } from 'civkit/civ-rpc/koa';
5
  import http2 from 'http2';
6
  import http from 'http';
7
- import { SearcherHost } from '../api/searcher-serper';
8
  import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
9
  import path from 'path';
10
  import fs from 'fs';
 
4
  import { KoaServer } from 'civkit/civ-rpc/koa';
5
  import http2 from 'http2';
6
  import http from 'http';
7
+ import { SearcherHost } from '../api/searcher';
8
  import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
9
  import path from 'path';
10
  import fs from 'fs';
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 08ded7b8eceee7e931d52e77c87103f28c3ba9e8
 
1
+ Subproject commit 3238f911b51c28960d94d3683076e48c17a57610