nomagick committed on
Commit
fe26921
·
unverified ·
1 Parent(s): 75a4dbd

saas: refactor proxy provider (#1186)

Browse files
.github/workflows/cd.yml CHANGED
@@ -84,6 +84,6 @@ jobs:
84
  - name: Deploy SEARCH-EU with Tag
85
  run: |
86
  gcloud beta run deploy search-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2
87
- - name: Deploy SERP-JP with Tag
88
  run: |
89
- gcloud beta run deploy serp-jp --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region asia-northeast1 --async --min-instances 0 --deploy-health-check --use-http2
 
84
  - name: Deploy SEARCH-EU with Tag
85
  run: |
86
  gcloud beta run deploy search-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2
87
+ - name: Deploy SERP-HK with Tag
88
  run: |
89
+ gcloud beta run deploy serp-hk --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region asia-east2 --async --min-instances 0 --deploy-health-check --use-http2
src/api/crawler.ts CHANGED
@@ -41,13 +41,13 @@ import {
41
  } from '../services/errors';
42
 
43
  import { countGPTToken as estimateToken } from '../shared/utils/openai';
44
- import { ProxyProvider } from '../shared/services/proxy-provider';
45
  import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
46
  import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
47
  import { RobotsTxtService } from '../services/robots-text';
48
  import { TempFileManager } from '../services/temp-file';
49
  import { MiscService } from '../services/misc';
50
- import { HTTPServiceError } from 'civkit';
51
  import { GeoIPService } from '../services/geoip';
52
 
53
  export interface ExtraScrappingOptions extends ScrappingOptions {
@@ -87,7 +87,7 @@ export class CrawlerHost extends RPCHost {
87
  protected puppeteerControl: PuppeteerControl,
88
  protected curlControl: CurlControl,
89
  protected cfBrowserRendering: CFBrowserRendering,
90
- protected proxyProvider: ProxyProvider,
91
  protected lmControl: LmControl,
92
  protected jsdomControl: JSDomControl,
93
  protected snapshotFormatter: SnapshotFormatter,
@@ -1232,6 +1232,7 @@ export class CrawlerHost extends RPCHost {
1232
  };
1233
  }
1234
 
 
1235
  @retryWith((err) => {
1236
  if (err instanceof ServiceBadApproachError) {
1237
  return false;
@@ -1250,8 +1251,17 @@ export class CrawlerHost extends RPCHost {
1250
  if (opts?.allocProxy === 'none') {
1251
  return this.curlControl.sideLoad(url, opts);
1252
  }
 
 
 
 
 
 
 
 
 
1253
 
1254
- const proxy = await this.proxyProvider.alloc(this.figureOutBestProxyCountry(opts));
1255
  this.logger.debug(`Proxy allocated`, { proxy: proxy.href });
1256
  const r = await this.curlControl.sideLoad(url, {
1257
  ...opts,
 
41
  } from '../services/errors';
42
 
43
  import { countGPTToken as estimateToken } from '../shared/utils/openai';
44
+ import { ProxyProviderService } from '../shared/services/proxy-provider';
45
  import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
46
  import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
47
  import { RobotsTxtService } from '../services/robots-text';
48
  import { TempFileManager } from '../services/temp-file';
49
  import { MiscService } from '../services/misc';
50
+ import { HTTPServiceError } from 'civkit/http';
51
  import { GeoIPService } from '../services/geoip';
52
 
53
  export interface ExtraScrappingOptions extends ScrappingOptions {
 
87
  protected puppeteerControl: PuppeteerControl,
88
  protected curlControl: CurlControl,
89
  protected cfBrowserRendering: CFBrowserRendering,
90
+ protected proxyProvider: ProxyProviderService,
91
  protected lmControl: LmControl,
92
  protected jsdomControl: JSDomControl,
93
  protected snapshotFormatter: SnapshotFormatter,
 
1232
  };
1233
  }
1234
 
1235
+ proxyIterMap = new WeakMap<ExtraScrappingOptions, ReturnType<ProxyProviderService['iterAlloc']>>();
1236
  @retryWith((err) => {
1237
  if (err instanceof ServiceBadApproachError) {
1238
  return false;
 
1251
  if (opts?.allocProxy === 'none') {
1252
  return this.curlControl.sideLoad(url, opts);
1253
  }
1254
+ let proxy;
1255
+ if (opts) {
1256
+ let it = this.proxyIterMap.get(opts);
1257
+ if (!it) {
1258
+ it = this.proxyProvider.iterAlloc(this.figureOutBestProxyCountry(opts));
1259
+ this.proxyIterMap.set(opts, it);
1260
+ }
1261
+ proxy = (await it.next()).value;
1262
+ }
1263
 
1264
+ proxy ??= await this.proxyProvider.alloc(this.figureOutBestProxyCountry(opts));
1265
  this.logger.debug(`Proxy allocated`, { proxy: proxy.href });
1266
  const r = await this.curlControl.sideLoad(url, {
1267
  ...opts,
src/services/serp/google.ts CHANGED
@@ -12,7 +12,7 @@ import { ApplicationError } from 'civkit/civ-rpc';
12
  import { ServiceBadApproachError, ServiceBadAttemptError } from '../errors';
13
  import { parseJSONText } from 'civkit/vectorize';
14
  import { retryWith } from 'civkit/decorators';
15
- import { ProxyProvider } from '../../shared/services/proxy-provider';
16
 
17
  @singleton()
18
  export class GoogleSERP extends AsyncService {
@@ -24,7 +24,7 @@ export class GoogleSERP extends AsyncService {
24
  protected puppeteerControl: SERPSpecializedPuppeteerControl,
25
  protected jsDomControl: JSDomControl,
26
  protected curlControl: CurlControl,
27
- protected proxyProvider: ProxyProvider,
28
  ) {
29
  const filteredDeps = isMainThread ? arguments : _.without(arguments, puppeteerControl);
30
  super(...filteredDeps);
 
12
  import { ServiceBadApproachError, ServiceBadAttemptError } from '../errors';
13
  import { parseJSONText } from 'civkit/vectorize';
14
  import { retryWith } from 'civkit/decorators';
15
+ import { ProxyProviderService } from '../../shared/services/proxy-provider';
16
 
17
  @singleton()
18
  export class GoogleSERP extends AsyncService {
 
24
  protected puppeteerControl: SERPSpecializedPuppeteerControl,
25
  protected jsDomControl: JSDomControl,
26
  protected curlControl: CurlControl,
27
+ protected proxyProvider: ProxyProviderService,
28
  ) {
29
  const filteredDeps = isMainThread ? arguments : _.without(arguments, puppeteerControl);
30
  super(...filteredDeps);
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 424f50ca8b6277d74185e16aa67ff2b366d9f727
 
1
+ Subproject commit 6fac86977536a7b7440edba8d4cf2a1f0e769e8c