Spaces:
Build error
Build error
saas: refactor proxy provider (#1186)
Browse files
- .github/workflows/cd.yml +2 -2
- src/api/crawler.ts +14 -4
- src/services/serp/google.ts +2 -2
- thinapps-shared +1 -1
.github/workflows/cd.yml
CHANGED
|
@@ -84,6 +84,6 @@ jobs:
|
|
| 84 |
- name: Deploy SEARCH-EU with Tag
|
| 85 |
run: |
|
| 86 |
gcloud beta run deploy search-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2
|
| 87 |
-
- name: Deploy SERP-
|
| 88 |
run: |
|
| 89 |
-
gcloud beta run deploy serp-
|
|
|
|
| 84 |
- name: Deploy SEARCH-EU with Tag
|
| 85 |
run: |
|
| 86 |
gcloud beta run deploy search-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2
|
| 87 |
+
- name: Deploy SERP-HK with Tag
|
| 88 |
run: |
|
| 89 |
+
gcloud beta run deploy serp-hk --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region asia-east2 --async --min-instances 0 --deploy-health-check --use-http2
|
src/api/crawler.ts
CHANGED
|
@@ -41,13 +41,13 @@ import {
|
|
| 41 |
} from '../services/errors';
|
| 42 |
|
| 43 |
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
| 44 |
-
import {
|
| 45 |
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
| 46 |
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
| 47 |
import { RobotsTxtService } from '../services/robots-text';
|
| 48 |
import { TempFileManager } from '../services/temp-file';
|
| 49 |
import { MiscService } from '../services/misc';
|
| 50 |
-
import { HTTPServiceError } from 'civkit';
|
| 51 |
import { GeoIPService } from '../services/geoip';
|
| 52 |
|
| 53 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
|
@@ -87,7 +87,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 87 |
protected puppeteerControl: PuppeteerControl,
|
| 88 |
protected curlControl: CurlControl,
|
| 89 |
protected cfBrowserRendering: CFBrowserRendering,
|
| 90 |
-
protected proxyProvider:
|
| 91 |
protected lmControl: LmControl,
|
| 92 |
protected jsdomControl: JSDomControl,
|
| 93 |
protected snapshotFormatter: SnapshotFormatter,
|
|
@@ -1232,6 +1232,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 1232 |
};
|
| 1233 |
}
|
| 1234 |
|
|
|
|
| 1235 |
@retryWith((err) => {
|
| 1236 |
if (err instanceof ServiceBadApproachError) {
|
| 1237 |
return false;
|
|
@@ -1250,8 +1251,17 @@ export class CrawlerHost extends RPCHost {
|
|
| 1250 |
if (opts?.allocProxy === 'none') {
|
| 1251 |
return this.curlControl.sideLoad(url, opts);
|
| 1252 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1253 |
|
| 1254 |
-
|
| 1255 |
this.logger.debug(`Proxy allocated`, { proxy: proxy.href });
|
| 1256 |
const r = await this.curlControl.sideLoad(url, {
|
| 1257 |
...opts,
|
|
|
|
| 41 |
} from '../services/errors';
|
| 42 |
|
| 43 |
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
| 44 |
+
import { ProxyProviderService } from '../shared/services/proxy-provider';
|
| 45 |
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
| 46 |
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
| 47 |
import { RobotsTxtService } from '../services/robots-text';
|
| 48 |
import { TempFileManager } from '../services/temp-file';
|
| 49 |
import { MiscService } from '../services/misc';
|
| 50 |
+
import { HTTPServiceError } from 'civkit/http';
|
| 51 |
import { GeoIPService } from '../services/geoip';
|
| 52 |
|
| 53 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
|
|
|
| 87 |
protected puppeteerControl: PuppeteerControl,
|
| 88 |
protected curlControl: CurlControl,
|
| 89 |
protected cfBrowserRendering: CFBrowserRendering,
|
| 90 |
+
protected proxyProvider: ProxyProviderService,
|
| 91 |
protected lmControl: LmControl,
|
| 92 |
protected jsdomControl: JSDomControl,
|
| 93 |
protected snapshotFormatter: SnapshotFormatter,
|
|
|
|
| 1232 |
};
|
| 1233 |
}
|
| 1234 |
|
| 1235 |
+
proxyIterMap = new WeakMap<ExtraScrappingOptions, ReturnType<ProxyProviderService['iterAlloc']>>();
|
| 1236 |
@retryWith((err) => {
|
| 1237 |
if (err instanceof ServiceBadApproachError) {
|
| 1238 |
return false;
|
|
|
|
| 1251 |
if (opts?.allocProxy === 'none') {
|
| 1252 |
return this.curlControl.sideLoad(url, opts);
|
| 1253 |
}
|
| 1254 |
+
let proxy;
|
| 1255 |
+
if (opts) {
|
| 1256 |
+
let it = this.proxyIterMap.get(opts);
|
| 1257 |
+
if (!it) {
|
| 1258 |
+
it = this.proxyProvider.iterAlloc(this.figureOutBestProxyCountry(opts));
|
| 1259 |
+
this.proxyIterMap.set(opts, it);
|
| 1260 |
+
}
|
| 1261 |
+
proxy = (await it.next()).value;
|
| 1262 |
+
}
|
| 1263 |
|
| 1264 |
+
proxy ??= await this.proxyProvider.alloc(this.figureOutBestProxyCountry(opts));
|
| 1265 |
this.logger.debug(`Proxy allocated`, { proxy: proxy.href });
|
| 1266 |
const r = await this.curlControl.sideLoad(url, {
|
| 1267 |
...opts,
|
src/services/serp/google.ts
CHANGED
|
@@ -12,7 +12,7 @@ import { ApplicationError } from 'civkit/civ-rpc';
|
|
| 12 |
import { ServiceBadApproachError, ServiceBadAttemptError } from '../errors';
|
| 13 |
import { parseJSONText } from 'civkit/vectorize';
|
| 14 |
import { retryWith } from 'civkit/decorators';
|
| 15 |
-
import {
|
| 16 |
|
| 17 |
@singleton()
|
| 18 |
export class GoogleSERP extends AsyncService {
|
|
@@ -24,7 +24,7 @@ export class GoogleSERP extends AsyncService {
|
|
| 24 |
protected puppeteerControl: SERPSpecializedPuppeteerControl,
|
| 25 |
protected jsDomControl: JSDomControl,
|
| 26 |
protected curlControl: CurlControl,
|
| 27 |
-
protected proxyProvider:
|
| 28 |
) {
|
| 29 |
const filteredDeps = isMainThread ? arguments : _.without(arguments, puppeteerControl);
|
| 30 |
super(...filteredDeps);
|
|
|
|
| 12 |
import { ServiceBadApproachError, ServiceBadAttemptError } from '../errors';
|
| 13 |
import { parseJSONText } from 'civkit/vectorize';
|
| 14 |
import { retryWith } from 'civkit/decorators';
|
| 15 |
+
import { ProxyProviderService } from '../../shared/services/proxy-provider';
|
| 16 |
|
| 17 |
@singleton()
|
| 18 |
export class GoogleSERP extends AsyncService {
|
|
|
|
| 24 |
protected puppeteerControl: SERPSpecializedPuppeteerControl,
|
| 25 |
protected jsDomControl: JSDomControl,
|
| 26 |
protected curlControl: CurlControl,
|
| 27 |
+
protected proxyProvider: ProxyProviderService,
|
| 28 |
) {
|
| 29 |
const filteredDeps = isMainThread ? arguments : _.without(arguments, puppeteerControl);
|
| 30 |
super(...filteredDeps);
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit 6fac86977536a7b7440edba8d4cf2a1f0e769e8c
|