Spaces:
Build error
Build error
feat: web search (#57)
Browse files- backend/.gitignore +2 -1
- backend/functions/integrity-check.cjs +11 -0
- backend/functions/package-lock.json +31 -0
- backend/functions/package.json +2 -1
- backend/functions/src/cloud-functions/crawler.ts +54 -8
- backend/functions/src/cloud-functions/searcher.ts +389 -0
- backend/functions/src/db/searched.ts +60 -0
- backend/functions/src/services/brave-search.ts +71 -0
- backend/functions/src/services/geoip.ts +123 -0
- backend/functions/src/services/puppeteer.ts +4 -4
- thinapps-shared +1 -1
backend/.gitignore
CHANGED
|
@@ -75,4 +75,5 @@ build/
|
|
| 75 |
.DS_Store
|
| 76 |
|
| 77 |
*.local
|
| 78 |
-
.secret.*
|
|
|
|
|
|
| 75 |
.DS_Store
|
| 76 |
|
| 77 |
*.local
|
| 78 |
+
.secret.*
|
| 79 |
+
licensed/
|
backend/functions/integrity-check.cjs
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
|
| 3 |
+
const fs = require('fs');
|
| 4 |
+
const path = require('path');
|
| 5 |
+
|
| 6 |
+
const file = path.resolve(__dirname, 'licensed/GeoLite2-City.mmdb');
|
| 7 |
+
|
| 8 |
+
if (!fs.existsSync(file)) {
|
| 9 |
+
console.error(`Integrity check failed: ${file} does not exist.`);
|
| 10 |
+
process.exit(1);
|
| 11 |
+
}
|
backend/functions/package-lock.json
CHANGED
|
@@ -24,6 +24,7 @@
|
|
| 24 |
"htmlparser2": "^9.0.0",
|
| 25 |
"jose": "^5.1.0",
|
| 26 |
"langdetect": "^0.2.1",
|
|
|
|
| 27 |
"minio": "^7.1.3",
|
| 28 |
"openai": "^4.20.0",
|
| 29 |
"puppeteer": "^22.7.1",
|
|
@@ -8144,6 +8145,19 @@
|
|
| 8144 |
"tmpl": "1.0.5"
|
| 8145 |
}
|
| 8146 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8147 |
"node_modules/media-typer": {
|
| 8148 |
"version": "0.3.0",
|
| 8149 |
"resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz",
|
|
@@ -8375,6 +8389,15 @@
|
|
| 8375 |
"resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz",
|
| 8376 |
"integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A=="
|
| 8377 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8378 |
"node_modules/mongodb": {
|
| 8379 |
"version": "5.9.2",
|
| 8380 |
"resolved": "https://registry.npmjs.org/mongodb/-/mongodb-5.9.2.tgz",
|
|
@@ -11059,6 +11082,14 @@
|
|
| 11059 |
"resolved": "https://registry.npmjs.org/tiktoken/-/tiktoken-1.0.13.tgz",
|
| 11060 |
"integrity": "sha512-JaL9ZnvTbGFMDIBeGdVkLt4qWTeCPw+n7Ock+wceAGRenuHA6nOOvMJFliNDyXsjg2osGKJWsXtO2xc74VxyDw=="
|
| 11061 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11062 |
"node_modules/tld-extract": {
|
| 11063 |
"version": "2.1.0",
|
| 11064 |
"resolved": "https://registry.npmjs.org/tld-extract/-/tld-extract-2.1.0.tgz",
|
|
|
|
| 24 |
"htmlparser2": "^9.0.0",
|
| 25 |
"jose": "^5.1.0",
|
| 26 |
"langdetect": "^0.2.1",
|
| 27 |
+
"maxmind": "^4.3.18",
|
| 28 |
"minio": "^7.1.3",
|
| 29 |
"openai": "^4.20.0",
|
| 30 |
"puppeteer": "^22.7.1",
|
|
|
|
| 8145 |
"tmpl": "1.0.5"
|
| 8146 |
}
|
| 8147 |
},
|
| 8148 |
+
"node_modules/maxmind": {
|
| 8149 |
+
"version": "4.3.18",
|
| 8150 |
+
"resolved": "https://registry.npmjs.org/maxmind/-/maxmind-4.3.18.tgz",
|
| 8151 |
+
"integrity": "sha512-5b9utU7ZxcGYTBaO7hCF0FXyfw3IpankLn+FnLW4RZS1zi97RBeSdfXJFJlk5UxNsMiFZlsdMT3lzvD+bD8MLQ==",
|
| 8152 |
+
"dependencies": {
|
| 8153 |
+
"mmdb-lib": "2.1.0",
|
| 8154 |
+
"tiny-lru": "11.2.5"
|
| 8155 |
+
},
|
| 8156 |
+
"engines": {
|
| 8157 |
+
"node": ">=12",
|
| 8158 |
+
"npm": ">=6"
|
| 8159 |
+
}
|
| 8160 |
+
},
|
| 8161 |
"node_modules/media-typer": {
|
| 8162 |
"version": "0.3.0",
|
| 8163 |
"resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz",
|
|
|
|
| 8389 |
"resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz",
|
| 8390 |
"integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A=="
|
| 8391 |
},
|
| 8392 |
+
"node_modules/mmdb-lib": {
|
| 8393 |
+
"version": "2.1.0",
|
| 8394 |
+
"resolved": "https://registry.npmjs.org/mmdb-lib/-/mmdb-lib-2.1.0.tgz",
|
| 8395 |
+
"integrity": "sha512-tdDTZmnI5G4UoSctv2KxM/3VQt2XRj4CmR5R4VsAWsOUcS3LysHR34wtixWm/pXxXdkBDuN92auxkC0T2+qd1Q==",
|
| 8396 |
+
"engines": {
|
| 8397 |
+
"node": ">=10",
|
| 8398 |
+
"npm": ">=6"
|
| 8399 |
+
}
|
| 8400 |
+
},
|
| 8401 |
"node_modules/mongodb": {
|
| 8402 |
"version": "5.9.2",
|
| 8403 |
"resolved": "https://registry.npmjs.org/mongodb/-/mongodb-5.9.2.tgz",
|
|
|
|
| 11082 |
"resolved": "https://registry.npmjs.org/tiktoken/-/tiktoken-1.0.13.tgz",
|
| 11083 |
"integrity": "sha512-JaL9ZnvTbGFMDIBeGdVkLt4qWTeCPw+n7Ock+wceAGRenuHA6nOOvMJFliNDyXsjg2osGKJWsXtO2xc74VxyDw=="
|
| 11084 |
},
|
| 11085 |
+
"node_modules/tiny-lru": {
|
| 11086 |
+
"version": "11.2.5",
|
| 11087 |
+
"resolved": "https://registry.npmjs.org/tiny-lru/-/tiny-lru-11.2.5.tgz",
|
| 11088 |
+
"integrity": "sha512-JpqM0K33lG6iQGKiigcwuURAKZlq6rHXfrgeL4/I8/REoyJTGU+tEMszvT/oTRVHG2OiylhGDjqPp1jWMlr3bw==",
|
| 11089 |
+
"engines": {
|
| 11090 |
+
"node": ">=12"
|
| 11091 |
+
}
|
| 11092 |
+
},
|
| 11093 |
"node_modules/tld-extract": {
|
| 11094 |
"version": "2.1.0",
|
| 11095 |
"resolved": "https://registry.npmjs.org/tld-extract/-/tld-extract-2.1.0.tgz",
|
backend/functions/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"name": "reader",
|
| 3 |
"scripts": {
|
| 4 |
"lint": "eslint --ext .js,.ts .",
|
| 5 |
-
"build": "tsc -p .",
|
| 6 |
"build:watch": "tsc --watch",
|
| 7 |
"build:clean": "rm -rf ./build",
|
| 8 |
"shell": "npm run build && firebase functions:shell",
|
|
@@ -44,6 +44,7 @@
|
|
| 44 |
"htmlparser2": "^9.0.0",
|
| 45 |
"jose": "^5.1.0",
|
| 46 |
"langdetect": "^0.2.1",
|
|
|
|
| 47 |
"minio": "^7.1.3",
|
| 48 |
"openai": "^4.20.0",
|
| 49 |
"puppeteer": "^22.7.1",
|
|
|
|
| 2 |
"name": "reader",
|
| 3 |
"scripts": {
|
| 4 |
"lint": "eslint --ext .js,.ts .",
|
| 5 |
+
"build": "node ./integrity-check.cjs && tsc -p .",
|
| 6 |
"build:watch": "tsc --watch",
|
| 7 |
"build:clean": "rm -rf ./build",
|
| 8 |
"shell": "npm run build && firebase functions:shell",
|
|
|
|
| 44 |
"htmlparser2": "^9.0.0",
|
| 45 |
"jose": "^5.1.0",
|
| 46 |
"langdetect": "^0.2.1",
|
| 47 |
+
"maxmind": "^4.3.18",
|
| 48 |
"minio": "^7.1.3",
|
| 49 |
"openai": "^4.20.0",
|
| 50 |
"puppeteer": "^22.7.1",
|
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -2,7 +2,7 @@ import {
|
|
| 2 |
assignTransferProtocolMeta, marshalErrorLike,
|
| 3 |
RPCHost, RPCReflection,
|
| 4 |
HashManager,
|
| 5 |
-
AssertionFailureError, ParamValidationError,
|
| 6 |
} from 'civkit';
|
| 7 |
import { singleton } from 'tsyringe';
|
| 8 |
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
|
@@ -34,6 +34,12 @@ export class CrawlerHost extends RPCHost {
|
|
| 34 |
cacheValidMs = 1000 * 300;
|
| 35 |
urlValidMs = 1000 * 3600 * 4;
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
constructor(
|
| 38 |
protected globalLogger: Logger,
|
| 39 |
protected puppeteerControl: PuppeteerControl,
|
|
@@ -357,10 +363,7 @@ ${this.content}
|
|
| 357 |
[Balance left] ${latestUser.wallet.total_balance}
|
| 358 |
` : '';
|
| 359 |
|
| 360 |
-
return assignTransferProtocolMeta(`
|
| 361 |
-
[Homepage] https://jina.ai/reader
|
| 362 |
-
[Source code] https://github.com/jina-ai/reader
|
| 363 |
-
${authMixin}`,
|
| 364 |
{ contentType: 'text/plain', envelope: null }
|
| 365 |
);
|
| 366 |
}
|
|
@@ -638,13 +641,13 @@ ${authMixin}`,
|
|
| 638 |
return r;
|
| 639 |
}
|
| 640 |
|
| 641 |
-
async *cachedScrap(urlToCrawl: URL, crawlOpts: ScrappingOptions, noCache: boolean = false) {
|
| 642 |
let cache;
|
| 643 |
-
if (!noCache && !crawlOpts.cookies?.length) {
|
| 644 |
cache = await this.queryCache(urlToCrawl);
|
| 645 |
}
|
| 646 |
|
| 647 |
-
if (cache?.isFresh && (!crawlOpts.favorScreenshot || (crawlOpts.favorScreenshot && cache?.screenshotAvailable))) {
|
| 648 |
yield cache.snapshot;
|
| 649 |
|
| 650 |
return;
|
|
@@ -683,4 +686,47 @@ ${authMixin}`,
|
|
| 683 |
return undefined;
|
| 684 |
}
|
| 685 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 686 |
}
|
|
|
|
| 2 |
assignTransferProtocolMeta, marshalErrorLike,
|
| 3 |
RPCHost, RPCReflection,
|
| 4 |
HashManager,
|
| 5 |
+
AssertionFailureError, ParamValidationError, Defer,
|
| 6 |
} from 'civkit';
|
| 7 |
import { singleton } from 'tsyringe';
|
| 8 |
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
|
|
|
| 34 |
cacheValidMs = 1000 * 300;
|
| 35 |
urlValidMs = 1000 * 3600 * 4;
|
| 36 |
|
| 37 |
+
indexText = `[Usage1] https://r.jina.ai/YOUR_URL
|
| 38 |
+
[Usage2] https://s.jina.ai/YOUR_SEARCH_QUERY
|
| 39 |
+
[Homepage] https://jina.ai/reader
|
| 40 |
+
[Source code] https://github.com/jina-ai/reader
|
| 41 |
+
`;
|
| 42 |
+
|
| 43 |
constructor(
|
| 44 |
protected globalLogger: Logger,
|
| 45 |
protected puppeteerControl: PuppeteerControl,
|
|
|
|
| 363 |
[Balance left] ${latestUser.wallet.total_balance}
|
| 364 |
` : '';
|
| 365 |
|
| 366 |
+
return assignTransferProtocolMeta(`${this.indexText}${authMixin}`,
|
|
|
|
|
|
|
|
|
|
| 367 |
{ contentType: 'text/plain', envelope: null }
|
| 368 |
);
|
| 369 |
}
|
|
|
|
| 641 |
return r;
|
| 642 |
}
|
| 643 |
|
| 644 |
+
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ScrappingOptions, noCache: boolean = false) {
|
| 645 |
let cache;
|
| 646 |
+
if (!noCache && !crawlOpts?.cookies?.length) {
|
| 647 |
cache = await this.queryCache(urlToCrawl);
|
| 648 |
}
|
| 649 |
|
| 650 |
+
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
|
| 651 |
yield cache.snapshot;
|
| 652 |
|
| 653 |
return;
|
|
|
|
| 686 |
return undefined;
|
| 687 |
}
|
| 688 |
|
| 689 |
+
|
| 690 |
+
async *scrapMany(urls: URL[], options?: ScrappingOptions, noCache = false) {
|
| 691 |
+
const iterators = urls.map((url) => this.cachedScrap(url, options, noCache));
|
| 692 |
+
|
| 693 |
+
const results: (PageSnapshot | undefined)[] = iterators.map((_x)=> undefined);
|
| 694 |
+
|
| 695 |
+
let nextDeferred = Defer();
|
| 696 |
+
let concluded = false;
|
| 697 |
+
|
| 698 |
+
const handler = async (it: AsyncGenerator<PageSnapshot | undefined>, idx: number) => {
|
| 699 |
+
for await (const x of it) {
|
| 700 |
+
results[idx] = x;
|
| 701 |
+
|
| 702 |
+
if (x) {
|
| 703 |
+
nextDeferred.resolve();
|
| 704 |
+
nextDeferred = Defer();
|
| 705 |
+
}
|
| 706 |
+
|
| 707 |
+
}
|
| 708 |
+
};
|
| 709 |
+
|
| 710 |
+
Promise.all(
|
| 711 |
+
iterators.map((it, idx) => handler(it, idx))
|
| 712 |
+
).finally(() => {
|
| 713 |
+
concluded = true;
|
| 714 |
+
nextDeferred.resolve();
|
| 715 |
+
});
|
| 716 |
+
|
| 717 |
+
yield results;
|
| 718 |
+
|
| 719 |
+
try {
|
| 720 |
+
while (!concluded) {
|
| 721 |
+
await nextDeferred.promise;
|
| 722 |
+
|
| 723 |
+
yield results;
|
| 724 |
+
}
|
| 725 |
+
} finally {
|
| 726 |
+
for (const x of iterators) {
|
| 727 |
+
x.return();
|
| 728 |
+
}
|
| 729 |
+
}
|
| 730 |
+
}
|
| 731 |
+
|
| 732 |
}
|
backend/functions/src/cloud-functions/searcher.ts
ADDED
|
@@ -0,0 +1,389 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import {
|
| 2 |
+
assignTransferProtocolMeta, marshalErrorLike,
|
| 3 |
+
RPCHost, RPCReflection,
|
| 4 |
+
AssertionFailureError,
|
| 5 |
+
objHashMd5B64Of,
|
| 6 |
+
} from 'civkit';
|
| 7 |
+
import { singleton } from 'tsyringe';
|
| 8 |
+
import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
| 9 |
+
import { RateLimitControl } from '../shared/services/rate-limit';
|
| 10 |
+
import _ from 'lodash';
|
| 11 |
+
import { ScrappingOptions } from '../services/puppeteer';
|
| 12 |
+
import { Request, Response } from 'express';
|
| 13 |
+
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
| 14 |
+
import { BraveSearchService } from '../services/brave-search';
|
| 15 |
+
import { CrawlerHost } from './crawler';
|
| 16 |
+
import { CookieParam } from 'puppeteer';
|
| 17 |
+
|
| 18 |
+
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
| 19 |
+
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
| 20 |
+
import { SearchResult } from '../db/searched';
|
| 21 |
+
import { WebSearchApiResponse } from '../shared/3rd-party/brave-types';
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@singleton()
|
| 25 |
+
export class SearcherHost extends RPCHost {
|
| 26 |
+
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 27 |
+
|
| 28 |
+
cacheRetentionMs = 1000 * 3600 * 24 * 7;
|
| 29 |
+
cacheValidMs = 1000 * 3600;
|
| 30 |
+
|
| 31 |
+
constructor(
|
| 32 |
+
protected globalLogger: Logger,
|
| 33 |
+
protected rateLimitControl: RateLimitControl,
|
| 34 |
+
protected threadLocal: AsyncContext,
|
| 35 |
+
protected braveSearchService: BraveSearchService,
|
| 36 |
+
protected crawler: CrawlerHost,
|
| 37 |
+
) {
|
| 38 |
+
super(...arguments);
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
override async init() {
|
| 42 |
+
await this.dependencyReady();
|
| 43 |
+
|
| 44 |
+
this.emit('ready');
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
@CloudHTTPv2({
|
| 48 |
+
name: 'search2',
|
| 49 |
+
runtime: {
|
| 50 |
+
memory: '4GiB',
|
| 51 |
+
timeoutSeconds: 300,
|
| 52 |
+
concurrency: 4,
|
| 53 |
+
},
|
| 54 |
+
tags: ['Searcher'],
|
| 55 |
+
httpMethod: ['get', 'post'],
|
| 56 |
+
returnType: [String, OutputServerEventStream],
|
| 57 |
+
exposeRoot: true,
|
| 58 |
+
})
|
| 59 |
+
@CloudHTTPv2({
|
| 60 |
+
runtime: {
|
| 61 |
+
memory: '8GiB',
|
| 62 |
+
timeoutSeconds: 300,
|
| 63 |
+
concurrency: 8,
|
| 64 |
+
maxInstances: 200,
|
| 65 |
+
},
|
| 66 |
+
openapi: {
|
| 67 |
+
operation: {
|
| 68 |
+
parameters: {
|
| 69 |
+
'Accept': {
|
| 70 |
+
description: `Specifies your preference for the response format. \n\n` +
|
| 71 |
+
`Supported formats:\n` +
|
| 72 |
+
`- text/event-stream\n` +
|
| 73 |
+
`- application/json or text/json\n` +
|
| 74 |
+
`- text/plain`
|
| 75 |
+
,
|
| 76 |
+
in: 'header',
|
| 77 |
+
schema: { type: 'string' }
|
| 78 |
+
},
|
| 79 |
+
'X-No-Cache': {
|
| 80 |
+
description: `Ignores internal cache if this header is specified with a value.`,
|
| 81 |
+
in: 'header',
|
| 82 |
+
schema: { type: 'string' }
|
| 83 |
+
},
|
| 84 |
+
'X-Respond-With': {
|
| 85 |
+
description: `Specifies the (non-default) form factor of the crawled data you prefer. \n\n` +
|
| 86 |
+
`Supported formats:\n` +
|
| 87 |
+
`- markdown\n` +
|
| 88 |
+
`- html\n` +
|
| 89 |
+
`- text\n` +
|
| 90 |
+
`- screenshot\n`
|
| 91 |
+
,
|
| 92 |
+
in: 'header',
|
| 93 |
+
schema: { type: 'string' }
|
| 94 |
+
},
|
| 95 |
+
'X-Proxy-Url': {
|
| 96 |
+
description: `Specifies your custom proxy if you prefer to use one. \n\n` +
|
| 97 |
+
`Supported protocols:\n` +
|
| 98 |
+
`- http\n` +
|
| 99 |
+
`- https\n` +
|
| 100 |
+
`- socks4\n` +
|
| 101 |
+
`- socks5\n\n` +
|
| 102 |
+
`For authentication, https://user:pass@host:port`,
|
| 103 |
+
in: 'header',
|
| 104 |
+
schema: { type: 'string' }
|
| 105 |
+
},
|
| 106 |
+
'X-Set-Cookie': {
|
| 107 |
+
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
|
| 108 |
+
`Syntax is the same with standard Set-Cookie`,
|
| 109 |
+
in: 'header',
|
| 110 |
+
schema: { type: 'string' }
|
| 111 |
+
},
|
| 112 |
+
'X-With-Generated-Alt': {
|
| 113 |
+
description: `Enable automatic alt-text generating for images without an meaningful alt-text.`,
|
| 114 |
+
in: 'header',
|
| 115 |
+
schema: { type: 'string' }
|
| 116 |
+
},
|
| 117 |
+
}
|
| 118 |
+
}
|
| 119 |
+
},
|
| 120 |
+
tags: ['Searcher'],
|
| 121 |
+
httpMethod: ['get', 'post'],
|
| 122 |
+
returnType: [String, OutputServerEventStream],
|
| 123 |
+
exposeRoot: true,
|
| 124 |
+
})
|
| 125 |
+
async search(
|
| 126 |
+
@RPCReflect() rpcReflect: RPCReflection,
|
| 127 |
+
@Ctx() ctx: {
|
| 128 |
+
req: Request,
|
| 129 |
+
res: Response,
|
| 130 |
+
},
|
| 131 |
+
auth: JinaEmbeddingsAuthDTO
|
| 132 |
+
) {
|
| 133 |
+
const uid = await auth.solveUID();
|
| 134 |
+
let chargeAmount = 0;
|
| 135 |
+
const noSlashPath = ctx.req.url.slice(1);
|
| 136 |
+
if (!noSlashPath) {
|
| 137 |
+
const latestUser = uid ? await auth.assertUser() : undefined;
|
| 138 |
+
const authMixin = latestUser ? `
|
| 139 |
+
[Authenticated as] ${latestUser.user_id} (${latestUser.full_name})
|
| 140 |
+
[Balance left] ${latestUser.wallet.total_balance}
|
| 141 |
+
` : '';
|
| 142 |
+
|
| 143 |
+
return assignTransferProtocolMeta(`${this.crawler.indexText}${authMixin}`,
|
| 144 |
+
{ contentType: 'text/plain', envelope: null }
|
| 145 |
+
);
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
if (uid) {
|
| 149 |
+
const user = await auth.assertUser();
|
| 150 |
+
if (!(user.wallet.total_balance > 0)) {
|
| 151 |
+
throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'],
|
| 155 |
+
[
|
| 156 |
+
// 1000 requests per minute
|
| 157 |
+
new Date(Date.now() - 60 * 1000), 1000
|
| 158 |
+
]
|
| 159 |
+
);
|
| 160 |
+
|
| 161 |
+
rpcReflect.finally(() => {
|
| 162 |
+
if (chargeAmount) {
|
| 163 |
+
auth.reportUsage(chargeAmount, 'reader-crawl').catch((err) => {
|
| 164 |
+
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
|
| 165 |
+
});
|
| 166 |
+
}
|
| 167 |
+
});
|
| 168 |
+
} else if (ctx.req.ip) {
|
| 169 |
+
this.threadLocal.set('ip', ctx.req.ip);
|
| 170 |
+
await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'],
|
| 171 |
+
[
|
| 172 |
+
// 100 requests per minute
|
| 173 |
+
new Date(Date.now() - 60 * 1000), 100
|
| 174 |
+
]
|
| 175 |
+
);
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
const customMode = ctx.req.get('x-respond-with') || 'default';
|
| 179 |
+
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
|
| 180 |
+
const noCache = Boolean(ctx.req.get('x-no-cache'));
|
| 181 |
+
const cookies: CookieParam[] = [];
|
| 182 |
+
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
| 183 |
+
if (Array.isArray(setCookieHeaders)) {
|
| 184 |
+
for (const setCookie of setCookieHeaders) {
|
| 185 |
+
cookies.push({
|
| 186 |
+
...parseSetCookieString(setCookie, { decodeValues: false }) as CookieParam,
|
| 187 |
+
});
|
| 188 |
+
}
|
| 189 |
+
} else if (setCookieHeaders) {
|
| 190 |
+
cookies.push({
|
| 191 |
+
...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
|
| 192 |
+
});
|
| 193 |
+
}
|
| 194 |
+
this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
|
| 195 |
+
const crawlOpts: ScrappingOptions = {
|
| 196 |
+
proxyUrl: ctx.req.get('x-proxy-url'),
|
| 197 |
+
cookies,
|
| 198 |
+
favorScreenshot: customMode === 'screenshot'
|
| 199 |
+
};
|
| 200 |
+
|
| 201 |
+
const searchQuery = noSlashPath;
|
| 202 |
+
const r = await this.cachedWebSearch({
|
| 203 |
+
q: searchQuery,
|
| 204 |
+
count: 5
|
| 205 |
+
});
|
| 206 |
+
|
| 207 |
+
const urls = r.web.results.map((x) => new URL(x.url));
|
| 208 |
+
const it = this.fetchSearchResults(customMode, urls, crawlOpts, noCache);
|
| 209 |
+
|
| 210 |
+
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
| 211 |
+
const sseStream = new OutputServerEventStream();
|
| 212 |
+
rpcReflect.return(sseStream);
|
| 213 |
+
|
| 214 |
+
try {
|
| 215 |
+
for await (const scrapped of it) {
|
| 216 |
+
if (!scrapped) {
|
| 217 |
+
continue;
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
chargeAmount = this.getChargeAmount(scrapped);
|
| 221 |
+
sseStream.write({
|
| 222 |
+
event: 'data',
|
| 223 |
+
data: scrapped,
|
| 224 |
+
});
|
| 225 |
+
}
|
| 226 |
+
} catch (err: any) {
|
| 227 |
+
this.logger.error(`Failed to collect search result for query ${searchQuery}`,
|
| 228 |
+
{ err: marshalErrorLike(err) }
|
| 229 |
+
);
|
| 230 |
+
sseStream.write({
|
| 231 |
+
event: 'error',
|
| 232 |
+
data: marshalErrorLike(err),
|
| 233 |
+
});
|
| 234 |
+
}
|
| 235 |
+
|
| 236 |
+
sseStream.end();
|
| 237 |
+
|
| 238 |
+
return sseStream;
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
+
let lastScrapped;
|
| 242 |
+
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
| 243 |
+
for await (const scrapped of it) {
|
| 244 |
+
lastScrapped = scrapped;
|
| 245 |
+
|
| 246 |
+
if (!this.qualified(scrapped)) {
|
| 247 |
+
continue;
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
chargeAmount = this.getChargeAmount(scrapped);
|
| 251 |
+
|
| 252 |
+
return scrapped;
|
| 253 |
+
}
|
| 254 |
+
|
| 255 |
+
if (!lastScrapped) {
|
| 256 |
+
throw new AssertionFailureError(`No content available for query ${searchQuery}`);
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
chargeAmount = this.getChargeAmount(lastScrapped);
|
| 260 |
+
|
| 261 |
+
return lastScrapped;
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
for await (const scrapped of it) {
|
| 265 |
+
lastScrapped = scrapped;
|
| 266 |
+
|
| 267 |
+
if (!this.qualified(scrapped)) {
|
| 268 |
+
continue;
|
| 269 |
+
}
|
| 270 |
+
chargeAmount = this.getChargeAmount(scrapped);
|
| 271 |
+
|
| 272 |
+
return assignTransferProtocolMeta(`${scrapped}`, { contentType: 'text/plain', envelope: null });
|
| 273 |
+
}
|
| 274 |
+
|
| 275 |
+
if (!lastScrapped) {
|
| 276 |
+
throw new AssertionFailureError(`No content available for query ${searchQuery}`);
|
| 277 |
+
}
|
| 278 |
+
|
| 279 |
+
chargeAmount = this.getChargeAmount(lastScrapped);
|
| 280 |
+
|
| 281 |
+
return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
|
| 282 |
+
}
|
| 283 |
+
|
| 284 |
+
async *fetchSearchResults(mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
|
| 285 |
+
urls: URL[], options?: ScrappingOptions, noCache = false) {
|
| 286 |
+
|
| 287 |
+
for await (const scrapped of this.crawler.scrapMany(urls, options, noCache)) {
|
| 288 |
+
const mapped = scrapped.map((x, i) => {
|
| 289 |
+
if (!x) {
|
| 290 |
+
const p = {
|
| 291 |
+
toString() {
|
| 292 |
+
return `[${i + 1}] No content available for ${urls[i]}`;
|
| 293 |
+
}
|
| 294 |
+
};
|
| 295 |
+
const r = Object.create(p);
|
| 296 |
+
r.url = urls[i].toString();
|
| 297 |
+
|
| 298 |
+
return r;
|
| 299 |
+
}
|
| 300 |
+
return this.crawler.formatSnapshot(mode, x, urls[i]);
|
| 301 |
+
});
|
| 302 |
+
|
| 303 |
+
const resultArray = await Promise.all(mapped);
|
| 304 |
+
for (const [i, result] of resultArray.entries()) {
|
| 305 |
+
if (result && typeof result === 'object' && Object.hasOwn(result, 'toString')) {
|
| 306 |
+
result.toString = function (this: any) {
|
| 307 |
+
const mixins = [];
|
| 308 |
+
if (this.publishedTime) {
|
| 309 |
+
mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
|
| 310 |
+
}
|
| 311 |
+
|
| 312 |
+
if (mode === 'markdown') {
|
| 313 |
+
return `[${i + 1}]\n${this.content}`;
|
| 314 |
+
}
|
| 315 |
+
|
| 316 |
+
return `[${i + 1}] Title: ${this.title}
|
| 317 |
+
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
|
| 318 |
+
[${i + 1}] Markdown Content:
|
| 319 |
+
${this.content}
|
| 320 |
+
`;
|
| 321 |
+
};
|
| 322 |
+
}
|
| 323 |
+
}
|
| 324 |
+
resultArray.toString = function () {
|
| 325 |
+
return this.map((x, i) => x ? x.toString() : `[${i + 1}] No content available for ${urls[i]}`).join('\n\n').trimEnd() + '\n';
|
| 326 |
+
};
|
| 327 |
+
|
| 328 |
+
yield resultArray;
|
| 329 |
+
}
|
| 330 |
+
}
|
| 331 |
+
|
| 332 |
+
getChargeAmount(formatted: any[]) {
|
| 333 |
+
return _.sum(
|
| 334 |
+
formatted.map((x) => this.crawler.getChargeAmount(x) || 0)
|
| 335 |
+
);
|
| 336 |
+
}
|
| 337 |
+
|
| 338 |
+
qualified(scrapped: any[]) {
|
| 339 |
+
return _.every(scrapped, (x) =>
|
| 340 |
+
(x as any)?.title &&
|
| 341 |
+
(
|
| 342 |
+
(x as any).content ||
|
| 343 |
+
(x as any).screenShotUrl ||
|
| 344 |
+
(x as any).screenshot ||
|
| 345 |
+
(x as any).text ||
|
| 346 |
+
(x as any).html
|
| 347 |
+
)
|
| 348 |
+
);
|
| 349 |
+
}
|
| 350 |
+
|
| 351 |
+
async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) {
|
| 352 |
+
const queryDigest = objHashMd5B64Of(query);
|
| 353 |
+
let cache;
|
| 354 |
+
if (!noCache) {
|
| 355 |
+
cache = (await SearchResult.fromFirestoreQuery(
|
| 356 |
+
SearchResult.COLLECTION.where('queryDigest', '==', queryDigest)
|
| 357 |
+
.orderBy('createdAt', 'desc')
|
| 358 |
+
.limit(1)
|
| 359 |
+
))[0];
|
| 360 |
+
if (cache) {
|
| 361 |
+
const age = Date.now() - cache.createdAt.valueOf();
|
| 362 |
+
const stale = cache.createdAt.valueOf() < (Date.now() - this.cacheValidMs);
|
| 363 |
+
this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for search query "${query.q}", normalized digest: ${queryDigest}, ${age}ms old`, {
|
| 364 |
+
query, digest: queryDigest, age, stale
|
| 365 |
+
});
|
| 366 |
+
|
| 367 |
+
if (!stale) {
|
| 368 |
+
return cache.response as WebSearchApiResponse;
|
| 369 |
+
}
|
| 370 |
+
}
|
| 371 |
+
}
|
| 372 |
+
|
| 373 |
+
const r = await this.braveSearchService.webSearch(query);
|
| 374 |
+
|
| 375 |
+
const nowDate = new Date();
|
| 376 |
+
const record = SearchResult.from({
|
| 377 |
+
query,
|
| 378 |
+
queryDigest,
|
| 379 |
+
response: r,
|
| 380 |
+
createdAt: nowDate,
|
| 381 |
+
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
|
| 382 |
+
});
|
| 383 |
+
SearchResult.save(record).catch((err) => {
|
| 384 |
+
this.logger.warn(`Failed to cache search result`, { err });
|
| 385 |
+
});
|
| 386 |
+
|
| 387 |
+
return r;
|
| 388 |
+
}
|
| 389 |
+
}
|
backend/functions/src/db/searched.ts
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { Also, parseJSONText, Prop } from 'civkit';
|
| 2 |
+
import { FirestoreRecord } from '../shared/lib/firestore';
|
| 3 |
+
import _ from 'lodash';
|
| 4 |
+
|
| 5 |
+
@Also({
|
| 6 |
+
dictOf: Object
|
| 7 |
+
})
|
| 8 |
+
export class SearchResult extends FirestoreRecord {
|
| 9 |
+
static override collectionName = 'searchResults';
|
| 10 |
+
|
| 11 |
+
override _id!: string;
|
| 12 |
+
|
| 13 |
+
@Prop({
|
| 14 |
+
required: true
|
| 15 |
+
})
|
| 16 |
+
query!: any;
|
| 17 |
+
|
| 18 |
+
@Prop({
|
| 19 |
+
required: true
|
| 20 |
+
})
|
| 21 |
+
queryDigest!: string;
|
| 22 |
+
|
| 23 |
+
@Prop()
|
| 24 |
+
response?: any;
|
| 25 |
+
|
| 26 |
+
@Prop()
|
| 27 |
+
createdAt!: Date;
|
| 28 |
+
|
| 29 |
+
@Prop()
|
| 30 |
+
expireAt?: Date;
|
| 31 |
+
|
| 32 |
+
[k: string]: any;
|
| 33 |
+
|
| 34 |
+
static patchedFields = [
|
| 35 |
+
'query',
|
| 36 |
+
'response',
|
| 37 |
+
];
|
| 38 |
+
|
| 39 |
+
static override from(input: any) {
|
| 40 |
+
for (const field of this.patchedFields) {
|
| 41 |
+
if (typeof input[field] === 'string') {
|
| 42 |
+
input[field] = parseJSONText(input[field]);
|
| 43 |
+
}
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
return super.from(input) as SearchResult;
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
override degradeForFireStore() {
|
| 50 |
+
const copy: any = { ...this };
|
| 51 |
+
|
| 52 |
+
for (const field of (this.constructor as typeof SearchResult).patchedFields) {
|
| 53 |
+
if (typeof copy[field] === 'object') {
|
| 54 |
+
copy[field] = JSON.stringify(copy[field]) as any;
|
| 55 |
+
}
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
return copy;
|
| 59 |
+
}
|
| 60 |
+
}
|
backend/functions/src/services/brave-search.ts
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { AsyncService, DownstreamServiceFailureError } from 'civkit';
|
| 2 |
+
import { singleton } from 'tsyringe';
|
| 3 |
+
import { Logger } from '../shared/services/logger';
|
| 4 |
+
import { SecretExposer } from '../shared/services/secrets';
|
| 5 |
+
import { BraveSearchHTTP, WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
| 6 |
+
import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip';
|
| 7 |
+
import { AsyncContext } from '../shared';
|
| 8 |
+
import { WebSearchOptionalHeaderOptions } from '../shared/3rd-party/brave-types';
|
| 9 |
+
|
| 10 |
+
@singleton()
|
| 11 |
+
export class BraveSearchService extends AsyncService {
|
| 12 |
+
|
| 13 |
+
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 14 |
+
|
| 15 |
+
braveSearchHTTP!: BraveSearchHTTP;
|
| 16 |
+
|
| 17 |
+
constructor(
|
| 18 |
+
protected globalLogger: Logger,
|
| 19 |
+
protected secretExposer: SecretExposer,
|
| 20 |
+
protected geoipControl: GeoIPService,
|
| 21 |
+
protected threadLocal: AsyncContext,
|
| 22 |
+
) {
|
| 23 |
+
super(...arguments);
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
override async init() {
|
| 27 |
+
await this.dependencyReady();
|
| 28 |
+
this.emit('ready');
|
| 29 |
+
|
| 30 |
+
this.braveSearchHTTP = new BraveSearchHTTP(this.secretExposer.BRAVE_SEARCH_API_KEY);
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
async webSearch(query: WebSearchQueryParams) {
|
| 34 |
+
const ip = this.threadLocal.get('ip');
|
| 35 |
+
const extraHeaders: WebSearchOptionalHeaderOptions = {};
|
| 36 |
+
if (ip) {
|
| 37 |
+
const geoip = await this.geoipControl.lookupCity(ip, GEOIP_SUPPORTED_LANGUAGES.EN);
|
| 38 |
+
|
| 39 |
+
if (geoip?.city) {
|
| 40 |
+
extraHeaders['X-Loc-City'] = geoip.city;
|
| 41 |
+
}
|
| 42 |
+
if (geoip?.country) {
|
| 43 |
+
extraHeaders['X-Loc-Country'] = geoip.country.code;
|
| 44 |
+
}
|
| 45 |
+
if (geoip?.timezone) {
|
| 46 |
+
extraHeaders['X-Loc-Timezone'] = geoip.timezone;
|
| 47 |
+
}
|
| 48 |
+
if (geoip?.coordinates) {
|
| 49 |
+
extraHeaders['X-Loc-Lat'] = `${geoip.coordinates[0]}`;
|
| 50 |
+
extraHeaders['X-Loc-Long'] = `${geoip.coordinates[1]}`;
|
| 51 |
+
}
|
| 52 |
+
if (geoip?.subdivisions?.length) {
|
| 53 |
+
extraHeaders['X-Loc-State'] = geoip.subdivisions[0].code;
|
| 54 |
+
extraHeaders['X-Loc-State-Name'] = geoip.subdivisions[0].name;
|
| 55 |
+
}
|
| 56 |
+
}
|
| 57 |
+
if (this.threadLocal.get('userAgent')) {
|
| 58 |
+
extraHeaders['User-Agent'] = this.threadLocal.get('userAgent');
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
try {
|
| 62 |
+
const r = await this.braveSearchHTTP.webSearch(query, { headers: extraHeaders as Record<string, string> });
|
| 63 |
+
|
| 64 |
+
return r.parsed;
|
| 65 |
+
} catch (err) {
|
| 66 |
+
throw new DownstreamServiceFailureError({ message: `Search failed`, cause: err });
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
}
|
backend/functions/src/services/geoip.ts
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { container, singleton } from 'tsyringe';
|
| 2 |
+
import fsp from 'fs/promises';
|
| 3 |
+
import { CityResponse, Reader } from 'maxmind';
|
| 4 |
+
import { AsyncService, AutoCastable, Prop, runOnce } from 'civkit';
|
| 5 |
+
import { Logger } from '../shared';
|
| 6 |
+
import path from 'path';
|
| 7 |
+
|
| 8 |
+
export enum GEOIP_SUPPORTED_LANGUAGES {
|
| 9 |
+
EN = 'en',
|
| 10 |
+
ZH_CN = 'zh-CN',
|
| 11 |
+
JA = 'ja',
|
| 12 |
+
DE = 'de',
|
| 13 |
+
FR = 'fr',
|
| 14 |
+
ES = 'es',
|
| 15 |
+
PT_BR = 'pt-BR',
|
| 16 |
+
RU = 'ru',
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
export class GeoIPInfo extends AutoCastable {
|
| 20 |
+
@Prop()
|
| 21 |
+
code?: string;
|
| 22 |
+
|
| 23 |
+
@Prop()
|
| 24 |
+
name?: string;
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
export class GeoIPCountryInfo extends GeoIPInfo {
|
| 28 |
+
@Prop()
|
| 29 |
+
eu?: boolean;
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
export class GeoIPCityResponse extends AutoCastable {
|
| 33 |
+
@Prop()
|
| 34 |
+
continent?: GeoIPInfo;
|
| 35 |
+
|
| 36 |
+
@Prop()
|
| 37 |
+
country?: GeoIPCountryInfo;
|
| 38 |
+
|
| 39 |
+
@Prop({
|
| 40 |
+
arrayOf: GeoIPInfo
|
| 41 |
+
})
|
| 42 |
+
subdivisions?: GeoIPInfo[];
|
| 43 |
+
|
| 44 |
+
@Prop()
|
| 45 |
+
city?: string;
|
| 46 |
+
|
| 47 |
+
@Prop({
|
| 48 |
+
arrayOf: Number
|
| 49 |
+
})
|
| 50 |
+
coordinates?: [number, number, number];
|
| 51 |
+
|
| 52 |
+
@Prop()
|
| 53 |
+
timezone?: string;
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
@singleton()
|
| 57 |
+
export class GeoIPService extends AsyncService {
|
| 58 |
+
|
| 59 |
+
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 60 |
+
|
| 61 |
+
mmdbCity!: Reader<CityResponse>;
|
| 62 |
+
|
| 63 |
+
constructor(
|
| 64 |
+
protected globalLogger: Logger,
|
| 65 |
+
) {
|
| 66 |
+
super(...arguments);
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
override async init() {
|
| 71 |
+
await this.dependencyReady();
|
| 72 |
+
|
| 73 |
+
this.emit('ready');
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
@runOnce()
|
| 77 |
+
async _lazyload() {
|
| 78 |
+
const mmdpPath = path.resolve(__dirname, '..', '..', 'licensed', 'GeoLite2-City.mmdb');
|
| 79 |
+
|
| 80 |
+
const dbBuff = await fsp.readFile(mmdpPath, { flag: 'r', encoding: null });
|
| 81 |
+
|
| 82 |
+
this.mmdbCity = new Reader<CityResponse>(dbBuff);
|
| 83 |
+
|
| 84 |
+
this.logger.info(`Loaded GeoIP database, ${dbBuff.byteLength} bytes`);
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
async lookupCity(ip: string, lang: GEOIP_SUPPORTED_LANGUAGES = GEOIP_SUPPORTED_LANGUAGES.EN) {
|
| 89 |
+
await this._lazyload();
|
| 90 |
+
|
| 91 |
+
const r = this.mmdbCity.get(ip);
|
| 92 |
+
|
| 93 |
+
if (!r) {
|
| 94 |
+
return undefined;
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
return GeoIPCityResponse.from({
|
| 98 |
+
continent: r.continent ? {
|
| 99 |
+
code: r.continent?.code,
|
| 100 |
+
name: r.continent?.names?.[lang] || r.continent?.names?.en,
|
| 101 |
+
} : undefined,
|
| 102 |
+
country: r.country ? {
|
| 103 |
+
code: r.country?.iso_code,
|
| 104 |
+
name: r.country?.names?.[lang] || r.country?.names.en,
|
| 105 |
+
eu: r.country?.is_in_european_union,
|
| 106 |
+
} : undefined,
|
| 107 |
+
city: r.city?.names?.[lang] || r.city?.names?.en,
|
| 108 |
+
subdivisions: r.subdivisions?.map((x) => ({
|
| 109 |
+
code: x.iso_code,
|
| 110 |
+
name: x.names?.[lang] || x.names?.en,
|
| 111 |
+
})),
|
| 112 |
+
coordinates: r.location ? [
|
| 113 |
+
r.location.latitude, r.location.longitude, r.location.accuracy_radius
|
| 114 |
+
] : undefined,
|
| 115 |
+
timezone: r.location?.time_zone,
|
| 116 |
+
});
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
const instance = container.resolve(GeoIPService);
|
| 122 |
+
|
| 123 |
+
export default instance;
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -278,7 +278,7 @@ document.addEventListener('load', handlePageLoad);
|
|
| 278 |
return page;
|
| 279 |
}
|
| 280 |
|
| 281 |
-
async *scrap(parsedUrl: URL, options: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
|
| 282 |
// parsedUrl.search = '';
|
| 283 |
const url = parsedUrl.toString();
|
| 284 |
|
|
@@ -287,10 +287,10 @@ document.addEventListener('load', handlePageLoad);
|
|
| 287 |
let screenshot: Buffer | undefined;
|
| 288 |
|
| 289 |
const page = await this.pagePool.acquire();
|
| 290 |
-
if (options.proxyUrl) {
|
| 291 |
await page.useProxy(options.proxyUrl);
|
| 292 |
}
|
| 293 |
-
if (options.cookies) {
|
| 294 |
await page.setCookie(...options.cookies);
|
| 295 |
}
|
| 296 |
|
|
@@ -353,7 +353,7 @@ document.addEventListener('load', handlePageLoad);
|
|
| 353 |
yield { ...snapshot, screenshot } as PageSnapshot;
|
| 354 |
break;
|
| 355 |
}
|
| 356 |
-
if (options.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
| 357 |
screenshot = await page.screenshot();
|
| 358 |
lastHTML = snapshot.html;
|
| 359 |
}
|
|
|
|
| 278 |
return page;
|
| 279 |
}
|
| 280 |
|
| 281 |
+
async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
|
| 282 |
// parsedUrl.search = '';
|
| 283 |
const url = parsedUrl.toString();
|
| 284 |
|
|
|
|
| 287 |
let screenshot: Buffer | undefined;
|
| 288 |
|
| 289 |
const page = await this.pagePool.acquire();
|
| 290 |
+
if (options?.proxyUrl) {
|
| 291 |
await page.useProxy(options.proxyUrl);
|
| 292 |
}
|
| 293 |
+
if (options?.cookies) {
|
| 294 |
await page.setCookie(...options.cookies);
|
| 295 |
}
|
| 296 |
|
|
|
|
| 353 |
yield { ...snapshot, screenshot } as PageSnapshot;
|
| 354 |
break;
|
| 355 |
}
|
| 356 |
+
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
| 357 |
screenshot = await page.screenshot();
|
| 358 |
lastHTML = snapshot.html;
|
| 359 |
}
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit 2f2cdcff7b2738be33ee5aca858ef2d65eba29ed
|