nomagick commited on
Commit
2e3c217
·
unverified ·
1 Parent(s): f171e54

feat: web search (#57)

Browse files
backend/.gitignore CHANGED
@@ -75,4 +75,5 @@ build/
75
  .DS_Store
76
 
77
  *.local
78
- .secret.*
 
 
75
  .DS_Store
76
 
77
  *.local
78
+ .secret.*
79
+ licensed/
backend/functions/integrity-check.cjs ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env node
2
+
3
+ const fs = require('fs');
4
+ const path = require('path');
5
+
6
+ const file = path.resolve(__dirname, 'licensed/GeoLite2-City.mmdb');
7
+
8
+ if (!fs.existsSync(file)) {
9
+ console.error(`Integrity check failed: ${file} does not exist.`);
10
+ process.exit(1);
11
+ }
backend/functions/package-lock.json CHANGED
@@ -24,6 +24,7 @@
24
  "htmlparser2": "^9.0.0",
25
  "jose": "^5.1.0",
26
  "langdetect": "^0.2.1",
 
27
  "minio": "^7.1.3",
28
  "openai": "^4.20.0",
29
  "puppeteer": "^22.7.1",
@@ -8144,6 +8145,19 @@
8144
  "tmpl": "1.0.5"
8145
  }
8146
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
8147
  "node_modules/media-typer": {
8148
  "version": "0.3.0",
8149
  "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz",
@@ -8375,6 +8389,15 @@
8375
  "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz",
8376
  "integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A=="
8377
  },
 
 
 
 
 
 
 
 
 
8378
  "node_modules/mongodb": {
8379
  "version": "5.9.2",
8380
  "resolved": "https://registry.npmjs.org/mongodb/-/mongodb-5.9.2.tgz",
@@ -11059,6 +11082,14 @@
11059
  "resolved": "https://registry.npmjs.org/tiktoken/-/tiktoken-1.0.13.tgz",
11060
  "integrity": "sha512-JaL9ZnvTbGFMDIBeGdVkLt4qWTeCPw+n7Ock+wceAGRenuHA6nOOvMJFliNDyXsjg2osGKJWsXtO2xc74VxyDw=="
11061
  },
 
 
 
 
 
 
 
 
11062
  "node_modules/tld-extract": {
11063
  "version": "2.1.0",
11064
  "resolved": "https://registry.npmjs.org/tld-extract/-/tld-extract-2.1.0.tgz",
 
24
  "htmlparser2": "^9.0.0",
25
  "jose": "^5.1.0",
26
  "langdetect": "^0.2.1",
27
+ "maxmind": "^4.3.18",
28
  "minio": "^7.1.3",
29
  "openai": "^4.20.0",
30
  "puppeteer": "^22.7.1",
 
8145
  "tmpl": "1.0.5"
8146
  }
8147
  },
8148
+ "node_modules/maxmind": {
8149
+ "version": "4.3.18",
8150
+ "resolved": "https://registry.npmjs.org/maxmind/-/maxmind-4.3.18.tgz",
8151
+ "integrity": "sha512-5b9utU7ZxcGYTBaO7hCF0FXyfw3IpankLn+FnLW4RZS1zi97RBeSdfXJFJlk5UxNsMiFZlsdMT3lzvD+bD8MLQ==",
8152
+ "dependencies": {
8153
+ "mmdb-lib": "2.1.0",
8154
+ "tiny-lru": "11.2.5"
8155
+ },
8156
+ "engines": {
8157
+ "node": ">=12",
8158
+ "npm": ">=6"
8159
+ }
8160
+ },
8161
  "node_modules/media-typer": {
8162
  "version": "0.3.0",
8163
  "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz",
 
8389
  "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz",
8390
  "integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A=="
8391
  },
8392
+ "node_modules/mmdb-lib": {
8393
+ "version": "2.1.0",
8394
+ "resolved": "https://registry.npmjs.org/mmdb-lib/-/mmdb-lib-2.1.0.tgz",
8395
+ "integrity": "sha512-tdDTZmnI5G4UoSctv2KxM/3VQt2XRj4CmR5R4VsAWsOUcS3LysHR34wtixWm/pXxXdkBDuN92auxkC0T2+qd1Q==",
8396
+ "engines": {
8397
+ "node": ">=10",
8398
+ "npm": ">=6"
8399
+ }
8400
+ },
8401
  "node_modules/mongodb": {
8402
  "version": "5.9.2",
8403
  "resolved": "https://registry.npmjs.org/mongodb/-/mongodb-5.9.2.tgz",
 
11082
  "resolved": "https://registry.npmjs.org/tiktoken/-/tiktoken-1.0.13.tgz",
11083
  "integrity": "sha512-JaL9ZnvTbGFMDIBeGdVkLt4qWTeCPw+n7Ock+wceAGRenuHA6nOOvMJFliNDyXsjg2osGKJWsXtO2xc74VxyDw=="
11084
  },
11085
+ "node_modules/tiny-lru": {
11086
+ "version": "11.2.5",
11087
+ "resolved": "https://registry.npmjs.org/tiny-lru/-/tiny-lru-11.2.5.tgz",
11088
+ "integrity": "sha512-JpqM0K33lG6iQGKiigcwuURAKZlq6rHXfrgeL4/I8/REoyJTGU+tEMszvT/oTRVHG2OiylhGDjqPp1jWMlr3bw==",
11089
+ "engines": {
11090
+ "node": ">=12"
11091
+ }
11092
+ },
11093
  "node_modules/tld-extract": {
11094
  "version": "2.1.0",
11095
  "resolved": "https://registry.npmjs.org/tld-extract/-/tld-extract-2.1.0.tgz",
backend/functions/package.json CHANGED
@@ -2,7 +2,7 @@
2
  "name": "reader",
3
  "scripts": {
4
  "lint": "eslint --ext .js,.ts .",
5
- "build": "tsc -p .",
6
  "build:watch": "tsc --watch",
7
  "build:clean": "rm -rf ./build",
8
  "shell": "npm run build && firebase functions:shell",
@@ -44,6 +44,7 @@
44
  "htmlparser2": "^9.0.0",
45
  "jose": "^5.1.0",
46
  "langdetect": "^0.2.1",
 
47
  "minio": "^7.1.3",
48
  "openai": "^4.20.0",
49
  "puppeteer": "^22.7.1",
 
2
  "name": "reader",
3
  "scripts": {
4
  "lint": "eslint --ext .js,.ts .",
5
+ "build": "node ./integrity-check.cjs && tsc -p .",
6
  "build:watch": "tsc --watch",
7
  "build:clean": "rm -rf ./build",
8
  "shell": "npm run build && firebase functions:shell",
 
44
  "htmlparser2": "^9.0.0",
45
  "jose": "^5.1.0",
46
  "langdetect": "^0.2.1",
47
+ "maxmind": "^4.3.18",
48
  "minio": "^7.1.3",
49
  "openai": "^4.20.0",
50
  "puppeteer": "^22.7.1",
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -2,7 +2,7 @@ import {
2
  assignTransferProtocolMeta, marshalErrorLike,
3
  RPCHost, RPCReflection,
4
  HashManager,
5
- AssertionFailureError, ParamValidationError,
6
  } from 'civkit';
7
  import { singleton } from 'tsyringe';
8
  import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
@@ -34,6 +34,12 @@ export class CrawlerHost extends RPCHost {
34
  cacheValidMs = 1000 * 300;
35
  urlValidMs = 1000 * 3600 * 4;
36
 
 
 
 
 
 
 
37
  constructor(
38
  protected globalLogger: Logger,
39
  protected puppeteerControl: PuppeteerControl,
@@ -357,10 +363,7 @@ ${this.content}
357
  [Balance left] ${latestUser.wallet.total_balance}
358
  ` : '';
359
 
360
- return assignTransferProtocolMeta(`[Usage] https://r.jina.ai/YOUR_URL
361
- [Homepage] https://jina.ai/reader
362
- [Source code] https://github.com/jina-ai/reader
363
- ${authMixin}`,
364
  { contentType: 'text/plain', envelope: null }
365
  );
366
  }
@@ -638,13 +641,13 @@ ${authMixin}`,
638
  return r;
639
  }
640
 
641
- async *cachedScrap(urlToCrawl: URL, crawlOpts: ScrappingOptions, noCache: boolean = false) {
642
  let cache;
643
- if (!noCache && !crawlOpts.cookies?.length) {
644
  cache = await this.queryCache(urlToCrawl);
645
  }
646
 
647
- if (cache?.isFresh && (!crawlOpts.favorScreenshot || (crawlOpts.favorScreenshot && cache?.screenshotAvailable))) {
648
  yield cache.snapshot;
649
 
650
  return;
@@ -683,4 +686,47 @@ ${authMixin}`,
683
  return undefined;
684
  }
685
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
686
  }
 
2
  assignTransferProtocolMeta, marshalErrorLike,
3
  RPCHost, RPCReflection,
4
  HashManager,
5
+ AssertionFailureError, ParamValidationError, Defer,
6
  } from 'civkit';
7
  import { singleton } from 'tsyringe';
8
  import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
 
34
  cacheValidMs = 1000 * 300;
35
  urlValidMs = 1000 * 3600 * 4;
36
 
37
+ indexText = `[Usage1] https://r.jina.ai/YOUR_URL
38
+ [Usage2] https://s.jina.ai/YOUR_SEARCH_QUERY
39
+ [Homepage] https://jina.ai/reader
40
+ [Source code] https://github.com/jina-ai/reader
41
+ `;
42
+
43
  constructor(
44
  protected globalLogger: Logger,
45
  protected puppeteerControl: PuppeteerControl,
 
363
  [Balance left] ${latestUser.wallet.total_balance}
364
  ` : '';
365
 
366
+ return assignTransferProtocolMeta(`${this.indexText}${authMixin}`,
 
 
 
367
  { contentType: 'text/plain', envelope: null }
368
  );
369
  }
 
641
  return r;
642
  }
643
 
644
+ async *cachedScrap(urlToCrawl: URL, crawlOpts?: ScrappingOptions, noCache: boolean = false) {
645
  let cache;
646
+ if (!noCache && !crawlOpts?.cookies?.length) {
647
  cache = await this.queryCache(urlToCrawl);
648
  }
649
 
650
+ if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
651
  yield cache.snapshot;
652
 
653
  return;
 
686
  return undefined;
687
  }
688
 
689
+
690
+ async *scrapMany(urls: URL[], options?: ScrappingOptions, noCache = false) {
691
+ const iterators = urls.map((url) => this.cachedScrap(url, options, noCache));
692
+
693
+ const results: (PageSnapshot | undefined)[] = iterators.map((_x)=> undefined);
694
+
695
+ let nextDeferred = Defer();
696
+ let concluded = false;
697
+
698
+ const handler = async (it: AsyncGenerator<PageSnapshot | undefined>, idx: number) => {
699
+ for await (const x of it) {
700
+ results[idx] = x;
701
+
702
+ if (x) {
703
+ nextDeferred.resolve();
704
+ nextDeferred = Defer();
705
+ }
706
+
707
+ }
708
+ };
709
+
710
+ Promise.all(
711
+ iterators.map((it, idx) => handler(it, idx))
712
+ ).finally(() => {
713
+ concluded = true;
714
+ nextDeferred.resolve();
715
+ });
716
+
717
+ yield results;
718
+
719
+ try {
720
+ while (!concluded) {
721
+ await nextDeferred.promise;
722
+
723
+ yield results;
724
+ }
725
+ } finally {
726
+ for (const x of iterators) {
727
+ x.return();
728
+ }
729
+ }
730
+ }
731
+
732
  }
backend/functions/src/cloud-functions/searcher.ts ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import {
2
+ assignTransferProtocolMeta, marshalErrorLike,
3
+ RPCHost, RPCReflection,
4
+ AssertionFailureError,
5
+ objHashMd5B64Of,
6
+ } from 'civkit';
7
+ import { singleton } from 'tsyringe';
8
+ import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
9
+ import { RateLimitControl } from '../shared/services/rate-limit';
10
+ import _ from 'lodash';
11
+ import { ScrappingOptions } from '../services/puppeteer';
12
+ import { Request, Response } from 'express';
13
+ import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
14
+ import { BraveSearchService } from '../services/brave-search';
15
+ import { CrawlerHost } from './crawler';
16
+ import { CookieParam } from 'puppeteer';
17
+
18
+ import { parseString as parseSetCookieString } from 'set-cookie-parser';
19
+ import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
20
+ import { SearchResult } from '../db/searched';
21
+ import { WebSearchApiResponse } from '../shared/3rd-party/brave-types';
22
+
23
+
24
+ @singleton()
25
+ export class SearcherHost extends RPCHost {
26
+ logger = this.globalLogger.child({ service: this.constructor.name });
27
+
28
+ cacheRetentionMs = 1000 * 3600 * 24 * 7;
29
+ cacheValidMs = 1000 * 3600;
30
+
31
+ constructor(
32
+ protected globalLogger: Logger,
33
+ protected rateLimitControl: RateLimitControl,
34
+ protected threadLocal: AsyncContext,
35
+ protected braveSearchService: BraveSearchService,
36
+ protected crawler: CrawlerHost,
37
+ ) {
38
+ super(...arguments);
39
+ }
40
+
41
+ override async init() {
42
+ await this.dependencyReady();
43
+
44
+ this.emit('ready');
45
+ }
46
+
47
+ @CloudHTTPv2({
48
+ name: 'search2',
49
+ runtime: {
50
+ memory: '4GiB',
51
+ timeoutSeconds: 300,
52
+ concurrency: 4,
53
+ },
54
+ tags: ['Searcher'],
55
+ httpMethod: ['get', 'post'],
56
+ returnType: [String, OutputServerEventStream],
57
+ exposeRoot: true,
58
+ })
59
+ @CloudHTTPv2({
60
+ runtime: {
61
+ memory: '8GiB',
62
+ timeoutSeconds: 300,
63
+ concurrency: 8,
64
+ maxInstances: 200,
65
+ },
66
+ openapi: {
67
+ operation: {
68
+ parameters: {
69
+ 'Accept': {
70
+ description: `Specifies your preference for the response format. \n\n` +
71
+ `Supported formats:\n` +
72
+ `- text/event-stream\n` +
73
+ `- application/json or text/json\n` +
74
+ `- text/plain`
75
+ ,
76
+ in: 'header',
77
+ schema: { type: 'string' }
78
+ },
79
+ 'X-No-Cache': {
80
+ description: `Ignores internal cache if this header is specified with a value.`,
81
+ in: 'header',
82
+ schema: { type: 'string' }
83
+ },
84
+ 'X-Respond-With': {
85
+ description: `Specifies the (non-default) form factor of the crawled data you prefer. \n\n` +
86
+ `Supported formats:\n` +
87
+ `- markdown\n` +
88
+ `- html\n` +
89
+ `- text\n` +
90
+ `- screenshot\n`
91
+ ,
92
+ in: 'header',
93
+ schema: { type: 'string' }
94
+ },
95
+ 'X-Proxy-Url': {
96
+ description: `Specifies your custom proxy if you prefer to use one. \n\n` +
97
+ `Supported protocols:\n` +
98
+ `- http\n` +
99
+ `- https\n` +
100
+ `- socks4\n` +
101
+ `- socks5\n\n` +
102
+ `For authentication, https://user:pass@host:port`,
103
+ in: 'header',
104
+ schema: { type: 'string' }
105
+ },
106
+ 'X-Set-Cookie': {
107
+ description: `Sets cookie(s) to the headless browser for your request. \n\n` +
108
+ `Syntax is the same with standard Set-Cookie`,
109
+ in: 'header',
110
+ schema: { type: 'string' }
111
+ },
112
+ 'X-With-Generated-Alt': {
113
+ description: `Enable automatic alt-text generating for images without an meaningful alt-text.`,
114
+ in: 'header',
115
+ schema: { type: 'string' }
116
+ },
117
+ }
118
+ }
119
+ },
120
+ tags: ['Searcher'],
121
+ httpMethod: ['get', 'post'],
122
+ returnType: [String, OutputServerEventStream],
123
+ exposeRoot: true,
124
+ })
125
+ async search(
126
+ @RPCReflect() rpcReflect: RPCReflection,
127
+ @Ctx() ctx: {
128
+ req: Request,
129
+ res: Response,
130
+ },
131
+ auth: JinaEmbeddingsAuthDTO
132
+ ) {
133
+ const uid = await auth.solveUID();
134
+ let chargeAmount = 0;
135
+ const noSlashPath = ctx.req.url.slice(1);
136
+ if (!noSlashPath) {
137
+ const latestUser = uid ? await auth.assertUser() : undefined;
138
+ const authMixin = latestUser ? `
139
+ [Authenticated as] ${latestUser.user_id} (${latestUser.full_name})
140
+ [Balance left] ${latestUser.wallet.total_balance}
141
+ ` : '';
142
+
143
+ return assignTransferProtocolMeta(`${this.crawler.indexText}${authMixin}`,
144
+ { contentType: 'text/plain', envelope: null }
145
+ );
146
+ }
147
+
148
+ if (uid) {
149
+ const user = await auth.assertUser();
150
+ if (!(user.wallet.total_balance > 0)) {
151
+ throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
152
+ }
153
+
154
+ await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'],
155
+ [
156
+ // 1000 requests per minute
157
+ new Date(Date.now() - 60 * 1000), 1000
158
+ ]
159
+ );
160
+
161
+ rpcReflect.finally(() => {
162
+ if (chargeAmount) {
163
+ auth.reportUsage(chargeAmount, 'reader-crawl').catch((err) => {
164
+ this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
165
+ });
166
+ }
167
+ });
168
+ } else if (ctx.req.ip) {
169
+ this.threadLocal.set('ip', ctx.req.ip);
170
+ await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'],
171
+ [
172
+ // 100 requests per minute
173
+ new Date(Date.now() - 60 * 1000), 100
174
+ ]
175
+ );
176
+ }
177
+
178
+ const customMode = ctx.req.get('x-respond-with') || 'default';
179
+ const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
180
+ const noCache = Boolean(ctx.req.get('x-no-cache'));
181
+ const cookies: CookieParam[] = [];
182
+ const setCookieHeaders = ctx.req.headers['x-set-cookie'];
183
+ if (Array.isArray(setCookieHeaders)) {
184
+ for (const setCookie of setCookieHeaders) {
185
+ cookies.push({
186
+ ...parseSetCookieString(setCookie, { decodeValues: false }) as CookieParam,
187
+ });
188
+ }
189
+ } else if (setCookieHeaders) {
190
+ cookies.push({
191
+ ...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
192
+ });
193
+ }
194
+ this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
195
+ const crawlOpts: ScrappingOptions = {
196
+ proxyUrl: ctx.req.get('x-proxy-url'),
197
+ cookies,
198
+ favorScreenshot: customMode === 'screenshot'
199
+ };
200
+
201
+ const searchQuery = noSlashPath;
202
+ const r = await this.cachedWebSearch({
203
+ q: searchQuery,
204
+ count: 5
205
+ });
206
+
207
+ const urls = r.web.results.map((x) => new URL(x.url));
208
+ const it = this.fetchSearchResults(customMode, urls, crawlOpts, noCache);
209
+
210
+ if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
211
+ const sseStream = new OutputServerEventStream();
212
+ rpcReflect.return(sseStream);
213
+
214
+ try {
215
+ for await (const scrapped of it) {
216
+ if (!scrapped) {
217
+ continue;
218
+ }
219
+
220
+ chargeAmount = this.getChargeAmount(scrapped);
221
+ sseStream.write({
222
+ event: 'data',
223
+ data: scrapped,
224
+ });
225
+ }
226
+ } catch (err: any) {
227
+ this.logger.error(`Failed to collect search result for query ${searchQuery}`,
228
+ { err: marshalErrorLike(err) }
229
+ );
230
+ sseStream.write({
231
+ event: 'error',
232
+ data: marshalErrorLike(err),
233
+ });
234
+ }
235
+
236
+ sseStream.end();
237
+
238
+ return sseStream;
239
+ }
240
+
241
+ let lastScrapped;
242
+ if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
243
+ for await (const scrapped of it) {
244
+ lastScrapped = scrapped;
245
+
246
+ if (!this.qualified(scrapped)) {
247
+ continue;
248
+ }
249
+
250
+ chargeAmount = this.getChargeAmount(scrapped);
251
+
252
+ return scrapped;
253
+ }
254
+
255
+ if (!lastScrapped) {
256
+ throw new AssertionFailureError(`No content available for query ${searchQuery}`);
257
+ }
258
+
259
+ chargeAmount = this.getChargeAmount(lastScrapped);
260
+
261
+ return lastScrapped;
262
+ }
263
+
264
+ for await (const scrapped of it) {
265
+ lastScrapped = scrapped;
266
+
267
+ if (!this.qualified(scrapped)) {
268
+ continue;
269
+ }
270
+ chargeAmount = this.getChargeAmount(scrapped);
271
+
272
+ return assignTransferProtocolMeta(`${scrapped}`, { contentType: 'text/plain', envelope: null });
273
+ }
274
+
275
+ if (!lastScrapped) {
276
+ throw new AssertionFailureError(`No content available for query ${searchQuery}`);
277
+ }
278
+
279
+ chargeAmount = this.getChargeAmount(lastScrapped);
280
+
281
+ return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
282
+ }
283
+
284
+ async *fetchSearchResults(mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
285
+ urls: URL[], options?: ScrappingOptions, noCache = false) {
286
+
287
+ for await (const scrapped of this.crawler.scrapMany(urls, options, noCache)) {
288
+ const mapped = scrapped.map((x, i) => {
289
+ if (!x) {
290
+ const p = {
291
+ toString() {
292
+ return `[${i + 1}] No content available for ${urls[i]}`;
293
+ }
294
+ };
295
+ const r = Object.create(p);
296
+ r.url = urls[i].toString();
297
+
298
+ return r;
299
+ }
300
+ return this.crawler.formatSnapshot(mode, x, urls[i]);
301
+ });
302
+
303
+ const resultArray = await Promise.all(mapped);
304
+ for (const [i, result] of resultArray.entries()) {
305
+ if (result && typeof result === 'object' && Object.hasOwn(result, 'toString')) {
306
+ result.toString = function (this: any) {
307
+ const mixins = [];
308
+ if (this.publishedTime) {
309
+ mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
310
+ }
311
+
312
+ if (mode === 'markdown') {
313
+ return `[${i + 1}]\n${this.content}`;
314
+ }
315
+
316
+ return `[${i + 1}] Title: ${this.title}
317
+ [${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
318
+ [${i + 1}] Markdown Content:
319
+ ${this.content}
320
+ `;
321
+ };
322
+ }
323
+ }
324
+ resultArray.toString = function () {
325
+ return this.map((x, i) => x ? x.toString() : `[${i + 1}] No content available for ${urls[i]}`).join('\n\n').trimEnd() + '\n';
326
+ };
327
+
328
+ yield resultArray;
329
+ }
330
+ }
331
+
332
+ getChargeAmount(formatted: any[]) {
333
+ return _.sum(
334
+ formatted.map((x) => this.crawler.getChargeAmount(x) || 0)
335
+ );
336
+ }
337
+
338
+ qualified(scrapped: any[]) {
339
+ return _.every(scrapped, (x) =>
340
+ (x as any)?.title &&
341
+ (
342
+ (x as any).content ||
343
+ (x as any).screenShotUrl ||
344
+ (x as any).screenshot ||
345
+ (x as any).text ||
346
+ (x as any).html
347
+ )
348
+ );
349
+ }
350
+
351
+ async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) {
352
+ const queryDigest = objHashMd5B64Of(query);
353
+ let cache;
354
+ if (!noCache) {
355
+ cache = (await SearchResult.fromFirestoreQuery(
356
+ SearchResult.COLLECTION.where('queryDigest', '==', queryDigest)
357
+ .orderBy('createdAt', 'desc')
358
+ .limit(1)
359
+ ))[0];
360
+ if (cache) {
361
+ const age = Date.now() - cache.createdAt.valueOf();
362
+ const stale = cache.createdAt.valueOf() < (Date.now() - this.cacheValidMs);
363
+ this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for search query "${query.q}", normalized digest: ${queryDigest}, ${age}ms old`, {
364
+ query, digest: queryDigest, age, stale
365
+ });
366
+
367
+ if (!stale) {
368
+ return cache.response as WebSearchApiResponse;
369
+ }
370
+ }
371
+ }
372
+
373
+ const r = await this.braveSearchService.webSearch(query);
374
+
375
+ const nowDate = new Date();
376
+ const record = SearchResult.from({
377
+ query,
378
+ queryDigest,
379
+ response: r,
380
+ createdAt: nowDate,
381
+ expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
382
+ });
383
+ SearchResult.save(record).catch((err) => {
384
+ this.logger.warn(`Failed to cache search result`, { err });
385
+ });
386
+
387
+ return r;
388
+ }
389
+ }
backend/functions/src/db/searched.ts ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Also, parseJSONText, Prop } from 'civkit';
2
+ import { FirestoreRecord } from '../shared/lib/firestore';
3
+ import _ from 'lodash';
4
+
5
+ @Also({
6
+ dictOf: Object
7
+ })
8
+ export class SearchResult extends FirestoreRecord {
9
+ static override collectionName = 'searchResults';
10
+
11
+ override _id!: string;
12
+
13
+ @Prop({
14
+ required: true
15
+ })
16
+ query!: any;
17
+
18
+ @Prop({
19
+ required: true
20
+ })
21
+ queryDigest!: string;
22
+
23
+ @Prop()
24
+ response?: any;
25
+
26
+ @Prop()
27
+ createdAt!: Date;
28
+
29
+ @Prop()
30
+ expireAt?: Date;
31
+
32
+ [k: string]: any;
33
+
34
+ static patchedFields = [
35
+ 'query',
36
+ 'response',
37
+ ];
38
+
39
+ static override from(input: any) {
40
+ for (const field of this.patchedFields) {
41
+ if (typeof input[field] === 'string') {
42
+ input[field] = parseJSONText(input[field]);
43
+ }
44
+ }
45
+
46
+ return super.from(input) as SearchResult;
47
+ }
48
+
49
+ override degradeForFireStore() {
50
+ const copy: any = { ...this };
51
+
52
+ for (const field of (this.constructor as typeof SearchResult).patchedFields) {
53
+ if (typeof copy[field] === 'object') {
54
+ copy[field] = JSON.stringify(copy[field]) as any;
55
+ }
56
+ }
57
+
58
+ return copy;
59
+ }
60
+ }
backend/functions/src/services/brave-search.ts ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { AsyncService, DownstreamServiceFailureError } from 'civkit';
2
+ import { singleton } from 'tsyringe';
3
+ import { Logger } from '../shared/services/logger';
4
+ import { SecretExposer } from '../shared/services/secrets';
5
+ import { BraveSearchHTTP, WebSearchQueryParams } from '../shared/3rd-party/brave-search';
6
+ import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip';
7
+ import { AsyncContext } from '../shared';
8
+ import { WebSearchOptionalHeaderOptions } from '../shared/3rd-party/brave-types';
9
+
10
+ @singleton()
11
+ export class BraveSearchService extends AsyncService {
12
+
13
+ logger = this.globalLogger.child({ service: this.constructor.name });
14
+
15
+ braveSearchHTTP!: BraveSearchHTTP;
16
+
17
+ constructor(
18
+ protected globalLogger: Logger,
19
+ protected secretExposer: SecretExposer,
20
+ protected geoipControl: GeoIPService,
21
+ protected threadLocal: AsyncContext,
22
+ ) {
23
+ super(...arguments);
24
+ }
25
+
26
+ override async init() {
27
+ await this.dependencyReady();
28
+ this.emit('ready');
29
+
30
+ this.braveSearchHTTP = new BraveSearchHTTP(this.secretExposer.BRAVE_SEARCH_API_KEY);
31
+ }
32
+
33
+ async webSearch(query: WebSearchQueryParams) {
34
+ const ip = this.threadLocal.get('ip');
35
+ const extraHeaders: WebSearchOptionalHeaderOptions = {};
36
+ if (ip) {
37
+ const geoip = await this.geoipControl.lookupCity(ip, GEOIP_SUPPORTED_LANGUAGES.EN);
38
+
39
+ if (geoip?.city) {
40
+ extraHeaders['X-Loc-City'] = geoip.city;
41
+ }
42
+ if (geoip?.country) {
43
+ extraHeaders['X-Loc-Country'] = geoip.country.code;
44
+ }
45
+ if (geoip?.timezone) {
46
+ extraHeaders['X-Loc-Timezone'] = geoip.timezone;
47
+ }
48
+ if (geoip?.coordinates) {
49
+ extraHeaders['X-Loc-Lat'] = `${geoip.coordinates[0]}`;
50
+ extraHeaders['X-Loc-Long'] = `${geoip.coordinates[1]}`;
51
+ }
52
+ if (geoip?.subdivisions?.length) {
53
+ extraHeaders['X-Loc-State'] = geoip.subdivisions[0].code;
54
+ extraHeaders['X-Loc-State-Name'] = geoip.subdivisions[0].name;
55
+ }
56
+ }
57
+ if (this.threadLocal.get('userAgent')) {
58
+ extraHeaders['User-Agent'] = this.threadLocal.get('userAgent');
59
+ }
60
+
61
+ try {
62
+ const r = await this.braveSearchHTTP.webSearch(query, { headers: extraHeaders as Record<string, string> });
63
+
64
+ return r.parsed;
65
+ } catch (err) {
66
+ throw new DownstreamServiceFailureError({ message: `Search failed`, cause: err });
67
+ }
68
+
69
+ }
70
+
71
+ }
backend/functions/src/services/geoip.ts ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { container, singleton } from 'tsyringe';
2
+ import fsp from 'fs/promises';
3
+ import { CityResponse, Reader } from 'maxmind';
4
+ import { AsyncService, AutoCastable, Prop, runOnce } from 'civkit';
5
+ import { Logger } from '../shared';
6
+ import path from 'path';
7
+
8
+ export enum GEOIP_SUPPORTED_LANGUAGES {
9
+ EN = 'en',
10
+ ZH_CN = 'zh-CN',
11
+ JA = 'ja',
12
+ DE = 'de',
13
+ FR = 'fr',
14
+ ES = 'es',
15
+ PT_BR = 'pt-BR',
16
+ RU = 'ru',
17
+ }
18
+
19
+ export class GeoIPInfo extends AutoCastable {
20
+ @Prop()
21
+ code?: string;
22
+
23
+ @Prop()
24
+ name?: string;
25
+ }
26
+
27
+ export class GeoIPCountryInfo extends GeoIPInfo {
28
+ @Prop()
29
+ eu?: boolean;
30
+ }
31
+
32
+ export class GeoIPCityResponse extends AutoCastable {
33
+ @Prop()
34
+ continent?: GeoIPInfo;
35
+
36
+ @Prop()
37
+ country?: GeoIPCountryInfo;
38
+
39
+ @Prop({
40
+ arrayOf: GeoIPInfo
41
+ })
42
+ subdivisions?: GeoIPInfo[];
43
+
44
+ @Prop()
45
+ city?: string;
46
+
47
+ @Prop({
48
+ arrayOf: Number
49
+ })
50
+ coordinates?: [number, number, number];
51
+
52
+ @Prop()
53
+ timezone?: string;
54
+ }
55
+
56
+ @singleton()
57
+ export class GeoIPService extends AsyncService {
58
+
59
+ logger = this.globalLogger.child({ service: this.constructor.name });
60
+
61
+ mmdbCity!: Reader<CityResponse>;
62
+
63
+ constructor(
64
+ protected globalLogger: Logger,
65
+ ) {
66
+ super(...arguments);
67
+ }
68
+
69
+
70
+ override async init() {
71
+ await this.dependencyReady();
72
+
73
+ this.emit('ready');
74
+ }
75
+
76
+ @runOnce()
77
+ async _lazyload() {
78
+ const mmdpPath = path.resolve(__dirname, '..', '..', 'licensed', 'GeoLite2-City.mmdb');
79
+
80
+ const dbBuff = await fsp.readFile(mmdpPath, { flag: 'r', encoding: null });
81
+
82
+ this.mmdbCity = new Reader<CityResponse>(dbBuff);
83
+
84
+ this.logger.info(`Loaded GeoIP database, ${dbBuff.byteLength} bytes`);
85
+ }
86
+
87
+
88
+ async lookupCity(ip: string, lang: GEOIP_SUPPORTED_LANGUAGES = GEOIP_SUPPORTED_LANGUAGES.EN) {
89
+ await this._lazyload();
90
+
91
+ const r = this.mmdbCity.get(ip);
92
+
93
+ if (!r) {
94
+ return undefined;
95
+ }
96
+
97
+ return GeoIPCityResponse.from({
98
+ continent: r.continent ? {
99
+ code: r.continent?.code,
100
+ name: r.continent?.names?.[lang] || r.continent?.names?.en,
101
+ } : undefined,
102
+ country: r.country ? {
103
+ code: r.country?.iso_code,
104
+ name: r.country?.names?.[lang] || r.country?.names.en,
105
+ eu: r.country?.is_in_european_union,
106
+ } : undefined,
107
+ city: r.city?.names?.[lang] || r.city?.names?.en,
108
+ subdivisions: r.subdivisions?.map((x) => ({
109
+ code: x.iso_code,
110
+ name: x.names?.[lang] || x.names?.en,
111
+ })),
112
+ coordinates: r.location ? [
113
+ r.location.latitude, r.location.longitude, r.location.accuracy_radius
114
+ ] : undefined,
115
+ timezone: r.location?.time_zone,
116
+ });
117
+ }
118
+
119
+ }
120
+
121
+ const instance = container.resolve(GeoIPService);
122
+
123
+ export default instance;
backend/functions/src/services/puppeteer.ts CHANGED
@@ -278,7 +278,7 @@ document.addEventListener('load', handlePageLoad);
278
  return page;
279
  }
280
 
281
- async *scrap(parsedUrl: URL, options: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
282
  // parsedUrl.search = '';
283
  const url = parsedUrl.toString();
284
 
@@ -287,10 +287,10 @@ document.addEventListener('load', handlePageLoad);
287
  let screenshot: Buffer | undefined;
288
 
289
  const page = await this.pagePool.acquire();
290
- if (options.proxyUrl) {
291
  await page.useProxy(options.proxyUrl);
292
  }
293
- if (options.cookies) {
294
  await page.setCookie(...options.cookies);
295
  }
296
 
@@ -353,7 +353,7 @@ document.addEventListener('load', handlePageLoad);
353
  yield { ...snapshot, screenshot } as PageSnapshot;
354
  break;
355
  }
356
- if (options.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
357
  screenshot = await page.screenshot();
358
  lastHTML = snapshot.html;
359
  }
 
278
  return page;
279
  }
280
 
281
+ async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
282
  // parsedUrl.search = '';
283
  const url = parsedUrl.toString();
284
 
 
287
  let screenshot: Buffer | undefined;
288
 
289
  const page = await this.pagePool.acquire();
290
+ if (options?.proxyUrl) {
291
  await page.useProxy(options.proxyUrl);
292
  }
293
+ if (options?.cookies) {
294
  await page.setCookie(...options.cookies);
295
  }
296
 
 
353
  yield { ...snapshot, screenshot } as PageSnapshot;
354
  break;
355
  }
356
+ if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
357
  screenshot = await page.screenshot();
358
  lastHTML = snapshot.html;
359
  }
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 584791b789cd483dab18735416744b4d10130993
 
1
+ Subproject commit 2f2cdcff7b2738be33ee5aca858ef2d65eba29ed