mapleeit committed on
Commit
e27bcac
·
unverified ·
1 Parent(s): f8bc487

feat: add adaptive crawler (#112)

Browse files
backend/functions/package-lock.json CHANGED
@@ -35,13 +35,15 @@
35
  "puppeteer-extra-plugin-page-proxy": "^2.0.0",
36
  "puppeteer-extra-plugin-stealth": "^2.11.2",
37
  "puppeteer-page-proxy": "^1.3.0",
 
38
  "set-cookie-parser": "^2.6.0",
39
  "stripe": "^11.11.0",
40
  "tiktoken": "^1.0.10",
41
  "tld-extract": "^2.1.0",
42
  "turndown": "^7.1.3",
43
  "turndown-plugin-gfm": "^1.0.2",
44
- "undici": "^5.24.0"
 
45
  },
46
  "devDependencies": {
47
  "@types/archiver": "^5.3.4",
@@ -50,6 +52,7 @@
50
  "@types/generic-pool": "^3.8.1",
51
  "@types/node": "^20.14.13",
52
  "@types/set-cookie-parser": "^2.4.7",
 
53
  "@typescript-eslint/eslint-plugin": "^5.12.0",
54
  "@typescript-eslint/parser": "^5.12.0",
55
  "eslint": "^8.9.0",
@@ -2431,6 +2434,13 @@
2431
  "@types/webidl-conversions": "*"
2432
  }
2433
  },
 
 
 
 
 
 
 
2434
  "node_modules/@types/yargs": {
2435
  "version": "17.0.32",
2436
  "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-17.0.32.tgz",
@@ -10322,6 +10332,15 @@
10322
  "url": "https://github.com/sponsors/isaacs"
10323
  }
10324
  },
 
 
 
 
 
 
 
 
 
10325
  "node_modules/run-parallel": {
10326
  "version": "1.2.0",
10327
  "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz",
@@ -11928,6 +11947,15 @@
11928
  "node": ">=4.0"
11929
  }
11930
  },
 
 
 
 
 
 
 
 
 
11931
  "node_modules/y18n": {
11932
  "version": "5.0.8",
11933
  "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
 
35
  "puppeteer-extra-plugin-page-proxy": "^2.0.0",
36
  "puppeteer-extra-plugin-stealth": "^2.11.2",
37
  "puppeteer-page-proxy": "^1.3.0",
38
+ "robots-parser": "^3.0.1",
39
  "set-cookie-parser": "^2.6.0",
40
  "stripe": "^11.11.0",
41
  "tiktoken": "^1.0.10",
42
  "tld-extract": "^2.1.0",
43
  "turndown": "^7.1.3",
44
  "turndown-plugin-gfm": "^1.0.2",
45
+ "undici": "^5.24.0",
46
+ "xmldom": "^0.6.0"
47
  },
48
  "devDependencies": {
49
  "@types/archiver": "^5.3.4",
 
52
  "@types/generic-pool": "^3.8.1",
53
  "@types/node": "^20.14.13",
54
  "@types/set-cookie-parser": "^2.4.7",
55
+ "@types/xmldom": "^0.1.34",
56
  "@typescript-eslint/eslint-plugin": "^5.12.0",
57
  "@typescript-eslint/parser": "^5.12.0",
58
  "eslint": "^8.9.0",
 
2434
  "@types/webidl-conversions": "*"
2435
  }
2436
  },
2437
+ "node_modules/@types/xmldom": {
2438
+ "version": "0.1.34",
2439
+ "resolved": "https://registry.npmjs.org/@types/xmldom/-/xmldom-0.1.34.tgz",
2440
+ "integrity": "sha512-7eZFfxI9XHYjJJuugddV6N5YNeXgQE1lArWOcd1eCOKWb/FGs5SIjacSYuEJuwhsGS3gy4RuZ5EUIcqYscuPDA==",
2441
+ "dev": true,
2442
+ "license": "MIT"
2443
+ },
2444
  "node_modules/@types/yargs": {
2445
  "version": "17.0.32",
2446
  "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-17.0.32.tgz",
 
10332
  "url": "https://github.com/sponsors/isaacs"
10333
  }
10334
  },
10335
+ "node_modules/robots-parser": {
10336
+ "version": "3.0.1",
10337
+ "resolved": "https://registry.npmjs.org/robots-parser/-/robots-parser-3.0.1.tgz",
10338
+ "integrity": "sha512-s+pyvQeIKIZ0dx5iJiQk1tPLJAWln39+MI5jtM8wnyws+G5azk+dMnMX0qfbqNetKKNgcWWOdi0sfm+FbQbgdQ==",
10339
+ "license": "MIT",
10340
+ "engines": {
10341
+ "node": ">=10.0.0"
10342
+ }
10343
+ },
10344
  "node_modules/run-parallel": {
10345
  "version": "1.2.0",
10346
  "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz",
 
11947
  "node": ">=4.0"
11948
  }
11949
  },
11950
+ "node_modules/xmldom": {
11951
+ "version": "0.6.0",
11952
+ "resolved": "https://registry.npmjs.org/xmldom/-/xmldom-0.6.0.tgz",
11953
+ "integrity": "sha512-iAcin401y58LckRZ0TkI4k0VSM1Qg0KGSc3i8rU+xrxe19A/BN1zHyVSJY7uoutVlaTSzYyk/v5AmkewAP7jtg==",
11954
+ "license": "MIT",
11955
+ "engines": {
11956
+ "node": ">=10.0.0"
11957
+ }
11958
+ },
11959
  "node_modules/y18n": {
11960
  "version": "5.0.8",
11961
  "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
backend/functions/package.json CHANGED
@@ -55,13 +55,15 @@
55
  "puppeteer-extra-plugin-page-proxy": "^2.0.0",
56
  "puppeteer-extra-plugin-stealth": "^2.11.2",
57
  "puppeteer-page-proxy": "^1.3.0",
 
58
  "set-cookie-parser": "^2.6.0",
59
  "stripe": "^11.11.0",
60
  "tiktoken": "^1.0.10",
61
  "tld-extract": "^2.1.0",
62
  "turndown": "^7.1.3",
63
  "turndown-plugin-gfm": "^1.0.2",
64
- "undici": "^5.24.0"
 
65
  },
66
  "devDependencies": {
67
  "@types/archiver": "^5.3.4",
@@ -70,6 +72,7 @@
70
  "@types/generic-pool": "^3.8.1",
71
  "@types/node": "^20.14.13",
72
  "@types/set-cookie-parser": "^2.4.7",
 
73
  "@typescript-eslint/eslint-plugin": "^5.12.0",
74
  "@typescript-eslint/parser": "^5.12.0",
75
  "eslint": "^8.9.0",
 
55
  "puppeteer-extra-plugin-page-proxy": "^2.0.0",
56
  "puppeteer-extra-plugin-stealth": "^2.11.2",
57
  "puppeteer-page-proxy": "^1.3.0",
58
+ "robots-parser": "^3.0.1",
59
  "set-cookie-parser": "^2.6.0",
60
  "stripe": "^11.11.0",
61
  "tiktoken": "^1.0.10",
62
  "tld-extract": "^2.1.0",
63
  "turndown": "^7.1.3",
64
  "turndown-plugin-gfm": "^1.0.2",
65
+ "undici": "^5.24.0",
66
+ "xmldom": "^0.6.0"
67
  },
68
  "devDependencies": {
69
  "@types/archiver": "^5.3.4",
 
72
  "@types/generic-pool": "^3.8.1",
73
  "@types/node": "^20.14.13",
74
  "@types/set-cookie-parser": "^2.4.7",
75
+ "@types/xmldom": "^0.1.34",
76
  "@typescript-eslint/eslint-plugin": "^5.12.0",
77
  "@typescript-eslint/parser": "^5.12.0",
78
  "eslint": "^8.9.0",
backend/functions/src/cloud-functions/adaptive-crawler.ts ADDED
@@ -0,0 +1,553 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import {
2
+ assignTransferProtocolMeta,
3
+ HashManager,
4
+ RPCHost, RPCReflection,
5
+ } from 'civkit';
6
+ import { singleton } from 'tsyringe';
7
+ import { CloudHTTPv2, CloudTaskV2, Ctx, FirebaseStorageBucketControl, Logger, Param, RPCReflect } from '../shared';
8
+ import _ from 'lodash';
9
+ import { Request, Response } from 'express';
10
+ import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
11
+ import robotsParser from 'robots-parser';
12
+ import { DOMParser } from 'xmldom';
13
+
14
+ import { AdaptiveCrawlerOptions } from '../dto/adaptive-crawler-options';
15
+ import { CrawlerOptions } from '../dto/scrapping-options';
16
+ import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
17
+ import { AdaptiveCrawlTask, AdaptiveCrawlTaskStatus } from '../db/adaptive-crawl-task';
18
+ import { getFunctions } from 'firebase-admin/functions';
19
+ import { getFunctionUrl } from '../utils/get-function-url';
20
+ import { Timestamp } from 'firebase-admin/firestore';
21
+
// Shared MD5 hasher (hex digests); used for task ids and cache-path keys below.
const md5Hasher = new HashManager('md5', 'hex');
23
+ const removeURLHash = (url: string) => {
24
+ const o = new URL(url);
25
+ o.hash = '';
26
+ return o.toString();
27
+ }
28
+
29
+ @singleton()
30
+ export class AdaptiveCrawlerHost extends RPCHost {
31
+ logger = this.globalLogger.child({ service: this.constructor.name });
32
+
33
+ static readonly __singleCrawlQueueName = 'singleCrawlQueue';
34
+
35
+ constructor(
36
+ protected globalLogger: Logger,
37
+ protected firebaseObjectStorage: FirebaseStorageBucketControl,
38
+ ) {
39
+ super(...arguments);
40
+ }
41
+
42
+ override async init() {
43
+ await this.dependencyReady();
44
+
45
+ this.emit('ready');
46
+ }
47
+
48
+ @CloudHTTPv2({
49
+ runtime: {
50
+ memory: '1GiB',
51
+ timeoutSeconds: 300,
52
+ concurrency: 22,
53
+ },
54
+ tags: ['Crawler'],
55
+ httpMethod: ['post', 'get'],
56
+ returnType: [String],
57
+ })
58
+ async adaptiveCrawl(
59
+ @RPCReflect() rpcReflect: RPCReflection,
60
+ @Ctx() ctx: {
61
+ req: Request,
62
+ res: Response,
63
+ },
64
+ auth: JinaEmbeddingsAuthDTO,
65
+ crawlerOptions: CrawlerOptions,
66
+ adaptiveCrawlerOptions: AdaptiveCrawlerOptions,
67
+ ) {
68
+ this.logger.debug({
69
+ adaptiveCrawlerOptions,
70
+ crawlerOptions,
71
+ });
72
+
73
+
74
+ const uid = await auth.solveUID();
75
+ const { useSitemap, maxPages } = adaptiveCrawlerOptions;
76
+
77
+ let tmpUrl = ctx.req.url.slice(1)?.trim();
78
+ if (!tmpUrl) {
79
+ tmpUrl = crawlerOptions.url?.trim() ?? '';
80
+ }
81
+ const targetUrl = new URL(tmpUrl);
82
+
83
+ if (!targetUrl) {
84
+ const latestUser = uid ? await auth.assertUser() : undefined;
85
+ if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
86
+ return this.getIndex(latestUser);
87
+ }
88
+
89
+ return assignTransferProtocolMeta(`${this.getIndex(latestUser)}`,
90
+ { contentType: 'text/plain', envelope: null }
91
+ );
92
+ }
93
+
94
+ const meta = {
95
+ targetUrl: targetUrl.toString(),
96
+ useSitemap,
97
+ maxPages,
98
+ };
99
+
100
+ const digest = md5Hasher.hash(JSON.stringify(meta));
101
+ const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
102
+ const existing = await AdaptiveCrawlTask.fromFirestore(shortDigest);
103
+
104
+ if (existing) {
105
+ return { taskId: shortDigest };
106
+ }
107
+
108
+ await AdaptiveCrawlTask.COLLECTION.doc(shortDigest).set({
109
+ _id: shortDigest,
110
+ status: AdaptiveCrawlTaskStatus.PENDING,
111
+ statusText: 'Pending',
112
+ meta,
113
+ createdAt: new Date(),
114
+ urls: [],
115
+ processed: {},
116
+ failed: {},
117
+ });
118
+
119
+ if (useSitemap) {
120
+ const urls = await this.crawlUrlsFromSitemap(targetUrl, maxPages);
121
+
122
+ await AdaptiveCrawlTask.COLLECTION.doc(shortDigest).update({
123
+ status: AdaptiveCrawlTaskStatus.PROCESSING,
124
+ statusText: `Processing 0/${urls.length}`,
125
+ urls,
126
+ });
127
+
128
+ const promises = [];
129
+ for (const url of urls) {
130
+ promises.push(getFunctions().taskQueue(AdaptiveCrawlerHost.__singleCrawlQueueName).enqueue({
131
+ shortDigest, url, token: auth.bearerToken, meta
132
+ }, {
133
+ dispatchDeadlineSeconds: 1800,
134
+ uri: await getFunctionUrl(AdaptiveCrawlerHost.__singleCrawlQueueName),
135
+ }));
136
+ };
137
+
138
+ await Promise.all(promises);
139
+ } else {
140
+ await AdaptiveCrawlTask.COLLECTION.doc(shortDigest).update({
141
+ urls: [targetUrl.toString()],
142
+ });
143
+
144
+ await getFunctions().taskQueue(AdaptiveCrawlerHost.__singleCrawlQueueName).enqueue({
145
+ shortDigest, url: targetUrl.toString(), token: auth.bearerToken, meta
146
+ }, {
147
+ dispatchDeadlineSeconds: 1800,
148
+ uri: await getFunctionUrl(AdaptiveCrawlerHost.__singleCrawlQueueName),
149
+ })
150
+ }
151
+
152
+ return { taskId: shortDigest };
153
+ }
154
+
155
+ @CloudHTTPv2({
156
+ runtime: {
157
+ memory: '1GiB',
158
+ timeoutSeconds: 300,
159
+ concurrency: 22,
160
+ },
161
+ tags: ['Crawler'],
162
+ httpMethod: ['post', 'get'],
163
+ returnType: AdaptiveCrawlTask,
164
+ })
165
+ async adaptiveCrawlStatus(
166
+ @RPCReflect() rpcReflect: RPCReflection,
167
+ @Ctx() ctx: {
168
+ req: Request,
169
+ res: Response,
170
+ },
171
+ auth: JinaEmbeddingsAuthDTO,
172
+ @Param('taskId') taskId: string,
173
+ @Param('urls') urls: string[] = [],
174
+ ) {
175
+ if (!taskId) {
176
+ throw new Error('taskId is required');
177
+ }
178
+
179
+ const state = await AdaptiveCrawlTask.fromFirestore(taskId);
180
+
181
+ if (urls.length) {
182
+ const promises = Object.entries(state?.processed ?? {}).map(async ([url, cachePath]) => {
183
+ if (urls.includes(url)) {
184
+ const raw = await this.firebaseObjectStorage.downloadFile(cachePath);
185
+ state!.processed[url] = JSON.parse(raw.toString('utf-8'));
186
+ }
187
+ });
188
+
189
+ await Promise.all(promises);
190
+ }
191
+
192
+
193
+ return state;
194
+ }
195
+
196
+ @CloudTaskV2({
197
+ name: AdaptiveCrawlerHost.__singleCrawlQueueName,
198
+ runtime: {
199
+ cpu: 1,
200
+ memory: '1GiB',
201
+ timeoutSeconds: 3600,
202
+ concurrency: 2,
203
+ maxInstances: 200,
204
+ retryConfig: {
205
+ maxAttempts: 3,
206
+ minBackoffSeconds: 60,
207
+ },
208
+ rateLimits: {
209
+ maxConcurrentDispatches: 150,
210
+ maxDispatchesPerSecond: 5,
211
+ },
212
+ }
213
+ })
214
+ async singleCrawlQueue(
215
+ @Param('shortDigest') shortDigest: string,
216
+ @Param('url') url: string,
217
+ @Param('token') token: string,
218
+ @Param('meta') meta: AdaptiveCrawlTask['meta'],
219
+ ) {
220
+ const error = {
221
+ reason: ''
222
+ };
223
+
224
+ const state = await AdaptiveCrawlTask.fromFirestore(shortDigest);
225
+ if (state?.status === AdaptiveCrawlTaskStatus.COMPLETED) {
226
+ return;
227
+ }
228
+
229
+ try {
230
+ url = removeURLHash(url);
231
+ } catch(e) {
232
+ error.reason = `Failed to parse url: ${url}`;
233
+ }
234
+
235
+ this.logger.debug(shortDigest, url, meta);
236
+ const cachePath = `adaptive-crawl-task/${shortDigest}/${md5Hasher.hash(url)}`;
237
+
238
+ if (!error.reason) {
239
+ const result = meta.useSitemap
240
+ ? await this.handleSingleCrawl(shortDigest, url, token, cachePath)
241
+ : await this.handleSingleCrawlRecursively(shortDigest, url, token, meta, cachePath);
242
+
243
+ if (!result) {
244
+ return;
245
+ }
246
+
247
+ error.reason = result.error.reason;
248
+ }
249
+
250
+ await AdaptiveCrawlTask.DB.runTransaction(async (transaction) => {
251
+ const ref = AdaptiveCrawlTask.COLLECTION.doc(shortDigest);
252
+ const state = await transaction.get(ref);
253
+ const data = state.data() as AdaptiveCrawlTask & { createdAt: Timestamp };
254
+
255
+ if (error.reason) {
256
+ data.failed[url] = error;
257
+ } else {
258
+ data.processed[url] = cachePath;
259
+ }
260
+
261
+ const status = Object.keys(data.processed).length + Object.keys(data.failed).length >= data.urls.length
262
+ ? AdaptiveCrawlTaskStatus.COMPLETED : AdaptiveCrawlTaskStatus.PROCESSING;
263
+ const statusText = Object.keys(data.processed).length + Object.keys(data.failed).length >= data.urls.length
264
+ ? `Completed ${Object.keys(data.processed).length} Succeeded, ${Object.keys(data.failed).length} Failed`
265
+ : `Processing ${Object.keys(data.processed).length + Object.keys(data.failed).length}/${data.urls.length}`;
266
+
267
+ const payload: Partial<AdaptiveCrawlTask> = {
268
+ status,
269
+ statusText,
270
+ processed: data.processed,
271
+ failed: data.failed,
272
+ };
273
+
274
+ if (status === AdaptiveCrawlTaskStatus.COMPLETED) {
275
+ payload.finishedAt = new Date();
276
+ payload.duration = new Date().getTime() - data.createdAt.toDate().getTime();
277
+ }
278
+
279
+ transaction.update(ref, payload);
280
+ });
281
+ }
282
+
283
+ async handleSingleCrawl(shortDigest: string, url: string, token: string, cachePath: string) {
284
+ const error = {
285
+ reason: ''
286
+ }
287
+
288
+ const response = await fetch('https://r.jina.ai', {
289
+ method: 'POST',
290
+ headers: {
291
+ 'Content-Type': 'application/json',
292
+ 'Authorization': `Bearer ${token}`,
293
+ 'Accept': 'application/json',
294
+ },
295
+ body: JSON.stringify({ url })
296
+ })
297
+
298
+ if (!response.ok) {
299
+ error.reason = `Failed to crawl ${url}, ${response.statusText}`;
300
+ } else {
301
+ const json = await response.json();
302
+
303
+ await this.firebaseObjectStorage.saveFile(cachePath,
304
+ Buffer.from(
305
+ JSON.stringify(json),
306
+ 'utf-8'
307
+ ),
308
+ {
309
+ metadata: {
310
+ contentType: 'application/json',
311
+ }
312
+ }
313
+ )
314
+ }
315
+
316
+ return {
317
+ error,
318
+ }
319
+ }
320
+
321
+ async handleSingleCrawlRecursively(
322
+ shortDigest: string, url: string, token: string, meta: AdaptiveCrawlTask['meta'], cachePath: string
323
+ ) {
324
+ const error = {
325
+ reason: ''
326
+ }
327
+ const response = await fetch('https://r.jina.ai', {
328
+ method: 'POST',
329
+ headers: {
330
+ 'Content-Type': 'application/json',
331
+ 'Authorization': `Bearer ${token}`,
332
+ 'Accept': 'application/json',
333
+ 'X-With-Links-Summary': 'true',
334
+ },
335
+ body: JSON.stringify({ url })
336
+ });
337
+
338
+ if (!response.ok) {
339
+ error.reason = `Failed to crawl ${url}, ${response.statusText}`;
340
+ } else {
341
+ const json = await response.json();
342
+ await this.firebaseObjectStorage.saveFile(cachePath,
343
+ Buffer.from(
344
+ JSON.stringify(json),
345
+ 'utf-8'
346
+ ),
347
+ {
348
+ metadata: {
349
+ contentType: 'application/json',
350
+ }
351
+ }
352
+ )
353
+
354
+ const title = json.data.title;
355
+ const description = json.data.description;
356
+ const rerankQuery = `TITLE: ${title}; DESCRIPTION: ${description}`;
357
+ const links = json.data.links as Record<string, string>;
358
+
359
+ const relevantUrls = await this.getRelevantUrls(token, { query: rerankQuery, links });
360
+ this.logger.debug(`Total urls: ${Object.keys(links).length}, relevant urls: ${relevantUrls.length}`);
361
+
362
+ for (const url of relevantUrls) {
363
+ let abortContinue = false;
364
+ let abortBreak = false;
365
+ await AdaptiveCrawlTask.DB.runTransaction(async (transaction) => {
366
+ const ref = AdaptiveCrawlTask.COLLECTION.doc(shortDigest);
367
+ const state = await transaction.get(ref);
368
+ const data = state.data() as AdaptiveCrawlTask & { createdAt: Timestamp };
369
+
370
+ if (data.urls.includes(url)) {
371
+ this.logger.debug('Recursive CONTINUE', data);
372
+ abortContinue = true;
373
+ return;
374
+ }
375
+
376
+ const urls = [
377
+ ...data.urls,
378
+ url
379
+ ];
380
+
381
+ if (urls.length > meta.maxPages || data.status === AdaptiveCrawlTaskStatus.COMPLETED) {
382
+ this.logger.debug('Recursive BREAK', data);
383
+ abortBreak = true;
384
+ return;
385
+ }
386
+
387
+ transaction.update(ref, { urls });
388
+ });
389
+
390
+ if (abortContinue) {
391
+ continue;
392
+ }
393
+ if (abortBreak) {
394
+ break;
395
+ }
396
+
397
+ await getFunctions().taskQueue(AdaptiveCrawlerHost.__singleCrawlQueueName).enqueue({
398
+ shortDigest, url, token, meta
399
+ }, {
400
+ dispatchDeadlineSeconds: 1800,
401
+ uri: await getFunctionUrl(AdaptiveCrawlerHost.__singleCrawlQueueName),
402
+ });
403
+ };
404
+ }
405
+
406
+ return {
407
+ error,
408
+ }
409
+ }
410
+
411
+ async getRelevantUrls(token: string, {
412
+ query, links
413
+ }: {
414
+ query: string;
415
+ links: Record<string, string>;
416
+ }) {
417
+ const data = {
418
+ model: 'jina-reranker-v2-base-multilingual',
419
+ query,
420
+ top_n: 15,
421
+ documents: Object.entries(links).map(([title, link]) => link)
422
+ };
423
+
424
+ const response = await fetch('https://api.jina.ai/v1/rerank', {
425
+ method: 'POST',
426
+ headers: {
427
+ 'Content-Type': 'application/json',
428
+ 'Authorization': `Bearer ${token}`
429
+ },
430
+ body: JSON.stringify(data)
431
+ });
432
+
433
+ const json = (await response.json()) as {
434
+ results: {
435
+ index: number;
436
+ document: {
437
+ text: string;
438
+ };
439
+ relevance_score: number;
440
+ }[];
441
+ };
442
+
443
+ return json.results.filter(r => r.relevance_score > 0.3).map(r => r.document.text);
444
+ }
445
+
446
+ getIndex(user?: JinaEmbeddingsTokenAccount) {
447
+ // TODO: 需要更新使用方式
448
+ // const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
449
+
450
+ // Object.assign(indexObject, {
451
+ // usage1: 'https://r.jina.ai/YOUR_URL',
452
+ // usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY',
453
+ // homepage: 'https://jina.ai/reader',
454
+ // sourceCode: 'https://github.com/jina-ai/reader',
455
+ // });
456
+
457
+ // if (user) {
458
+ // indexObject[''] = undefined;
459
+ // indexObject.authenticatedAs = `${user.user_id} (${user.full_name})`;
460
+ // indexObject.balanceLeft = user.wallet.total_balance;
461
+ // }
462
+
463
+ // return indexObject;
464
+ }
465
+
466
+ async crawlUrlsFromSitemap(url: URL, maxPages: number) {
467
+ const sitemapsFromRobotsTxt = await this.getSitemapsFromRobotsTxt(url);
468
+
469
+ const initialSitemaps: string[] = [];
470
+ if (sitemapsFromRobotsTxt === null) {
471
+ initialSitemaps.push(`${url.origin}/sitemap.xml`);
472
+ } else {
473
+ initialSitemaps.push(...sitemapsFromRobotsTxt);
474
+ }
475
+
476
+
477
+ const allUrls: Set<string> = new Set();
478
+ const processedSitemaps: Set<string> = new Set();
479
+
480
+ const fetchSitemapUrls = async (sitemapUrl: string) => {
481
+ sitemapUrl = sitemapUrl.trim();
482
+
483
+ if (processedSitemaps.has(sitemapUrl)) {
484
+ return;
485
+ }
486
+
487
+ processedSitemaps.add(sitemapUrl);
488
+
489
+ try {
490
+ const response = await fetch(sitemapUrl);
491
+ const sitemapContent = await response.text();
492
+ const parser = new DOMParser();
493
+ const xmlDoc = parser.parseFromString(sitemapContent, 'text/xml');
494
+
495
+ // handle normal sitemap
496
+ const urlElements = xmlDoc.getElementsByTagName('url');
497
+ for (let i = 0; i < urlElements.length; i++) {
498
+ const locElement = urlElements[i].getElementsByTagName('loc')[0];
499
+ if (locElement) {
500
+ const loc = locElement.textContent?.trim() || '';
501
+ if (loc.startsWith(url.origin) && !loc.endsWith('.xml')) {
502
+ allUrls.add(removeURLHash(loc));
503
+ }
504
+ if (allUrls.size >= maxPages) {
505
+ return;
506
+ }
507
+ }
508
+ }
509
+
510
+ // handle sitemap index
511
+ const sitemapElements = xmlDoc.getElementsByTagName('sitemap');
512
+ for (let i = 0; i < sitemapElements.length; i++) {
513
+ const locElement = sitemapElements[i].getElementsByTagName('loc')[0];
514
+ if (locElement) {
515
+ await fetchSitemapUrls(locElement.textContent?.trim() || '');
516
+ if (allUrls.size >= maxPages) {
517
+ return;
518
+ }
519
+ }
520
+ }
521
+ } catch (error) {
522
+ this.logger.error(`Error fetching sitemap ${sitemapUrl}:`, error);
523
+ }
524
+ };
525
+
526
+ for (const sitemapUrl of initialSitemaps) {
527
+ await fetchSitemapUrls(sitemapUrl);
528
+ if (allUrls.size >= maxPages) {
529
+ break;
530
+ }
531
+ }
532
+
533
+ const urlsToProcess = Array.from(allUrls).slice(0, maxPages);
534
+
535
+ return urlsToProcess;
536
+ }
537
+
538
+ async getSitemapsFromRobotsTxt(url: URL) {
539
+ const hostname = url.origin;
540
+ const robotsUrl = `${hostname}/robots.txt`;
541
+ const response = await fetch(robotsUrl);
542
+ if (response.status === 404) {
543
+ return null;
544
+ }
545
+ const robotsTxt = await response.text();
546
+ if (robotsTxt.length) {
547
+ const robot = robotsParser(robotsUrl, robotsTxt);
548
+ return robot.getSitemaps();
549
+ }
550
+
551
+ return null;
552
+ }
553
+ }
backend/functions/src/cloud-functions/data-crunching.ts CHANGED
@@ -17,34 +17,11 @@ import { createReadStream } from 'fs';
17
  import { appendFile } from 'fs/promises';
18
  import { createGzip } from 'zlib';
19
  import { getFunctions } from 'firebase-admin/functions';
20
- import { GoogleAuth } from 'google-auth-library';
21
  import { SnapshotFormatter } from '../services/snapshot-formatter';
 
22
 
23
  dayjs.extend(require('dayjs/plugin/utc'));
24
 
25
- /**
26
- * Get the URL of a given v2 cloud function.
27
- *
28
- * @param {string} name the function's name
29
- * @param {string} location the function's location
30
- * @return {Promise<string>} The URL of the function
31
- */
32
- async function getFunctionUrl(name: string, location = "us-central1") {
33
- const projectId = `reader-6b7dc`;
34
- const url = "https://cloudfunctions.googleapis.com/v2beta/" +
35
- `projects/${projectId}/locations/${location}/functions/${name}`;
36
- const auth = new GoogleAuth({
37
- scopes: 'https://www.googleapis.com/auth/cloud-platform',
38
- });
39
- const client = await auth.getClient();
40
- const res = await client.request<any>({ url });
41
- const uri = res.data?.serviceConfig?.uri;
42
- if (!uri) {
43
- throw new Error(`Unable to retreive uri for function at ${url}`);
44
- }
45
- return uri;
46
- }
47
-
48
  @singleton()
49
  export class DataCrunchingHost extends RPCHost {
50
  logger = this.globalLogger.child({ service: this.constructor.name });
 
17
  import { appendFile } from 'fs/promises';
18
  import { createGzip } from 'zlib';
19
  import { getFunctions } from 'firebase-admin/functions';
 
20
  import { SnapshotFormatter } from '../services/snapshot-formatter';
21
+ import { getFunctionUrl } from '../utils/get-function-url';
22
 
23
  dayjs.extend(require('dayjs/plugin/utc'));
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  @singleton()
26
  export class DataCrunchingHost extends RPCHost {
27
  logger = this.globalLogger.child({ service: this.constructor.name });
backend/functions/src/db/adaptive-crawl-task.ts ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Also, Prop, parseJSONText } from 'civkit';
2
+ import { FirestoreRecord } from '../shared/lib/firestore';
3
+ import _ from 'lodash';
4
+
5
+ export enum AdaptiveCrawlTaskStatus {
6
+ PENDING = 'pending',
7
+ PROCESSING = 'processing',
8
+ COMPLETED = 'completed',
9
+ FAILED = 'failed',
10
+ }
11
+
12
+ @Also({
13
+ dictOf: Object
14
+ })
15
+ export class AdaptiveCrawlTask extends FirestoreRecord {
16
+ static override collectionName = 'adaptiveCrawlTasks';
17
+
18
+ override _id!: string;
19
+
20
+ @Prop({
21
+ required: true
22
+ })
23
+ status!: AdaptiveCrawlTaskStatus;
24
+
25
+ @Prop({
26
+ required: true
27
+ })
28
+ statusText!: string;
29
+
30
+ @Prop()
31
+ meta!: {
32
+ useSitemap: boolean;
33
+ maxPages: number;
34
+ targetUrl: string;
35
+ };
36
+
37
+ @Prop()
38
+ urls!: string[];
39
+
40
+ @Prop()
41
+ processed!: {
42
+ [url: string]: string;
43
+ };
44
+
45
+ @Prop()
46
+ failed!: {
47
+ [url: string]: any;
48
+ };
49
+
50
+ @Prop()
51
+ createdAt!: Date;
52
+
53
+ @Prop()
54
+ finishedAt?: Date;
55
+
56
+ @Prop()
57
+ duration?: number;
58
+
59
+ static patchedFields = [
60
+ 'meta',
61
+ ];
62
+
63
+ static override from(input: any) {
64
+ for (const field of this.patchedFields) {
65
+ if (typeof input[field] === 'string') {
66
+ input[field] = parseJSONText(input[field]);
67
+ }
68
+ }
69
+
70
+ return super.from(input) as AdaptiveCrawlTask;
71
+ }
72
+
73
+ override degradeForFireStore() {
74
+ const copy: any = { ...this };
75
+
76
+ for (const field of (this.constructor as typeof AdaptiveCrawlTask).patchedFields) {
77
+ if (typeof copy[field] === 'object') {
78
+ copy[field] = JSON.stringify(copy[field]) as any;
79
+ }
80
+ }
81
+
82
+ return copy;
83
+ }
84
+
85
+ [k: string]: any;
86
+ }
backend/functions/src/dto/adaptive-crawler-options.ts ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Also, AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit';
2
+ import type { Request, Response } from 'express';
3
+
4
+
5
+ @Also({
6
+ openapi: {
7
+ operation: {
8
+ parameters: {
9
+ 'X-Use-Sitemap': {
10
+ description: 'Use sitemap to crawl the website.',
11
+ in: 'header',
12
+ schema: { type: 'string' }
13
+ },
14
+ 'X-Max-Depth': {
15
+ description: 'Max deep level to crawl.',
16
+ in: 'header',
17
+ schema: { type: 'string' }
18
+ },
19
+ 'X-Max-Pages': {
20
+ description: 'Max number of pages to crawl.',
21
+ in: 'header',
22
+ schema: { type: 'string' }
23
+ },
24
+ }
25
+ }
26
+ }
27
+ })
28
+ export class AdaptiveCrawlerOptions extends AutoCastable {
29
+ @Prop({
30
+ default: true,
31
+ desc: 'Use sitemap to crawl the website.',
32
+ })
33
+ useSitemap!: boolean;
34
+
35
+ @Prop({
36
+ default: 10,
37
+ desc: 'Max number of pages to crawl.',
38
+ validate: (v: number) => v >= 1 && v <= 100,
39
+ })
40
+ maxPages!: number;
41
+
42
+ static override from(input: any) {
43
+ const instance = super.from(input) as AdaptiveCrawlerOptions;
44
+ const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
45
+ req: Request,
46
+ res: Response,
47
+ } | undefined;
48
+
49
+ let maxPages = parseInt(ctx?.req.get('x-max-pages') || '');
50
+ if (!isNaN(maxPages) && maxPages > 0) {
51
+ instance.maxPages = maxPages <= 100 ? maxPages : 100;
52
+ }
53
+
54
+ const useSitemap = ctx?.req.get('x-use-sitemap');
55
+ if (useSitemap !== undefined) {
56
+ instance.useSitemap = Boolean(useSitemap);
57
+ }
58
+
59
+ return instance;
60
+ }
61
+ }
backend/functions/src/utils/get-function-url.ts ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { GoogleAuth } from 'google-auth-library';
2
+
3
+ /**
4
+ * Get the URL of a given v2 cloud function.
5
+ *
6
+ * @param {string} name the function's name
7
+ * @param {string} location the function's location
8
+ * @return {Promise<string>} The URL of the function
9
+ */
10
+ export async function getFunctionUrl(name: string, location = "us-central1") {
11
+ const projectId = `reader-6b7dc`;
12
+ const url = "https://cloudfunctions.googleapis.com/v2beta/" +
13
+ `projects/${projectId}/locations/${location}/functions/${name}`;
14
+ const auth = new GoogleAuth({
15
+ scopes: 'https://www.googleapis.com/auth/cloud-platform',
16
+ });
17
+ const client = await auth.getClient();
18
+ const res = await client.request<any>({ url });
19
+ const uri = res.data?.serviceConfig?.uri;
20
+ if (!uri) {
21
+ throw new Error(`Unable to retreive uri for function at ${url}`);
22
+ }
23
+ return uri;
24
+ }