nomagick committed on
Commit
8cfd0d6
·
unverified ·
1 Parent(s): 2e025d1

feat: jina paywall (#49)

Browse files

* feat: integrate with jina embeddings paywall

backend/functions/package-lock.json CHANGED
@@ -178,6 +178,16 @@
178
  "node": ">=6.9.0"
179
  }
180
  },
 
 
 
 
 
 
 
 
 
 
181
  "node_modules/@babel/helper-compilation-targets/node_modules/semver": {
182
  "version": "6.3.1",
183
  "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
@@ -6251,6 +6261,17 @@
6251
  "node": ">=10.19.0"
6252
  }
6253
  },
 
 
 
 
 
 
 
 
 
 
 
6254
  "node_modules/https-proxy-agent": {
6255
  "version": "5.0.1",
6256
  "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz",
@@ -8059,16 +8080,6 @@
8059
  "node": ">=8"
8060
  }
8061
  },
8062
- "node_modules/lru-cache": {
8063
- "version": "5.1.1",
8064
- "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
8065
- "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==",
8066
- "dev": true,
8067
- "peer": true,
8068
- "dependencies": {
8069
- "yallist": "^3.0.2"
8070
- }
8071
- },
8072
  "node_modules/lru-memoizer": {
8073
  "version": "2.2.0",
8074
  "resolved": "https://registry.npmjs.org/lru-memoizer/-/lru-memoizer-2.2.0.tgz",
@@ -9852,17 +9863,6 @@
9852
  "integrity": "sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg==",
9853
  "optional": true
9854
  },
9855
- "node_modules/quick-lru": {
9856
- "version": "5.1.1",
9857
- "resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-5.1.1.tgz",
9858
- "integrity": "sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==",
9859
- "engines": {
9860
- "node": ">=10"
9861
- },
9862
- "funding": {
9863
- "url": "https://github.com/sponsors/sindresorhus"
9864
- }
9865
- },
9866
  "node_modules/range-parser": {
9867
  "version": "1.2.1",
9868
  "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz",
 
178
  "node": ">=6.9.0"
179
  }
180
  },
181
+ "node_modules/@babel/helper-compilation-targets/node_modules/lru-cache": {
182
+ "version": "5.1.1",
183
+ "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
184
+ "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==",
185
+ "dev": true,
186
+ "peer": true,
187
+ "dependencies": {
188
+ "yallist": "^3.0.2"
189
+ }
190
+ },
191
  "node_modules/@babel/helper-compilation-targets/node_modules/semver": {
192
  "version": "6.3.1",
193
  "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
 
6261
  "node": ">=10.19.0"
6262
  }
6263
  },
6264
+ "node_modules/http2-wrapper/node_modules/quick-lru": {
6265
+ "version": "5.1.1",
6266
+ "resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-5.1.1.tgz",
6267
+ "integrity": "sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==",
6268
+ "engines": {
6269
+ "node": ">=10"
6270
+ },
6271
+ "funding": {
6272
+ "url": "https://github.com/sponsors/sindresorhus"
6273
+ }
6274
+ },
6275
  "node_modules/https-proxy-agent": {
6276
  "version": "5.0.1",
6277
  "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz",
 
8080
  "node": ">=8"
8081
  }
8082
  },
 
 
 
 
 
 
 
 
 
 
8083
  "node_modules/lru-memoizer": {
8084
  "version": "2.2.0",
8085
  "resolved": "https://registry.npmjs.org/lru-memoizer/-/lru-memoizer-2.2.0.tgz",
 
9863
  "integrity": "sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg==",
9864
  "optional": true
9865
  },
 
 
 
 
 
 
 
 
 
 
 
9866
  "node_modules/range-parser": {
9867
  "version": "1.2.1",
9868
  "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz",
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -5,7 +5,7 @@ import {
5
  AssertionFailureError, ParamValidationError,
6
  } from 'civkit';
7
  import { singleton } from 'tsyringe';
8
- import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect } from '../shared';
9
  import { RateLimitControl } from '../shared/services/rate-limit';
10
  import _ from 'lodash';
11
  import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
@@ -19,6 +19,9 @@ import { Crawled } from '../db/crawled';
19
  import { tidyMarkdown } from '../utils/markdown';
20
  import { cleanAttribute } from '../utils/misc';
21
  import { randomUUID } from 'crypto';
 
 
 
22
 
23
  const md5Hasher = new HashManager('md5', 'hex');
24
 
@@ -296,23 +299,55 @@ ${this.content}
296
  req: Request,
297
  res: Response,
298
  },
 
299
  ) {
300
- if (ctx.req.ip) {
301
- await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'], [
302
- // 100 requests per minute
303
- new Date(Date.now() - 60 * 1000), 100
304
- ]);
305
- }
306
-
307
  const noSlashURL = ctx.req.url.slice(1);
308
  if (!noSlashURL) {
 
 
 
 
 
 
309
  return assignTransferProtocolMeta(`[Usage] https://r.jina.ai/YOUR_URL
310
  [Homepage] https://jina.ai/reader
311
  [Source code] https://github.com/jina-ai/reader
312
- `,
313
  { contentType: 'text/plain', envelope: null }
314
  );
315
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  let urlToCrawl;
317
  try {
318
  urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false }));
@@ -364,7 +399,7 @@ ${this.content}
364
  }
365
 
366
  const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
367
-
368
  sseStream.write({
369
  event: 'data',
370
  data: formatted,
@@ -392,6 +427,7 @@ ${this.content}
392
  }
393
 
394
  const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
 
395
 
396
  return formatted;
397
  }
@@ -400,7 +436,10 @@ ${this.content}
400
  throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
401
  }
402
 
403
- return await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
 
 
 
404
  }
405
 
406
  for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
@@ -410,6 +449,7 @@ ${this.content}
410
  }
411
 
412
  const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
 
413
  if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
414
 
415
  return assignTransferProtocolMeta(`${formatted}`,
@@ -425,6 +465,7 @@ ${this.content}
425
  }
426
 
427
  const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
 
428
  if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
429
 
430
  return assignTransferProtocolMeta(`${formatted}`,
@@ -563,4 +604,21 @@ ${this.content}
563
  }
564
  }
565
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
566
  }
 
5
  AssertionFailureError, ParamValidationError,
6
  } from 'civkit';
7
  import { singleton } from 'tsyringe';
8
+ import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
9
  import { RateLimitControl } from '../shared/services/rate-limit';
10
  import _ from 'lodash';
11
  import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
 
19
  import { tidyMarkdown } from '../utils/markdown';
20
  import { cleanAttribute } from '../utils/misc';
21
  import { randomUUID } from 'crypto';
22
+ import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
23
+
24
+ import { countGPTToken as estimateToken } from '../shared/utils/openai';
25
 
26
  const md5Hasher = new HashManager('md5', 'hex');
27
 
 
299
  req: Request,
300
  res: Response,
301
  },
302
+ auth: JinaEmbeddingsAuthDTO
303
  ) {
304
+ const uid = await auth.solveUID();
305
+ let chargeAmount = 0;
 
 
 
 
 
306
  const noSlashURL = ctx.req.url.slice(1);
307
  if (!noSlashURL) {
308
+ const latestUser = uid ? await auth.assertUser() : undefined;
309
+ const authMixin = latestUser ? `
310
+ [Authenticated as] ${latestUser.user_id} (${latestUser.full_name})
311
+ [Balance left] ${latestUser.wallet.total_balance}
312
+ ` : '';
313
+
314
  return assignTransferProtocolMeta(`[Usage] https://r.jina.ai/YOUR_URL
315
  [Homepage] https://jina.ai/reader
316
  [Source code] https://github.com/jina-ai/reader
317
+ ${authMixin}`,
318
  { contentType: 'text/plain', envelope: null }
319
  );
320
  }
321
+
322
+ if (uid) {
323
+ const user = await auth.assertUser();
324
+ if (!(user.wallet.total_balance > 0)) {
325
+ throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
326
+ }
327
+
328
+ await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'],
329
+ [
330
+ // 1000 requests per minute
331
+ new Date(Date.now() - 60 * 1000), 1000
332
+ ]
333
+ );
334
+
335
+ rpcReflect.finally(() => {
336
+ if (chargeAmount) {
337
+ auth.reportUsage(chargeAmount, 'reader-crawl').catch((err) => {
338
+ this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
339
+ });
340
+ }
341
+ });
342
+ } else if (ctx.req.ip) {
343
+ await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'],
344
+ [
345
+ // 100 requests per minute
346
+ new Date(Date.now() - 60 * 1000), 100
347
+ ]
348
+ );
349
+ }
350
+
351
  let urlToCrawl;
352
  try {
353
  urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false }));
 
399
  }
400
 
401
  const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
402
+ chargeAmount = this.getChargeAmount(formatted);
403
  sseStream.write({
404
  event: 'data',
405
  data: formatted,
 
427
  }
428
 
429
  const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
430
+ chargeAmount = this.getChargeAmount(formatted);
431
 
432
  return formatted;
433
  }
 
436
  throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
437
  }
438
 
439
+ const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
440
+ chargeAmount = this.getChargeAmount(formatted);
441
+
442
+ return formatted;
443
  }
444
 
445
  for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
 
449
  }
450
 
451
  const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
452
+ chargeAmount = this.getChargeAmount(formatted);
453
  if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
454
 
455
  return assignTransferProtocolMeta(`${formatted}`,
 
465
  }
466
 
467
  const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
468
+ chargeAmount = this.getChargeAmount(formatted);
469
  if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
470
 
471
  return assignTransferProtocolMeta(`${formatted}`,
 
604
  }
605
  }
606
 
607
+ getChargeAmount(formatted: { [k: string]: any; }) {
608
+ const textContent = formatted?.content || formatted?.text || formatted?.html;
609
+
610
+ if (typeof textContent === 'string') {
611
+ return estimateToken(textContent);
612
+ }
613
+
614
+ const imageContent = formatted.screenshotUrl || formatted?.screenshot;
615
+
616
+ if (imageContent) {
617
+ // OpenAI image token count for 1024x1024 image
618
+ return 765;
619
+ }
620
+
621
+ return undefined;
622
+ }
623
+
624
  }
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit d3bb3a7335ec9d96c68d1edf1b66fdf5e2fe5b7c
 
1
+ Subproject commit 584791b789cd483dab18735416744b4d10130993