Spaces:
Build error
Build error
feat: jina paywall (#49)
Browse files
* feat: integrate with jina embeddings paywall
backend/functions/package-lock.json
CHANGED
|
@@ -178,6 +178,16 @@
|
|
| 178 |
"node": ">=6.9.0"
|
| 179 |
}
|
| 180 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
"node_modules/@babel/helper-compilation-targets/node_modules/semver": {
|
| 182 |
"version": "6.3.1",
|
| 183 |
"resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
|
|
@@ -6251,6 +6261,17 @@
|
|
| 6251 |
"node": ">=10.19.0"
|
| 6252 |
}
|
| 6253 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6254 |
"node_modules/https-proxy-agent": {
|
| 6255 |
"version": "5.0.1",
|
| 6256 |
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz",
|
|
@@ -8059,16 +8080,6 @@
|
|
| 8059 |
"node": ">=8"
|
| 8060 |
}
|
| 8061 |
},
|
| 8062 |
-
"node_modules/lru-cache": {
|
| 8063 |
-
"version": "5.1.1",
|
| 8064 |
-
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
|
| 8065 |
-
"integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==",
|
| 8066 |
-
"dev": true,
|
| 8067 |
-
"peer": true,
|
| 8068 |
-
"dependencies": {
|
| 8069 |
-
"yallist": "^3.0.2"
|
| 8070 |
-
}
|
| 8071 |
-
},
|
| 8072 |
"node_modules/lru-memoizer": {
|
| 8073 |
"version": "2.2.0",
|
| 8074 |
"resolved": "https://registry.npmjs.org/lru-memoizer/-/lru-memoizer-2.2.0.tgz",
|
|
@@ -9852,17 +9863,6 @@
|
|
| 9852 |
"integrity": "sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg==",
|
| 9853 |
"optional": true
|
| 9854 |
},
|
| 9855 |
-
"node_modules/quick-lru": {
|
| 9856 |
-
"version": "5.1.1",
|
| 9857 |
-
"resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-5.1.1.tgz",
|
| 9858 |
-
"integrity": "sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==",
|
| 9859 |
-
"engines": {
|
| 9860 |
-
"node": ">=10"
|
| 9861 |
-
},
|
| 9862 |
-
"funding": {
|
| 9863 |
-
"url": "https://github.com/sponsors/sindresorhus"
|
| 9864 |
-
}
|
| 9865 |
-
},
|
| 9866 |
"node_modules/range-parser": {
|
| 9867 |
"version": "1.2.1",
|
| 9868 |
"resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz",
|
|
|
|
| 178 |
"node": ">=6.9.0"
|
| 179 |
}
|
| 180 |
},
|
| 181 |
+
"node_modules/@babel/helper-compilation-targets/node_modules/lru-cache": {
|
| 182 |
+
"version": "5.1.1",
|
| 183 |
+
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
|
| 184 |
+
"integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==",
|
| 185 |
+
"dev": true,
|
| 186 |
+
"peer": true,
|
| 187 |
+
"dependencies": {
|
| 188 |
+
"yallist": "^3.0.2"
|
| 189 |
+
}
|
| 190 |
+
},
|
| 191 |
"node_modules/@babel/helper-compilation-targets/node_modules/semver": {
|
| 192 |
"version": "6.3.1",
|
| 193 |
"resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
|
|
|
|
| 6261 |
"node": ">=10.19.0"
|
| 6262 |
}
|
| 6263 |
},
|
| 6264 |
+
"node_modules/http2-wrapper/node_modules/quick-lru": {
|
| 6265 |
+
"version": "5.1.1",
|
| 6266 |
+
"resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-5.1.1.tgz",
|
| 6267 |
+
"integrity": "sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==",
|
| 6268 |
+
"engines": {
|
| 6269 |
+
"node": ">=10"
|
| 6270 |
+
},
|
| 6271 |
+
"funding": {
|
| 6272 |
+
"url": "https://github.com/sponsors/sindresorhus"
|
| 6273 |
+
}
|
| 6274 |
+
},
|
| 6275 |
"node_modules/https-proxy-agent": {
|
| 6276 |
"version": "5.0.1",
|
| 6277 |
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz",
|
|
|
|
| 8080 |
"node": ">=8"
|
| 8081 |
}
|
| 8082 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8083 |
"node_modules/lru-memoizer": {
|
| 8084 |
"version": "2.2.0",
|
| 8085 |
"resolved": "https://registry.npmjs.org/lru-memoizer/-/lru-memoizer-2.2.0.tgz",
|
|
|
|
| 9863 |
"integrity": "sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg==",
|
| 9864 |
"optional": true
|
| 9865 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9866 |
"node_modules/range-parser": {
|
| 9867 |
"version": "1.2.1",
|
| 9868 |
"resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz",
|
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -5,7 +5,7 @@ import {
|
|
| 5 |
AssertionFailureError, ParamValidationError,
|
| 6 |
} from 'civkit';
|
| 7 |
import { singleton } from 'tsyringe';
|
| 8 |
-
import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
| 9 |
import { RateLimitControl } from '../shared/services/rate-limit';
|
| 10 |
import _ from 'lodash';
|
| 11 |
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
|
@@ -19,6 +19,9 @@ import { Crawled } from '../db/crawled';
|
|
| 19 |
import { tidyMarkdown } from '../utils/markdown';
|
| 20 |
import { cleanAttribute } from '../utils/misc';
|
| 21 |
import { randomUUID } from 'crypto';
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 24 |
|
|
@@ -296,23 +299,55 @@ ${this.content}
|
|
| 296 |
req: Request,
|
| 297 |
res: Response,
|
| 298 |
},
|
|
|
|
| 299 |
) {
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
// 100 requests per minute
|
| 303 |
-
new Date(Date.now() - 60 * 1000), 100
|
| 304 |
-
]);
|
| 305 |
-
}
|
| 306 |
-
|
| 307 |
const noSlashURL = ctx.req.url.slice(1);
|
| 308 |
if (!noSlashURL) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
return assignTransferProtocolMeta(`[Usage] https://r.jina.ai/YOUR_URL
|
| 310 |
[Homepage] https://jina.ai/reader
|
| 311 |
[Source code] https://github.com/jina-ai/reader
|
| 312 |
-
`,
|
| 313 |
{ contentType: 'text/plain', envelope: null }
|
| 314 |
);
|
| 315 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
let urlToCrawl;
|
| 317 |
try {
|
| 318 |
urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false }));
|
|
@@ -364,7 +399,7 @@ ${this.content}
|
|
| 364 |
}
|
| 365 |
|
| 366 |
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
|
| 367 |
-
|
| 368 |
sseStream.write({
|
| 369 |
event: 'data',
|
| 370 |
data: formatted,
|
|
@@ -392,6 +427,7 @@ ${this.content}
|
|
| 392 |
}
|
| 393 |
|
| 394 |
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
|
|
|
|
| 395 |
|
| 396 |
return formatted;
|
| 397 |
}
|
|
@@ -400,7 +436,10 @@ ${this.content}
|
|
| 400 |
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
| 401 |
}
|
| 402 |
|
| 403 |
-
|
|
|
|
|
|
|
|
|
|
| 404 |
}
|
| 405 |
|
| 406 |
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
|
|
@@ -410,6 +449,7 @@ ${this.content}
|
|
| 410 |
}
|
| 411 |
|
| 412 |
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
|
|
|
|
| 413 |
if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 414 |
|
| 415 |
return assignTransferProtocolMeta(`${formatted}`,
|
|
@@ -425,6 +465,7 @@ ${this.content}
|
|
| 425 |
}
|
| 426 |
|
| 427 |
const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
|
|
|
|
| 428 |
if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 429 |
|
| 430 |
return assignTransferProtocolMeta(`${formatted}`,
|
|
@@ -563,4 +604,21 @@ ${this.content}
|
|
| 563 |
}
|
| 564 |
}
|
| 565 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
}
|
|
|
|
| 5 |
AssertionFailureError, ParamValidationError,
|
| 6 |
} from 'civkit';
|
| 7 |
import { singleton } from 'tsyringe';
|
| 8 |
+
import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
| 9 |
import { RateLimitControl } from '../shared/services/rate-limit';
|
| 10 |
import _ from 'lodash';
|
| 11 |
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
|
|
|
| 19 |
import { tidyMarkdown } from '../utils/markdown';
|
| 20 |
import { cleanAttribute } from '../utils/misc';
|
| 21 |
import { randomUUID } from 'crypto';
|
| 22 |
+
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
| 23 |
+
|
| 24 |
+
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
| 25 |
|
| 26 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 27 |
|
|
|
|
| 299 |
req: Request,
|
| 300 |
res: Response,
|
| 301 |
},
|
| 302 |
+
auth: JinaEmbeddingsAuthDTO
|
| 303 |
) {
|
| 304 |
+
const uid = await auth.solveUID();
|
| 305 |
+
let chargeAmount = 0;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
const noSlashURL = ctx.req.url.slice(1);
|
| 307 |
if (!noSlashURL) {
|
| 308 |
+
const latestUser = uid ? await auth.assertUser() : undefined;
|
| 309 |
+
const authMixin = latestUser ? `
|
| 310 |
+
[Authenticated as] ${latestUser.user_id} (${latestUser.full_name})
|
| 311 |
+
[Balance left] ${latestUser.wallet.total_balance}
|
| 312 |
+
` : '';
|
| 313 |
+
|
| 314 |
return assignTransferProtocolMeta(`[Usage] https://r.jina.ai/YOUR_URL
|
| 315 |
[Homepage] https://jina.ai/reader
|
| 316 |
[Source code] https://github.com/jina-ai/reader
|
| 317 |
+
${authMixin}`,
|
| 318 |
{ contentType: 'text/plain', envelope: null }
|
| 319 |
);
|
| 320 |
}
|
| 321 |
+
|
| 322 |
+
if (uid) {
|
| 323 |
+
const user = await auth.assertUser();
|
| 324 |
+
if (!(user.wallet.total_balance > 0)) {
|
| 325 |
+
throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
|
| 326 |
+
}
|
| 327 |
+
|
| 328 |
+
await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'],
|
| 329 |
+
[
|
| 330 |
+
// 1000 requests per minute
|
| 331 |
+
new Date(Date.now() - 60 * 1000), 1000
|
| 332 |
+
]
|
| 333 |
+
);
|
| 334 |
+
|
| 335 |
+
rpcReflect.finally(() => {
|
| 336 |
+
if (chargeAmount) {
|
| 337 |
+
auth.reportUsage(chargeAmount, 'reader-crawl').catch((err) => {
|
| 338 |
+
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
|
| 339 |
+
});
|
| 340 |
+
}
|
| 341 |
+
});
|
| 342 |
+
} else if (ctx.req.ip) {
|
| 343 |
+
await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'],
|
| 344 |
+
[
|
| 345 |
+
// 100 requests per minute
|
| 346 |
+
new Date(Date.now() - 60 * 1000), 100
|
| 347 |
+
]
|
| 348 |
+
);
|
| 349 |
+
}
|
| 350 |
+
|
| 351 |
let urlToCrawl;
|
| 352 |
try {
|
| 353 |
urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false }));
|
|
|
|
| 399 |
}
|
| 400 |
|
| 401 |
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
|
| 402 |
+
chargeAmount = this.getChargeAmount(formatted);
|
| 403 |
sseStream.write({
|
| 404 |
event: 'data',
|
| 405 |
data: formatted,
|
|
|
|
| 427 |
}
|
| 428 |
|
| 429 |
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
|
| 430 |
+
chargeAmount = this.getChargeAmount(formatted);
|
| 431 |
|
| 432 |
return formatted;
|
| 433 |
}
|
|
|
|
| 436 |
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
| 437 |
}
|
| 438 |
|
| 439 |
+
const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
|
| 440 |
+
chargeAmount = this.getChargeAmount(formatted);
|
| 441 |
+
|
| 442 |
+
return formatted;
|
| 443 |
}
|
| 444 |
|
| 445 |
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
|
|
|
|
| 449 |
}
|
| 450 |
|
| 451 |
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
|
| 452 |
+
chargeAmount = this.getChargeAmount(formatted);
|
| 453 |
if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 454 |
|
| 455 |
return assignTransferProtocolMeta(`${formatted}`,
|
|
|
|
| 465 |
}
|
| 466 |
|
| 467 |
const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
|
| 468 |
+
chargeAmount = this.getChargeAmount(formatted);
|
| 469 |
if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 470 |
|
| 471 |
return assignTransferProtocolMeta(`${formatted}`,
|
|
|
|
| 604 |
}
|
| 605 |
}
|
| 606 |
|
| 607 |
+
getChargeAmount(formatted: { [k: string]: any; }) {
|
| 608 |
+
const textContent = formatted?.content || formatted?.text || formatted?.html;
|
| 609 |
+
|
| 610 |
+
if (typeof textContent === 'string') {
|
| 611 |
+
return estimateToken(textContent);
|
| 612 |
+
}
|
| 613 |
+
|
| 614 |
+
const imageContent = formatted.screenshotUrl || formatted?.screenshot;
|
| 615 |
+
|
| 616 |
+
if (imageContent) {
|
| 617 |
+
// OpenAI image token count for 1024x1024 image
|
| 618 |
+
return 765;
|
| 619 |
+
}
|
| 620 |
+
|
| 621 |
+
return undefined;
|
| 622 |
+
}
|
| 623 |
+
|
| 624 |
}
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit 584791b789cd483dab18735416744b4d10130993
|