Spaces:
Build error
Build error
feat(crawl): token budget
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -4,7 +4,7 @@ import {
|
|
| 4 |
AssertionFailureError, ParamValidationError, Defer,
|
| 5 |
} from 'civkit';
|
| 6 |
import { singleton } from 'tsyringe';
|
| 7 |
-
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
|
| 8 |
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
| 9 |
import _ from 'lodash';
|
| 10 |
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
|
@@ -202,6 +202,9 @@ export class CrawlerHost extends RPCHost {
|
|
| 202 |
);
|
| 203 |
|
| 204 |
rpcReflect.finally(() => {
|
|
|
|
|
|
|
|
|
|
| 205 |
if (chargeAmount) {
|
| 206 |
auth.reportUsage(chargeAmount, `reader-${rpcReflect.name}`).catch((err) => {
|
| 207 |
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
|
|
@@ -218,6 +221,9 @@ export class CrawlerHost extends RPCHost {
|
|
| 218 |
);
|
| 219 |
|
| 220 |
rpcReflect.finally(() => {
|
|
|
|
|
|
|
|
|
|
| 221 |
if (chargeAmount) {
|
| 222 |
apiRoll._ref?.set({
|
| 223 |
chargeAmount,
|
|
@@ -252,6 +258,9 @@ export class CrawlerHost extends RPCHost {
|
|
| 252 |
|
| 253 |
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
| 254 |
chargeAmount = this.assignChargeAmount(formatted);
|
|
|
|
|
|
|
|
|
|
| 255 |
sseStream.write({
|
| 256 |
event: 'data',
|
| 257 |
data: formatted,
|
|
@@ -284,6 +293,10 @@ export class CrawlerHost extends RPCHost {
|
|
| 284 |
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
| 285 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 286 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
if (crawlerOptions.isEarlyReturnApplicable()) {
|
| 288 |
return formatted;
|
| 289 |
}
|
|
@@ -302,6 +315,9 @@ export class CrawlerHost extends RPCHost {
|
|
| 302 |
|
| 303 |
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
| 304 |
chargeAmount = this.assignChargeAmount(formatted);
|
|
|
|
|
|
|
|
|
|
| 305 |
|
| 306 |
return formatted;
|
| 307 |
}
|
|
@@ -321,6 +337,9 @@ export class CrawlerHost extends RPCHost {
|
|
| 321 |
|
| 322 |
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
| 323 |
chargeAmount = this.assignChargeAmount(formatted);
|
|
|
|
|
|
|
|
|
|
| 324 |
|
| 325 |
if (crawlerOptions.isEarlyReturnApplicable()) {
|
| 326 |
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
|
@@ -349,6 +368,10 @@ export class CrawlerHost extends RPCHost {
|
|
| 349 |
|
| 350 |
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
| 351 |
chargeAmount = this.assignChargeAmount(formatted);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 353 |
|
| 354 |
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
|
|
|
| 4 |
AssertionFailureError, ParamValidationError, Defer,
|
| 5 |
} from 'civkit';
|
| 6 |
import { singleton } from 'tsyringe';
|
| 7 |
+
import { AsyncContext, BudgetExceededError, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
|
| 8 |
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
| 9 |
import _ from 'lodash';
|
| 10 |
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
|
|
|
| 202 |
);
|
| 203 |
|
| 204 |
rpcReflect.finally(() => {
|
| 205 |
+
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 206 |
+
return;
|
| 207 |
+
}
|
| 208 |
if (chargeAmount) {
|
| 209 |
auth.reportUsage(chargeAmount, `reader-${rpcReflect.name}`).catch((err) => {
|
| 210 |
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
|
|
|
|
| 221 |
);
|
| 222 |
|
| 223 |
rpcReflect.finally(() => {
|
| 224 |
+
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 225 |
+
return;
|
| 226 |
+
}
|
| 227 |
if (chargeAmount) {
|
| 228 |
apiRoll._ref?.set({
|
| 229 |
chargeAmount,
|
|
|
|
| 258 |
|
| 259 |
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
| 260 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 261 |
+
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 262 |
+
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 263 |
+
}
|
| 264 |
sseStream.write({
|
| 265 |
event: 'data',
|
| 266 |
data: formatted,
|
|
|
|
| 293 |
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
| 294 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 295 |
|
| 296 |
+
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 297 |
+
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 298 |
+
}
|
| 299 |
+
|
| 300 |
if (crawlerOptions.isEarlyReturnApplicable()) {
|
| 301 |
return formatted;
|
| 302 |
}
|
|
|
|
| 315 |
|
| 316 |
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
| 317 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 318 |
+
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 319 |
+
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 320 |
+
}
|
| 321 |
|
| 322 |
return formatted;
|
| 323 |
}
|
|
|
|
| 337 |
|
| 338 |
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
| 339 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 340 |
+
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 341 |
+
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 342 |
+
}
|
| 343 |
|
| 344 |
if (crawlerOptions.isEarlyReturnApplicable()) {
|
| 345 |
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
|
|
|
| 368 |
|
| 369 |
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
| 370 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 371 |
+
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 372 |
+
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
| 373 |
+
}
|
| 374 |
+
|
| 375 |
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 376 |
|
| 377 |
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
backend/functions/src/dto/scrapping-options.ts
CHANGED
|
@@ -156,6 +156,11 @@ const IMAGE_RETENTION_MODE_VALUES = new Set<string>(IMAGE_RETENTION_MODES);
|
|
| 156 |
description: 'Specify referer for the page.',
|
| 157 |
in: 'header',
|
| 158 |
schema: { type: 'string' }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
}
|
| 160 |
}
|
| 161 |
}
|
|
@@ -271,6 +276,9 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 271 |
@Prop()
|
| 272 |
referer?: string;
|
| 273 |
|
|
|
|
|
|
|
|
|
|
| 274 |
static override from(input: any) {
|
| 275 |
const instance = super.from(input) as CrawlerOptions;
|
| 276 |
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
|
@@ -387,6 +395,13 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 387 |
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
| 388 |
}
|
| 389 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 390 |
return instance;
|
| 391 |
}
|
| 392 |
|
|
|
|
| 156 |
description: 'Specify referer for the page.',
|
| 157 |
in: 'header',
|
| 158 |
schema: { type: 'string' }
|
| 159 |
+
},
|
| 160 |
+
'X-Token-Budget': {
|
| 161 |
+
description: 'Specify a budget in tokens.\n\nIf the resulting token cost exceeds the budget, the request is rejected.',
|
| 162 |
+
in: 'header',
|
| 163 |
+
schema: { type: 'string' }
|
| 164 |
}
|
| 165 |
}
|
| 166 |
}
|
|
|
|
| 276 |
@Prop()
|
| 277 |
referer?: string;
|
| 278 |
|
| 279 |
+
@Prop()
|
| 280 |
+
tokenBudget?: number;
|
| 281 |
+
|
| 282 |
static override from(input: any) {
|
| 283 |
const instance = super.from(input) as CrawlerOptions;
|
| 284 |
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
|
|
|
| 395 |
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
| 396 |
}
|
| 397 |
|
| 398 |
+
const tokenBudget = ctx?.req.get('x-token-budget') || undefined;
|
| 399 |
+
instance.tokenBudget ??= parseInt(tokenBudget || '') || undefined;
|
| 400 |
+
|
| 401 |
+
if (instance.cacheTolerance) {
|
| 402 |
+
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
| 403 |
+
}
|
| 404 |
+
|
| 405 |
return instance;
|
| 406 |
}
|
| 407 |
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit 98e9bf19bc6859c79eff516275cf1120e59e47bf
|