nomagick committed on
Commit
696536c
·
unverified ·
1 Parent(s): b9d07e3

feat(crawl): token budget

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -4,7 +4,7 @@ import {
4
  AssertionFailureError, ParamValidationError, Defer,
5
  } from 'civkit';
6
  import { singleton } from 'tsyringe';
7
- import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
8
  import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
9
  import _ from 'lodash';
10
  import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
@@ -202,6 +202,9 @@ export class CrawlerHost extends RPCHost {
202
  );
203
 
204
  rpcReflect.finally(() => {
 
 
 
205
  if (chargeAmount) {
206
  auth.reportUsage(chargeAmount, `reader-${rpcReflect.name}`).catch((err) => {
207
  this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
@@ -218,6 +221,9 @@ export class CrawlerHost extends RPCHost {
218
  );
219
 
220
  rpcReflect.finally(() => {
 
 
 
221
  if (chargeAmount) {
222
  apiRoll._ref?.set({
223
  chargeAmount,
@@ -252,6 +258,9 @@ export class CrawlerHost extends RPCHost {
252
 
253
  const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
254
  chargeAmount = this.assignChargeAmount(formatted);
 
 
 
255
  sseStream.write({
256
  event: 'data',
257
  data: formatted,
@@ -284,6 +293,10 @@ export class CrawlerHost extends RPCHost {
284
  const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
285
  chargeAmount = this.assignChargeAmount(formatted);
286
 
 
 
 
 
287
  if (crawlerOptions.isEarlyReturnApplicable()) {
288
  return formatted;
289
  }
@@ -302,6 +315,9 @@ export class CrawlerHost extends RPCHost {
302
 
303
  const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
304
  chargeAmount = this.assignChargeAmount(formatted);
 
 
 
305
 
306
  return formatted;
307
  }
@@ -321,6 +337,9 @@ export class CrawlerHost extends RPCHost {
321
 
322
  const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
323
  chargeAmount = this.assignChargeAmount(formatted);
 
 
 
324
 
325
  if (crawlerOptions.isEarlyReturnApplicable()) {
326
  if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
@@ -349,6 +368,10 @@ export class CrawlerHost extends RPCHost {
349
 
350
  const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
351
  chargeAmount = this.assignChargeAmount(formatted);
 
 
 
 
352
  if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
353
 
354
  return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
 
4
  AssertionFailureError, ParamValidationError, Defer,
5
  } from 'civkit';
6
  import { singleton } from 'tsyringe';
7
+ import { AsyncContext, BudgetExceededError, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
8
  import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
9
  import _ from 'lodash';
10
  import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
 
202
  );
203
 
204
  rpcReflect.finally(() => {
205
+ if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
206
+ return;
207
+ }
208
  if (chargeAmount) {
209
  auth.reportUsage(chargeAmount, `reader-${rpcReflect.name}`).catch((err) => {
210
  this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
 
221
  );
222
 
223
  rpcReflect.finally(() => {
224
+ if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
225
+ return;
226
+ }
227
  if (chargeAmount) {
228
  apiRoll._ref?.set({
229
  chargeAmount,
 
258
 
259
  const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
260
  chargeAmount = this.assignChargeAmount(formatted);
261
+ if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
262
+ throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
263
+ }
264
  sseStream.write({
265
  event: 'data',
266
  data: formatted,
 
293
  const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
294
  chargeAmount = this.assignChargeAmount(formatted);
295
 
296
+ if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
297
+ throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
298
+ }
299
+
300
  if (crawlerOptions.isEarlyReturnApplicable()) {
301
  return formatted;
302
  }
 
315
 
316
  const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
317
  chargeAmount = this.assignChargeAmount(formatted);
318
+ if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
319
+ throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
320
+ }
321
 
322
  return formatted;
323
  }
 
337
 
338
  const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
339
  chargeAmount = this.assignChargeAmount(formatted);
340
+ if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
341
+ throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
342
+ }
343
 
344
  if (crawlerOptions.isEarlyReturnApplicable()) {
345
  if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
 
368
 
369
  const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
370
  chargeAmount = this.assignChargeAmount(formatted);
371
+ if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
372
+ throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
373
+ }
374
+
375
  if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
376
 
377
  return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -156,6 +156,11 @@ const IMAGE_RETENTION_MODE_VALUES = new Set<string>(IMAGE_RETENTION_MODES);
156
  description: 'Specify referer for the page.',
157
  in: 'header',
158
  schema: { type: 'string' }
 
 
 
 
 
159
  }
160
  }
161
  }
@@ -271,6 +276,9 @@ export class CrawlerOptions extends AutoCastable {
271
  @Prop()
272
  referer?: string;
273
 
 
 
 
274
  static override from(input: any) {
275
  const instance = super.from(input) as CrawlerOptions;
276
  const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
@@ -387,6 +395,13 @@ export class CrawlerOptions extends AutoCastable {
387
  instance.cacheTolerance = instance.cacheTolerance * 1000;
388
  }
389
 
 
 
 
 
 
 
 
390
  return instance;
391
  }
392
 
 
156
  description: 'Specify referer for the page.',
157
  in: 'header',
158
  schema: { type: 'string' }
159
+ },
160
+ 'X-Token-Budget': {
161
+ description: 'Specify a budget in tokens.\n\nIf the resulting token cost exceeds the budget, the request is rejected.',
162
+ in: 'header',
163
+ schema: { type: 'string' }
164
  }
165
  }
166
  }
 
276
  @Prop()
277
  referer?: string;
278
 
279
+ @Prop()
280
+ tokenBudget?: number;
281
+
282
  static override from(input: any) {
283
  const instance = super.from(input) as CrawlerOptions;
284
  const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
 
395
  instance.cacheTolerance = instance.cacheTolerance * 1000;
396
  }
397
 
398
+ const tokenBudget = ctx?.req.get('x-token-budget') || undefined;
399
+ instance.tokenBudget ??= parseInt(tokenBudget || '') || undefined;
400
+
401
+ if (instance.cacheTolerance) {
402
+ instance.cacheTolerance = instance.cacheTolerance * 1000;
403
+ }
404
+
405
  return instance;
406
  }
407
 
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 7b3412e64166599429fa38094f4abd071a15fcd6
 
1
+ Subproject commit 98e9bf19bc6859c79eff516275cf1120e59e47bf