nomagick committed on
Commit
f101664
·
unverified ·
1 Parent(s): 9dd5af0

fix: firebase limit on document size causing cache failures

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -13,7 +13,7 @@ import normalizeUrl from "@esm2cjs/normalize-url";
13
  import { AltTextService } from '../services/alt-text';
14
  import TurndownService from 'turndown';
15
  import { parseString as parseSetCookieString } from 'set-cookie-parser';
16
- import { CookieParam } from 'puppeteer';
17
  import { Crawled } from '../db/crawled';
18
  import { tidyMarkdown } from '../utils/markdown';
19
  import { cleanAttribute } from '../utils/misc';
@@ -408,28 +408,44 @@ ${this.content}
408
 
409
  const cache = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
410
 
411
- if (cache) {
412
- const age = Date.now() - cache.createdAt.valueOf();
413
- const stale = cache.createdAt.valueOf() > (Date.now() - this.cacheValidMs);
414
- this.logger.info(`${stale ? 'Only stale ' : ''}Cache exists for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old`, {
415
- url: urlToCrawl, digest, age, stale
416
- });
417
 
418
- const r = cache.snapshot;
 
 
 
 
419
 
420
- return {
421
- isFresh: !stale,
422
- ...cache,
423
- snapshot: {
424
- ...r,
425
- screenshot: undefined,
426
- screenshotUrl: cache.screenshotAvailable ?
427
- await this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs) : undefined,
428
- } as PageSnapshot & { screenshotUrl?: string; }
429
- };
 
 
 
 
 
 
 
430
  }
431
 
432
- return undefined;
 
 
 
 
 
 
 
 
433
  }
434
 
435
  async setToCache(urlToCrawl: URL, snapshot: PageSnapshot) {
@@ -444,10 +460,24 @@ ${this.content}
444
  createdAt: nowDate,
445
  expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
446
  urlPathDigest: digest,
447
- snapshot: {
448
- ...snapshot,
449
- screenshot: null
450
- },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
451
  });
452
 
453
  if (snapshot.screenshot) {
@@ -458,6 +488,7 @@ ${this.content}
458
  });
459
  cache.screenshotAvailable = true;
460
  }
 
461
  const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
462
  this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
463
 
 
13
  import { AltTextService } from '../services/alt-text';
14
  import TurndownService from 'turndown';
15
  import { parseString as parseSetCookieString } from 'set-cookie-parser';
16
+ import type { CookieParam } from 'puppeteer';
17
  import { Crawled } from '../db/crawled';
18
  import { tidyMarkdown } from '../utils/markdown';
19
  import { cleanAttribute } from '../utils/misc';
 
408
 
409
  const cache = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
410
 
411
+ if (!cache) {
412
+ return undefined;
413
+ }
 
 
 
414
 
415
+ const age = Date.now() - cache.createdAt.valueOf();
416
+ const stale = cache.createdAt.valueOf() < (Date.now() - this.cacheValidMs);
417
+ this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old`, {
418
+ url: urlToCrawl, digest, age, stale
419
+ });
420
 
421
+ let snapshot: PageSnapshot | undefined;
422
+ let screenshotUrl: string | undefined;
423
+ const preparations = [
424
+ this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => {
425
+ snapshot = JSON.parse(r.toString('utf-8'));
426
+ }),
427
+ cache.screenshotAvailable ?
428
+ this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
429
+ screenshotUrl = r;
430
+ }) :
431
+ Promise.resolve(undefined)
432
+ ];
433
+ try {
434
+ await Promise.all(preparations);
435
+ } catch (_err) {
436
+ // Swallow cache errors.
437
+ return undefined;
438
  }
439
 
440
+ return {
441
+ isFresh: !stale,
442
+ ...cache,
443
+ snapshot: {
444
+ ...snapshot,
445
+ screenshot: undefined,
446
+ screenshotUrl,
447
+ } as PageSnapshot & { screenshotUrl?: string; }
448
+ };
449
  }
450
 
451
  async setToCache(urlToCrawl: URL, snapshot: PageSnapshot) {
 
460
  createdAt: nowDate,
461
  expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
462
  urlPathDigest: digest,
463
+ });
464
+
465
+ const savingOfSnapshot = this.firebaseObjectStorage.saveFile(`snapshots/${cache._id}`,
466
+ Buffer.from(
467
+ JSON.stringify({
468
+ ...snapshot,
469
+ screenshot: undefined
470
+ }),
471
+ 'utf-8'
472
+ ),
473
+ {
474
+ metadata: {
475
+ contentType: 'application/json',
476
+ }
477
+ }
478
+ ).then((r) => {
479
+ cache.snapshotAvailable = true;
480
+ return r;
481
  });
482
 
483
  if (snapshot.screenshot) {
 
488
  });
489
  cache.screenshotAvailable = true;
490
  }
491
+ await savingOfSnapshot;
492
  const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
493
  this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
494
 
backend/functions/src/db/crawled.ts CHANGED
@@ -22,11 +22,14 @@ export class Crawled extends FirestoreRecord {
22
  urlPathDigest!: string;
23
 
24
  @Prop()
25
- snapshot!: PageSnapshot & { screenshot: never; };
26
 
27
  @Prop()
28
  screenshotAvailable?: boolean;
29
 
 
 
 
30
  @Prop()
31
  createdAt!: Date;
32
 
 
22
  urlPathDigest!: string;
23
 
24
  @Prop()
25
+ snapshot?: PageSnapshot & { screenshot: never; };
26
 
27
  @Prop()
28
  screenshotAvailable?: boolean;
29
 
30
+ @Prop()
31
+ snapshotAvailable?: boolean;
32
+
33
  @Prop()
34
  createdAt!: Date;
35
 
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 577131db50d5c86ffb3d085a593eaed8950eabcd
 
1
+ Subproject commit 64157bc57ef9ce2cec69f37b5f55fccb71742b6f