nomagick committed on
Commit
1cf8e83
·
unverified ·
1 Parent(s): d100c3f

fix: add cache tolerance

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -426,6 +426,7 @@ ${this.content}
426
  const customMode = ctx.req.get('x-respond-with') || 'default';
427
  const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
428
  const noCache = Boolean(ctx.req.get('x-no-cache'));
 
429
  const cookies: CookieParam[] = [];
430
  const setCookieHeaders = ctx.req.headers['x-set-cookie'];
431
  if (Array.isArray(setCookieHeaders)) {
@@ -454,7 +455,7 @@ ${this.content}
454
  rpcReflect.return(sseStream);
455
 
456
  try {
457
- for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
458
  if (!scrapped) {
459
  continue;
460
  }
@@ -481,7 +482,7 @@ ${this.content}
481
 
482
  let lastScrapped;
483
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
484
- for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
485
  lastScrapped = scrapped;
486
  if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
487
  continue;
@@ -503,7 +504,7 @@ ${this.content}
503
  return formatted;
504
  }
505
 
506
- for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
507
  lastScrapped = scrapped;
508
  if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
509
  continue;
@@ -546,7 +547,7 @@ ${this.content}
546
  return digest;
547
  }
548
 
549
- async queryCache(urlToCrawl: URL) {
550
  const digest = this.getUrlDigest(urlToCrawl);
551
 
552
  const cache = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
@@ -556,9 +557,9 @@ ${this.content}
556
  }
557
 
558
  const age = Date.now() - cache.createdAt.valueOf();
559
- const stale = cache.createdAt.valueOf() < (Date.now() - this.cacheValidMs);
560
- this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old`, {
561
- url: urlToCrawl, digest, age, stale
562
  });
563
 
564
  let snapshot: PageSnapshot | undefined;
@@ -641,10 +642,10 @@ ${this.content}
641
  return r;
642
  }
643
 
644
- async *cachedScrap(urlToCrawl: URL, crawlOpts?: ScrappingOptions, noCache: boolean = false) {
645
  let cache;
646
- if (!noCache && !crawlOpts?.cookies?.length) {
647
- cache = await this.queryCache(urlToCrawl);
648
  }
649
 
650
  if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
@@ -687,10 +688,10 @@ ${this.content}
687
  }
688
 
689
 
690
- async *scrapMany(urls: URL[], options?: ScrappingOptions, noCache = false) {
691
- const iterators = urls.map((url) => this.cachedScrap(url, options, noCache));
692
 
693
- const results: (PageSnapshot | undefined)[] = iterators.map((_x)=> undefined);
694
 
695
  let nextDeferred = Defer();
696
  let concluded = false;
 
426
  const customMode = ctx.req.get('x-respond-with') || 'default';
427
  const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
428
  const noCache = Boolean(ctx.req.get('x-no-cache'));
429
+ const cacheTolerance = noCache ? 0 : this.cacheValidMs;
430
  const cookies: CookieParam[] = [];
431
  const setCookieHeaders = ctx.req.headers['x-set-cookie'];
432
  if (Array.isArray(setCookieHeaders)) {
 
455
  rpcReflect.return(sseStream);
456
 
457
  try {
458
+ for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
459
  if (!scrapped) {
460
  continue;
461
  }
 
482
 
483
  let lastScrapped;
484
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
485
+ for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
486
  lastScrapped = scrapped;
487
  if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
488
  continue;
 
504
  return formatted;
505
  }
506
 
507
+ for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
508
  lastScrapped = scrapped;
509
  if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
510
  continue;
 
547
  return digest;
548
  }
549
 
550
+ async queryCache(urlToCrawl: URL, cacheTolerance: number) {
551
  const digest = this.getUrlDigest(urlToCrawl);
552
 
553
  const cache = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
 
557
  }
558
 
559
  const age = Date.now() - cache.createdAt.valueOf();
560
+ const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
561
+ this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
562
+ url: urlToCrawl, digest, age, stale, cacheTolerance
563
  });
564
 
565
  let snapshot: PageSnapshot | undefined;
 
642
  return r;
643
  }
644
 
645
+ async *cachedScrap(urlToCrawl: URL, crawlOpts?: ScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
646
  let cache;
647
+ if (cacheTolerance && !crawlOpts?.cookies?.length) {
648
+ cache = await this.queryCache(urlToCrawl, cacheTolerance);
649
  }
650
 
651
  if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
 
688
  }
689
 
690
 
691
+ async *scrapMany(urls: URL[], options?: ScrappingOptions, cacheTolerance?: number) {
692
+ const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance));
693
 
694
+ const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
695
 
696
  let nextDeferred = Defer();
697
  let concluded = false;
backend/functions/src/cloud-functions/searcher.ts CHANGED
@@ -18,7 +18,7 @@ import { CookieParam } from 'puppeteer';
18
  import { parseString as parseSetCookieString } from 'set-cookie-parser';
19
  import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
20
  import { SearchResult } from '../db/searched';
21
- import { WebSearchApiResponse } from '../shared/3rd-party/brave-types';
22
 
23
 
24
  @singleton()
@@ -27,6 +27,9 @@ export class SearcherHost extends RPCHost {
27
 
28
  cacheRetentionMs = 1000 * 3600 * 24 * 7;
29
  cacheValidMs = 1000 * 3600;
 
 
 
30
 
31
  constructor(
32
  protected globalLogger: Logger,
@@ -178,6 +181,7 @@ export class SearcherHost extends RPCHost {
178
  const customMode = ctx.req.get('x-respond-with') || 'default';
179
  const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
180
  const noCache = Boolean(ctx.req.get('x-no-cache'));
 
181
  const cookies: CookieParam[] = [];
182
  const setCookieHeaders = ctx.req.headers['x-set-cookie'];
183
  if (Array.isArray(setCookieHeaders)) {
@@ -204,8 +208,7 @@ export class SearcherHost extends RPCHost {
204
  count: 5
205
  });
206
 
207
- const urls = r.web.results.map((x) => new URL(x.url));
208
- const it = this.fetchSearchResults(customMode, urls, crawlOpts, noCache);
209
 
210
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
211
  const sseStream = new OutputServerEventStream();
@@ -238,12 +241,14 @@ export class SearcherHost extends RPCHost {
238
  return sseStream;
239
  }
240
 
 
 
241
  let lastScrapped;
242
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
243
  for await (const scrapped of it) {
244
  lastScrapped = scrapped;
245
 
246
- if (!this.qualified(scrapped)) {
247
  continue;
248
  }
249
 
@@ -264,7 +269,7 @@ export class SearcherHost extends RPCHost {
264
  for await (const scrapped of it) {
265
  lastScrapped = scrapped;
266
 
267
- if (!this.qualified(scrapped)) {
268
  continue;
269
  }
270
  chargeAmount = this.getChargeAmount(scrapped);
@@ -282,18 +287,27 @@ export class SearcherHost extends RPCHost {
282
  }
283
 
284
  async *fetchSearchResults(mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
285
- urls: URL[], options?: ScrappingOptions, noCache = false) {
286
-
287
- for await (const scrapped of this.crawler.scrapMany(urls, options, noCache)) {
288
  const mapped = scrapped.map((x, i) => {
289
- if (!x) {
 
290
  const p = {
291
- toString() {
292
- return `[${i + 1}] No content available for ${urls[i]}`;
 
 
 
 
 
 
293
  }
294
  };
295
  const r = Object.create(p);
296
- r.url = urls[i].toString();
 
 
297
 
298
  return r;
299
  }
@@ -317,7 +331,7 @@ export class SearcherHost extends RPCHost {
317
  [${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
318
  [${i + 1}] Markdown Content:
319
  ${this.content}
320
- `;
321
  };
322
  }
323
  }
 
18
  import { parseString as parseSetCookieString } from 'set-cookie-parser';
19
  import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
20
  import { SearchResult } from '../db/searched';
21
+ import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
22
 
23
 
24
  @singleton()
 
27
 
28
  cacheRetentionMs = 1000 * 3600 * 24 * 7;
29
  cacheValidMs = 1000 * 3600;
30
+ pageCacheToleranceMs = 1000 * 3600 * 24;
31
+
32
+ reasonableDelayMs = 10_000;
33
 
34
  constructor(
35
  protected globalLogger: Logger,
 
181
  const customMode = ctx.req.get('x-respond-with') || 'default';
182
  const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
183
  const noCache = Boolean(ctx.req.get('x-no-cache'));
184
+ const pageCacheTolerance = noCache ? 0 : this.pageCacheToleranceMs;
185
  const cookies: CookieParam[] = [];
186
  const setCookieHeaders = ctx.req.headers['x-set-cookie'];
187
  if (Array.isArray(setCookieHeaders)) {
 
208
  count: 5
209
  });
210
 
211
+ const it = this.fetchSearchResults(customMode, r.web.results, crawlOpts, pageCacheTolerance);
 
212
 
213
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
214
  const sseStream = new OutputServerEventStream();
 
241
  return sseStream;
242
  }
243
 
244
+ const t0 = Date.now();
245
+
246
  let lastScrapped;
247
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
248
  for await (const scrapped of it) {
249
  lastScrapped = scrapped;
250
 
251
+ if (!this.qualified(scrapped) && ((Date.now() - t0) < this.reasonableDelayMs)) {
252
  continue;
253
  }
254
 
 
269
  for await (const scrapped of it) {
270
  lastScrapped = scrapped;
271
 
272
+ if (!this.qualified(scrapped) && ((Date.now() - t0) < this.reasonableDelayMs)) {
273
  continue;
274
  }
275
  chargeAmount = this.getChargeAmount(scrapped);
 
287
  }
288
 
289
  async *fetchSearchResults(mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
290
+ searchResults: WebSearchResult[], options?: ScrappingOptions, pageCacheTolerance?: number) {
291
+ const urls = searchResults.map((x) => new URL(x.url));
292
+ for await (const scrapped of this.crawler.scrapMany(urls, options, pageCacheTolerance)) {
293
  const mapped = scrapped.map((x, i) => {
294
+ const upstreamSearchResult = searchResults[i];
295
+ if (!x || (!x.parsed && mode !== 'markdown')) {
296
  const p = {
297
+ toString(this: any) {
298
+ if (this.title && this.description) {
299
+ return `[${i + 1}] Title: ${this.title}
300
+ [${i + 1}] URL Source: ${this.url}
301
+ [${i + 1}] Description: ${this.description}
302
+ `;
303
+ }
304
+ return `[${i + 1}] No content available for ${this.url}`;
305
  }
306
  };
307
  const r = Object.create(p);
308
+ r.url = upstreamSearchResult.url;
309
+ r.title = upstreamSearchResult.title;
310
+ r.description = upstreamSearchResult.description;
311
 
312
  return r;
313
  }
 
331
  [${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
332
  [${i + 1}] Markdown Content:
333
  ${this.content}
334
+ `;
335
  };
336
  }
337
  }