nomagick committed on
Commit
a9936d3
·
unverified ·
1 Parent(s): 165cce6

fix: search descriptions

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -870,6 +870,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
870
  this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
871
  this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
872
  this.threadLocal.set('userAgent', opts.userAgent);
 
 
 
873
 
874
  const crawlOpts: ExtraScrappingOptions = {
875
  proxyUrl: opts.proxyUrl,
@@ -878,6 +881,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
878
  waitForSelector: opts.waitForSelector,
879
  targetSelector: opts.targetSelector,
880
  overrideUserAgent: opts.userAgent,
 
881
  };
882
 
883
  return crawlOpts;
 
870
  this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
871
  this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
872
  this.threadLocal.set('userAgent', opts.userAgent);
873
+ if (opts.timeout) {
874
+ this.threadLocal.set('timeout', opts.timeout * 1000);
875
+ }
876
 
877
  const crawlOpts: ExtraScrappingOptions = {
878
  proxyUrl: opts.proxyUrl,
 
881
  waitForSelector: opts.waitForSelector,
882
  targetSelector: opts.targetSelector,
883
  overrideUserAgent: opts.userAgent,
884
+ timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
885
  };
886
 
887
  return crawlOpts;
backend/functions/src/cloud-functions/searcher.ts CHANGED
@@ -30,7 +30,7 @@ export class SearcherHost extends RPCHost {
30
  cacheValidMs = 1000 * 3600;
31
  pageCacheToleranceMs = 1000 * 3600 * 24;
32
 
33
- reasonableDelayMs = 10_000;
34
 
35
  targetResultCount = 5;
36
 
@@ -163,6 +163,10 @@ export class SearcherHost extends RPCHost {
163
  throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
164
  }
165
 
 
 
 
 
166
  const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
167
  crawlerOptions.cacheTolerance || this.pageCacheToleranceMs
168
  );
@@ -213,7 +217,7 @@ export class SearcherHost extends RPCHost {
213
  chargeAmount = this.getChargeAmount(lastScrapped);
214
  rpcReflect.return(lastScrapped);
215
  earlyReturn = true;
216
- }, this.reasonableDelayMs);
217
  };
218
 
219
  for await (const scrapped of it) {
@@ -259,7 +263,7 @@ export class SearcherHost extends RPCHost {
259
  chargeAmount = this.getChargeAmount(lastScrapped);
260
  rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
261
  earlyReturn = true;
262
- }, this.reasonableDelayMs);
263
  };
264
 
265
  for await (const scrapped of it) {
@@ -317,7 +321,12 @@ export class SearcherHost extends RPCHost {
317
  description: upstreamSearchResult.description,
318
  };
319
  }
320
- return this.crawler.formatSnapshot(mode, x, urls[i]);
 
 
 
 
 
321
  });
322
 
323
  const resultArray = await Promise.all(mapped) as FormattedPage[];
@@ -343,7 +352,7 @@ export class SearcherHost extends RPCHost {
343
  return {
344
  ...x,
345
  toString(this: any) {
346
- if (this.description) {
347
  if (this.title) {
348
  return `[${i + 1}] Title: ${this.title}
349
  [${i + 1}] URL Source: ${this.url}
@@ -355,6 +364,9 @@ export class SearcherHost extends RPCHost {
355
  }
356
 
357
  const mixins = [];
 
 
 
358
  if (this.publishedTime) {
359
  mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
360
  }
 
30
  cacheValidMs = 1000 * 3600;
31
  pageCacheToleranceMs = 1000 * 3600 * 24;
32
 
33
+ reasonableDelayMs = 15_000;
34
 
35
  targetResultCount = 5;
36
 
 
163
  throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
164
  }
165
 
166
+ if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) {
167
+ delete crawlOpts.timeoutMs;
168
+ }
169
+
170
  const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
171
  crawlerOptions.cacheTolerance || this.pageCacheToleranceMs
172
  );
 
217
  chargeAmount = this.getChargeAmount(lastScrapped);
218
  rpcReflect.return(lastScrapped);
219
  earlyReturn = true;
220
+ }, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
221
  };
222
 
223
  for await (const scrapped of it) {
 
263
  chargeAmount = this.getChargeAmount(lastScrapped);
264
  rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
265
  earlyReturn = true;
266
+ }, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
267
  };
268
 
269
  for await (const scrapped of it) {
 
321
  description: upstreamSearchResult.description,
322
  };
323
  }
324
+ return this.crawler.formatSnapshot(mode, x, urls[i]).then((r) => {
325
+ r.title ??= upstreamSearchResult.title;
326
+ r.description = upstreamSearchResult.description;
327
+
328
+ return r;
329
+ });
330
  });
331
 
332
  const resultArray = await Promise.all(mapped) as FormattedPage[];
 
352
  return {
353
  ...x,
354
  toString(this: any) {
355
+ if (!this.content && this.description) {
356
  if (this.title) {
357
  return `[${i + 1}] Title: ${this.title}
358
  [${i + 1}] URL Source: ${this.url}
 
364
  }
365
 
366
  const mixins = [];
367
+ if (this.description) {
368
+ mixins.push(`[${i + 1}] Description: ${this.description}`);
369
+ }
370
  if (this.publishedTime) {
371
  mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
372
  }
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -91,6 +91,11 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
91
  in: 'header',
92
  schema: { type: 'string' }
93
  },
 
 
 
 
 
94
  }
95
  }
96
  }
@@ -142,6 +147,11 @@ export class CrawlerOptions extends AutoCastable {
142
  @Prop()
143
  userAgent?: string;
144
 
 
 
 
 
 
145
  static override from(input: any) {
146
  const instance = super.from(input) as CrawlerOptions;
147
  const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
@@ -178,6 +188,11 @@ export class CrawlerOptions extends AutoCastable {
178
  instance.cacheTolerance = cacheTolerance;
179
  }
180
 
 
 
 
 
 
181
  const targetSelector = ctx?.req.get('x-target-selector');
182
  instance.targetSelector ??= targetSelector;
183
  const waitForSelector = ctx?.req.get('x-wait-for-selector');
 
91
  in: 'header',
92
  schema: { type: 'string' }
93
  },
94
+ 'X-Timeout': {
95
+ description: `Specify timeout in seconds. Max 180.`,
96
+ in: 'header',
97
+ schema: { type: 'string' }
98
+ },
99
  }
100
  }
101
  }
 
147
  @Prop()
148
  userAgent?: string;
149
 
150
+ @Prop({
151
+ validate: (v: number) => v > 0 && v <= 180,
152
+ })
153
+ timeout?: number;
154
+
155
  static override from(input: any) {
156
  const instance = super.from(input) as CrawlerOptions;
157
  const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
 
188
  instance.cacheTolerance = cacheTolerance;
189
  }
190
 
191
+ let timeoutSeconds = parseInt(ctx?.req.get('x-timeout') || '');
192
+ if (!isNaN(timeoutSeconds)) {
193
+ instance.timeout = timeoutSeconds;
194
+ }
195
+
196
  const targetSelector = ctx?.req.get('x-target-selector');
197
  instance.targetSelector ??= targetSelector;
198
  const waitForSelector = ctx?.req.get('x-wait-for-selector');
backend/functions/src/services/puppeteer.ts CHANGED
@@ -66,6 +66,7 @@ export interface ScrappingOptions {
66
  waitForSelector?: string;
67
  minIntervalMs?: number;
68
  overrideUserAgent?: string;
 
69
  }
70
 
71
 
@@ -449,7 +450,10 @@ document.addEventListener('load', handlePageLoad);
449
  );
450
  });
451
 
452
- const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
 
 
 
453
  .catch((err) => {
454
  this.logger.warn(`Page ${sn}: Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
455
  return Promise.reject(new AssertionFailureError({
 
66
  waitForSelector?: string;
67
  minIntervalMs?: number;
68
  overrideUserAgent?: string;
69
+ timeoutMs?: number;
70
  }
71
 
72
 
 
450
  );
451
  });
452
 
453
+ const gotoPromise = page.goto(url, {
454
+ waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
455
+ timeout: options?.timeoutMs || 30_000
456
+ })
457
  .catch((err) => {
458
  this.logger.warn(`Page ${sn}: Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
459
  return Promise.reject(new AssertionFailureError({