nomagick committed on
Commit
29774ac
·
unverified ·
1 Parent(s): 380bbff

fix: scrapMany and searcher

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -752,7 +752,7 @@ export class CrawlerHost extends RPCHost {
752
  }
753
  };
754
 
755
- Promise.all(
756
  iterators.map((it, idx) => handler(it, idx))
757
  ).finally(() => {
758
  concluded = true;
@@ -767,6 +767,7 @@ export class CrawlerHost extends RPCHost {
767
 
768
  yield results;
769
  }
 
770
  } finally {
771
  for (const x of iterators) {
772
  x.return();
 
752
  }
753
  };
754
 
755
+ Promise.allSettled(
756
  iterators.map((it, idx) => handler(it, idx))
757
  ).finally(() => {
758
  concluded = true;
 
767
 
768
  yield results;
769
  }
770
+ yield results;
771
  } finally {
772
  for (const x of iterators) {
773
  x.return();
backend/functions/src/cloud-functions/{sercher-serper.ts → searcher-serper.ts} RENAMED
@@ -154,7 +154,7 @@ export class SearcherHost extends RPCHost {
154
  delete crawlOpts.timeoutMs;
155
  }
156
 
157
- const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic, crawlOpts,
158
  CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
159
  count,
160
  );
@@ -324,7 +324,7 @@ export class SearcherHost extends RPCHost {
324
  for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
325
  const mapped = scrapped.map((x, i) => {
326
  const upstreamSearchResult = searchResults[i];
327
- if (!x || (!x.parsed && mode !== 'markdown')) {
328
  return {
329
  url: upstreamSearchResult.link,
330
  title: upstreamSearchResult.title,
@@ -370,7 +370,6 @@ export class SearcherHost extends RPCHost {
370
  }
371
 
372
  const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount);
373
- filtered.toString = searchResults.toString;
374
 
375
  const resultArray = filtered.map((x, i) => {
376
 
@@ -378,10 +377,11 @@ export class SearcherHost extends RPCHost {
378
  ...x,
379
  toString(this: any) {
380
  if (!this.content && this.description) {
381
- if (this.title) {
 
382
  return `[${i + 1}] Title: ${this.title}
383
  [${i + 1}] URL Source: ${this.url}
384
- [${i + 1}] Description: ${this.description}
385
  `;
386
  }
387
 
@@ -444,6 +444,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
444
  return formattedPage.title &&
445
  formattedPage.content ||
446
  formattedPage.screenshotUrl ||
 
447
  formattedPage.text ||
448
  formattedPage.html;
449
  }
 
154
  delete crawlOpts.timeoutMs;
155
  }
156
 
157
+ const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, count + 2), crawlOpts,
158
  CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
159
  count,
160
  );
 
324
  for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
325
  const mapped = scrapped.map((x, i) => {
326
  const upstreamSearchResult = searchResults[i];
327
+ if (!x) {
328
  return {
329
  url: upstreamSearchResult.link,
330
  title: upstreamSearchResult.title,
 
370
  }
371
 
372
  const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount);
 
373
 
374
  const resultArray = filtered.map((x, i) => {
375
 
 
377
  ...x,
378
  toString(this: any) {
379
  if (!this.content && this.description) {
380
+ if (this.title || x.textRepresentation) {
381
+ const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
382
  return `[${i + 1}] Title: ${this.title}
383
  [${i + 1}] URL Source: ${this.url}
384
+ [${i + 1}] Description: ${this.description}${textRep}
385
  `;
386
  }
387
 
 
444
  return formattedPage.title &&
445
  formattedPage.content ||
446
  formattedPage.screenshotUrl ||
447
+ formattedPage.pageshotUrl ||
448
  formattedPage.text ||
449
  formattedPage.html;
450
  }
backend/functions/src/cloud-functions/searcher.ts CHANGED
@@ -155,7 +155,7 @@ export class SearcherHost extends RPCHost {
155
  delete crawlOpts.timeoutMs;
156
  }
157
 
158
- const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
159
  CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
160
  count,
161
  );
@@ -325,7 +325,7 @@ export class SearcherHost extends RPCHost {
325
  for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
326
  const mapped = scrapped.map((x, i) => {
327
  const upstreamSearchResult = searchResults[i];
328
- if (!x || (!x.parsed && mode !== 'markdown')) {
329
  return {
330
  url: upstreamSearchResult.url,
331
  title: upstreamSearchResult.title,
@@ -371,18 +371,17 @@ export class SearcherHost extends RPCHost {
371
  }
372
 
373
  const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount);
374
- filtered.toString = searchResults.toString;
375
 
376
  const resultArray = filtered.map((x, i) => {
377
-
378
  return {
379
  ...x,
380
  toString(this: any) {
381
  if (!this.content && this.description) {
382
- if (this.title) {
 
383
  return `[${i + 1}] Title: ${this.title}
384
  [${i + 1}] URL Source: ${this.url}
385
- [${i + 1}] Description: ${this.description}
386
  `;
387
  }
388
 
@@ -445,6 +444,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
445
  return formattedPage.title &&
446
  formattedPage.content ||
447
  formattedPage.screenshotUrl ||
 
448
  formattedPage.text ||
449
  formattedPage.html;
450
  }
 
155
  delete crawlOpts.timeoutMs;
156
  }
157
 
158
+ const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results.slice(0, count + 2), crawlOpts,
159
  CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
160
  count,
161
  );
 
325
  for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
326
  const mapped = scrapped.map((x, i) => {
327
  const upstreamSearchResult = searchResults[i];
328
+ if (!x) {
329
  return {
330
  url: upstreamSearchResult.url,
331
  title: upstreamSearchResult.title,
 
371
  }
372
 
373
  const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount);
 
374
 
375
  const resultArray = filtered.map((x, i) => {
 
376
  return {
377
  ...x,
378
  toString(this: any) {
379
  if (!this.content && this.description) {
380
+ if (this.title || x.textRepresentation) {
381
+ const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
382
  return `[${i + 1}] Title: ${this.title}
383
  [${i + 1}] URL Source: ${this.url}
384
+ [${i + 1}] Description: ${this.description}${textRep}
385
  `;
386
  }
387
 
 
444
  return formattedPage.title &&
445
  formattedPage.content ||
446
  formattedPage.screenshotUrl ||
447
+ formattedPage.pageshotUrl ||
448
  formattedPage.text ||
449
  formattedPage.html;
450
  }
backend/functions/src/services/jsdom.ts CHANGED
@@ -199,7 +199,7 @@ export class JSDomControl extends AsyncService {
199
  }
200
 
201
  @Threaded()
202
- inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
203
  const t0 = Date.now();
204
  const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
205
  try {
 
199
  }
200
 
201
  @Threaded()
202
+ async inferSnapshot(snapshot: PageSnapshot) {
203
  const t0 = Date.now();
204
  const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
205
  try {
backend/functions/src/services/snapshot-formatter.ts CHANGED
@@ -101,7 +101,7 @@ export class SnapshotFormatter extends AsyncService {
101
  }, nominalUrl?: URL, urlValidMs = 3600 * 1000 * 4) {
102
  const t0 = Date.now();
103
  const f = {
104
- ...this.getGeneralSnapshotMixins(snapshot),
105
  };
106
  let modeOK = false;
107
 
@@ -190,6 +190,16 @@ export class SnapshotFormatter extends AsyncService {
190
  const dt = Date.now() - t0;
191
  this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
192
 
 
 
 
 
 
 
 
 
 
 
193
  return f;
194
  }
195
 
@@ -412,7 +422,7 @@ export class SnapshotFormatter extends AsyncService {
412
  .value();
413
  }
414
  if (this.threadLocal.get('withLinksSummary')) {
415
- const links = this.jsdomControl.inferSnapshot(snapshot).links;
416
 
417
  if (this.threadLocal.get('withLinksSummary') === 'all') {
418
  formatted.links = links;
@@ -482,11 +492,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
482
  return f as FormattedPage;
483
  }
484
 
485
- getGeneralSnapshotMixins(snapshot: PageSnapshot) {
486
  let inferred;
487
  const mixin: any = {};
488
  if (this.threadLocal.get('withImagesSummary')) {
489
- inferred ??= this.jsdomControl.inferSnapshot(snapshot);
490
  const imageSummary = {} as { [k: string]: string; };
491
  const imageIdxTrack = new Map<string, number[]>();
492
 
@@ -511,7 +521,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
511
  .value();
512
  }
513
  if (this.threadLocal.get('withLinksSummary')) {
514
- inferred ??= this.jsdomControl.inferSnapshot(snapshot);
515
  if (this.threadLocal.get('withLinksSummary') === 'all') {
516
  mixin.links = inferred.links;
517
  } else {
 
101
  }, nominalUrl?: URL, urlValidMs = 3600 * 1000 * 4) {
102
  const t0 = Date.now();
103
  const f = {
104
+ ...(await this.getGeneralSnapshotMixins(snapshot)),
105
  };
106
  let modeOK = false;
107
 
 
190
  const dt = Date.now() - t0;
191
  this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
192
 
193
+ const formatted: FormattedPage = {
194
+ title: (snapshot.parsed?.title || snapshot.title || '').trim(),
195
+ description: (snapshot.description || '').trim(),
196
+ url: nominalUrl?.toString() || snapshot.href?.trim(),
197
+ publishedTime: snapshot.parsed?.publishedTime || undefined,
198
+ [Symbol.dispose]: () => { },
199
+ };
200
+
201
+ Object.assign(f, formatted);
202
+
203
  return f;
204
  }
205
 
 
422
  .value();
423
  }
424
  if (this.threadLocal.get('withLinksSummary')) {
425
+ const links = (await this.jsdomControl.inferSnapshot(snapshot)).links;
426
 
427
  if (this.threadLocal.get('withLinksSummary') === 'all') {
428
  formatted.links = links;
 
492
  return f as FormattedPage;
493
  }
494
 
495
+ async getGeneralSnapshotMixins(snapshot: PageSnapshot) {
496
  let inferred;
497
  const mixin: any = {};
498
  if (this.threadLocal.get('withImagesSummary')) {
499
+ inferred ??= await this.jsdomControl.inferSnapshot(snapshot);
500
  const imageSummary = {} as { [k: string]: string; };
501
  const imageIdxTrack = new Map<string, number[]>();
502
 
 
521
  .value();
522
  }
523
  if (this.threadLocal.get('withLinksSummary')) {
524
+ inferred ??= await this.jsdomControl.inferSnapshot(snapshot);
525
  if (this.threadLocal.get('withLinksSummary') === 'all') {
526
  mixin.links = inferred.links;
527
  } else {
backend/functions/src/stand-alone/search.ts CHANGED
@@ -15,7 +15,7 @@ import { Logger, CloudFunctionRegistry, AsyncContext } from '../shared';
15
  import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
16
  import { ExpressServer } from 'civkit/civ-rpc/express';
17
  import http2 from 'http2';
18
- import { SearcherHost } from '../cloud-functions/sercher-serper';
19
  import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
20
  import path from 'path';
21
  import fs from 'fs';
 
15
  import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
16
  import { ExpressServer } from 'civkit/civ-rpc/express';
17
  import http2 from 'http2';
18
+ import { SearcherHost } from '../cloud-functions/searcher';
19
  import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
20
  import path from 'path';
21
  import fs from 'fs';
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 5e25cdd295bdbc41422055491532ea713c142b45
 
1
+ Subproject commit b5e688359eaa87538ef5f43c1323ab92eca8ea33