Spaces:
Build error
Build error
fix: scrapMany and searcher
Browse files
- backend/functions/src/cloud-functions/crawler.ts +2 -1
- backend/functions/src/cloud-functions/{sercher-serper.ts → searcher-serper.ts} +6 -5
- backend/functions/src/cloud-functions/searcher.ts +6 -6
- backend/functions/src/services/jsdom.ts +1 -1
- backend/functions/src/services/snapshot-formatter.ts +15 -5
- backend/functions/src/stand-alone/search.ts +1 -1
- thinapps-shared +1 -1
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -752,7 +752,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 752 |
}
|
| 753 |
};
|
| 754 |
|
| 755 |
-
Promise.all(
|
| 756 |
iterators.map((it, idx) => handler(it, idx))
|
| 757 |
).finally(() => {
|
| 758 |
concluded = true;
|
|
@@ -767,6 +767,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 767 |
|
| 768 |
yield results;
|
| 769 |
}
|
|
|
|
| 770 |
} finally {
|
| 771 |
for (const x of iterators) {
|
| 772 |
x.return();
|
|
|
|
| 752 |
}
|
| 753 |
};
|
| 754 |
|
| 755 |
+
Promise.allSettled(
|
| 756 |
iterators.map((it, idx) => handler(it, idx))
|
| 757 |
).finally(() => {
|
| 758 |
concluded = true;
|
|
|
|
| 767 |
|
| 768 |
yield results;
|
| 769 |
}
|
| 770 |
+
yield results;
|
| 771 |
} finally {
|
| 772 |
for (const x of iterators) {
|
| 773 |
x.return();
|
backend/functions/src/cloud-functions/{sercher-serper.ts → searcher-serper.ts}
RENAMED
|
@@ -154,7 +154,7 @@ export class SearcherHost extends RPCHost {
|
|
| 154 |
delete crawlOpts.timeoutMs;
|
| 155 |
}
|
| 156 |
|
| 157 |
-
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic, crawlOpts,
|
| 158 |
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
|
| 159 |
count,
|
| 160 |
);
|
|
@@ -324,7 +324,7 @@ export class SearcherHost extends RPCHost {
|
|
| 324 |
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
|
| 325 |
const mapped = scrapped.map((x, i) => {
|
| 326 |
const upstreamSearchResult = searchResults[i];
|
| 327 |
-
if (!x
|
| 328 |
return {
|
| 329 |
url: upstreamSearchResult.link,
|
| 330 |
title: upstreamSearchResult.title,
|
|
@@ -370,7 +370,6 @@ export class SearcherHost extends RPCHost {
|
|
| 370 |
}
|
| 371 |
|
| 372 |
const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount);
|
| 373 |
-
filtered.toString = searchResults.toString;
|
| 374 |
|
| 375 |
const resultArray = filtered.map((x, i) => {
|
| 376 |
|
|
@@ -378,10 +377,11 @@ export class SearcherHost extends RPCHost {
|
|
| 378 |
...x,
|
| 379 |
toString(this: any) {
|
| 380 |
if (!this.content && this.description) {
|
| 381 |
-
if (this.title) {
|
|
|
|
| 382 |
return `[${i + 1}] Title: ${this.title}
|
| 383 |
[${i + 1}] URL Source: ${this.url}
|
| 384 |
-
[${i + 1}] Description: ${this.description}
|
| 385 |
`;
|
| 386 |
}
|
| 387 |
|
|
@@ -444,6 +444,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
|
|
| 444 |
return formattedPage.title &&
|
| 445 |
formattedPage.content ||
|
| 446 |
formattedPage.screenshotUrl ||
|
|
|
|
| 447 |
formattedPage.text ||
|
| 448 |
formattedPage.html;
|
| 449 |
}
|
|
|
|
| 154 |
delete crawlOpts.timeoutMs;
|
| 155 |
}
|
| 156 |
|
| 157 |
+
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, count + 2), crawlOpts,
|
| 158 |
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
|
| 159 |
count,
|
| 160 |
);
|
|
|
|
| 324 |
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
|
| 325 |
const mapped = scrapped.map((x, i) => {
|
| 326 |
const upstreamSearchResult = searchResults[i];
|
| 327 |
+
if (!x) {
|
| 328 |
return {
|
| 329 |
url: upstreamSearchResult.link,
|
| 330 |
title: upstreamSearchResult.title,
|
|
|
|
| 370 |
}
|
| 371 |
|
| 372 |
const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount);
|
|
|
|
| 373 |
|
| 374 |
const resultArray = filtered.map((x, i) => {
|
| 375 |
|
|
|
|
| 377 |
...x,
|
| 378 |
toString(this: any) {
|
| 379 |
if (!this.content && this.description) {
|
| 380 |
+
if (this.title || x.textRepresentation) {
|
| 381 |
+
const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
|
| 382 |
return `[${i + 1}] Title: ${this.title}
|
| 383 |
[${i + 1}] URL Source: ${this.url}
|
| 384 |
+
[${i + 1}] Description: ${this.description}${textRep}
|
| 385 |
`;
|
| 386 |
}
|
| 387 |
|
|
|
|
| 444 |
return formattedPage.title &&
|
| 445 |
formattedPage.content ||
|
| 446 |
formattedPage.screenshotUrl ||
|
| 447 |
+
formattedPage.pageshotUrl ||
|
| 448 |
formattedPage.text ||
|
| 449 |
formattedPage.html;
|
| 450 |
}
|
backend/functions/src/cloud-functions/searcher.ts
CHANGED
|
@@ -155,7 +155,7 @@ export class SearcherHost extends RPCHost {
|
|
| 155 |
delete crawlOpts.timeoutMs;
|
| 156 |
}
|
| 157 |
|
| 158 |
-
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
|
| 159 |
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
|
| 160 |
count,
|
| 161 |
);
|
|
@@ -325,7 +325,7 @@ export class SearcherHost extends RPCHost {
|
|
| 325 |
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
|
| 326 |
const mapped = scrapped.map((x, i) => {
|
| 327 |
const upstreamSearchResult = searchResults[i];
|
| 328 |
-
if (!x
|
| 329 |
return {
|
| 330 |
url: upstreamSearchResult.url,
|
| 331 |
title: upstreamSearchResult.title,
|
|
@@ -371,18 +371,17 @@ export class SearcherHost extends RPCHost {
|
|
| 371 |
}
|
| 372 |
|
| 373 |
const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount);
|
| 374 |
-
filtered.toString = searchResults.toString;
|
| 375 |
|
| 376 |
const resultArray = filtered.map((x, i) => {
|
| 377 |
-
|
| 378 |
return {
|
| 379 |
...x,
|
| 380 |
toString(this: any) {
|
| 381 |
if (!this.content && this.description) {
|
| 382 |
-
if (this.title) {
|
|
|
|
| 383 |
return `[${i + 1}] Title: ${this.title}
|
| 384 |
[${i + 1}] URL Source: ${this.url}
|
| 385 |
-
[${i + 1}] Description: ${this.description}
|
| 386 |
`;
|
| 387 |
}
|
| 388 |
|
|
@@ -445,6 +444,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
|
|
| 445 |
return formattedPage.title &&
|
| 446 |
formattedPage.content ||
|
| 447 |
formattedPage.screenshotUrl ||
|
|
|
|
| 448 |
formattedPage.text ||
|
| 449 |
formattedPage.html;
|
| 450 |
}
|
|
|
|
| 155 |
delete crawlOpts.timeoutMs;
|
| 156 |
}
|
| 157 |
|
| 158 |
+
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results.slice(0, count + 2), crawlOpts,
|
| 159 |
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
|
| 160 |
count,
|
| 161 |
);
|
|
|
|
| 325 |
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
|
| 326 |
const mapped = scrapped.map((x, i) => {
|
| 327 |
const upstreamSearchResult = searchResults[i];
|
| 328 |
+
if (!x) {
|
| 329 |
return {
|
| 330 |
url: upstreamSearchResult.url,
|
| 331 |
title: upstreamSearchResult.title,
|
|
|
|
| 371 |
}
|
| 372 |
|
| 373 |
const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount);
|
|
|
|
| 374 |
|
| 375 |
const resultArray = filtered.map((x, i) => {
|
|
|
|
| 376 |
return {
|
| 377 |
...x,
|
| 378 |
toString(this: any) {
|
| 379 |
if (!this.content && this.description) {
|
| 380 |
+
if (this.title || x.textRepresentation) {
|
| 381 |
+
const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
|
| 382 |
return `[${i + 1}] Title: ${this.title}
|
| 383 |
[${i + 1}] URL Source: ${this.url}
|
| 384 |
+
[${i + 1}] Description: ${this.description}${textRep}
|
| 385 |
`;
|
| 386 |
}
|
| 387 |
|
|
|
|
| 444 |
return formattedPage.title &&
|
| 445 |
formattedPage.content ||
|
| 446 |
formattedPage.screenshotUrl ||
|
| 447 |
+
formattedPage.pageshotUrl ||
|
| 448 |
formattedPage.text ||
|
| 449 |
formattedPage.html;
|
| 450 |
}
|
backend/functions/src/services/jsdom.ts
CHANGED
|
@@ -199,7 +199,7 @@ export class JSDomControl extends AsyncService {
|
|
| 199 |
}
|
| 200 |
|
| 201 |
@Threaded()
|
| 202 |
-
inferSnapshot(snapshot: PageSnapshot)
|
| 203 |
const t0 = Date.now();
|
| 204 |
const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
|
| 205 |
try {
|
|
|
|
| 199 |
}
|
| 200 |
|
| 201 |
@Threaded()
|
| 202 |
+
async inferSnapshot(snapshot: PageSnapshot) {
|
| 203 |
const t0 = Date.now();
|
| 204 |
const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
|
| 205 |
try {
|
backend/functions/src/services/snapshot-formatter.ts
CHANGED
|
@@ -101,7 +101,7 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 101 |
}, nominalUrl?: URL, urlValidMs = 3600 * 1000 * 4) {
|
| 102 |
const t0 = Date.now();
|
| 103 |
const f = {
|
| 104 |
-
...this.getGeneralSnapshotMixins(snapshot),
|
| 105 |
};
|
| 106 |
let modeOK = false;
|
| 107 |
|
|
@@ -190,6 +190,16 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 190 |
const dt = Date.now() - t0;
|
| 191 |
this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
|
| 192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
return f;
|
| 194 |
}
|
| 195 |
|
|
@@ -412,7 +422,7 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 412 |
.value();
|
| 413 |
}
|
| 414 |
if (this.threadLocal.get('withLinksSummary')) {
|
| 415 |
-
const links = this.jsdomControl.inferSnapshot(snapshot).links;
|
| 416 |
|
| 417 |
if (this.threadLocal.get('withLinksSummary') === 'all') {
|
| 418 |
formatted.links = links;
|
|
@@ -482,11 +492,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 482 |
return f as FormattedPage;
|
| 483 |
}
|
| 484 |
|
| 485 |
-
getGeneralSnapshotMixins(snapshot: PageSnapshot) {
|
| 486 |
let inferred;
|
| 487 |
const mixin: any = {};
|
| 488 |
if (this.threadLocal.get('withImagesSummary')) {
|
| 489 |
-
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
| 490 |
const imageSummary = {} as { [k: string]: string; };
|
| 491 |
const imageIdxTrack = new Map<string, number[]>();
|
| 492 |
|
|
@@ -511,7 +521,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 511 |
.value();
|
| 512 |
}
|
| 513 |
if (this.threadLocal.get('withLinksSummary')) {
|
| 514 |
-
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
| 515 |
if (this.threadLocal.get('withLinksSummary') === 'all') {
|
| 516 |
mixin.links = inferred.links;
|
| 517 |
} else {
|
|
|
|
| 101 |
}, nominalUrl?: URL, urlValidMs = 3600 * 1000 * 4) {
|
| 102 |
const t0 = Date.now();
|
| 103 |
const f = {
|
| 104 |
+
...(await this.getGeneralSnapshotMixins(snapshot)),
|
| 105 |
};
|
| 106 |
let modeOK = false;
|
| 107 |
|
|
|
|
| 190 |
const dt = Date.now() - t0;
|
| 191 |
this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
|
| 192 |
|
| 193 |
+
const formatted: FormattedPage = {
|
| 194 |
+
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
| 195 |
+
description: (snapshot.description || '').trim(),
|
| 196 |
+
url: nominalUrl?.toString() || snapshot.href?.trim(),
|
| 197 |
+
publishedTime: snapshot.parsed?.publishedTime || undefined,
|
| 198 |
+
[Symbol.dispose]: () => { },
|
| 199 |
+
};
|
| 200 |
+
|
| 201 |
+
Object.assign(f, formatted);
|
| 202 |
+
|
| 203 |
return f;
|
| 204 |
}
|
| 205 |
|
|
|
|
| 422 |
.value();
|
| 423 |
}
|
| 424 |
if (this.threadLocal.get('withLinksSummary')) {
|
| 425 |
+
const links = (await this.jsdomControl.inferSnapshot(snapshot)).links;
|
| 426 |
|
| 427 |
if (this.threadLocal.get('withLinksSummary') === 'all') {
|
| 428 |
formatted.links = links;
|
|
|
|
| 492 |
return f as FormattedPage;
|
| 493 |
}
|
| 494 |
|
| 495 |
+
async getGeneralSnapshotMixins(snapshot: PageSnapshot) {
|
| 496 |
let inferred;
|
| 497 |
const mixin: any = {};
|
| 498 |
if (this.threadLocal.get('withImagesSummary')) {
|
| 499 |
+
inferred ??= await this.jsdomControl.inferSnapshot(snapshot);
|
| 500 |
const imageSummary = {} as { [k: string]: string; };
|
| 501 |
const imageIdxTrack = new Map<string, number[]>();
|
| 502 |
|
|
|
|
| 521 |
.value();
|
| 522 |
}
|
| 523 |
if (this.threadLocal.get('withLinksSummary')) {
|
| 524 |
+
inferred ??= await this.jsdomControl.inferSnapshot(snapshot);
|
| 525 |
if (this.threadLocal.get('withLinksSummary') === 'all') {
|
| 526 |
mixin.links = inferred.links;
|
| 527 |
} else {
|
backend/functions/src/stand-alone/search.ts
CHANGED
|
@@ -15,7 +15,7 @@ import { Logger, CloudFunctionRegistry, AsyncContext } from '../shared';
|
|
| 15 |
import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
|
| 16 |
import { ExpressServer } from 'civkit/civ-rpc/express';
|
| 17 |
import http2 from 'http2';
|
| 18 |
-
import { SearcherHost } from '../cloud-functions/
|
| 19 |
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
|
| 20 |
import path from 'path';
|
| 21 |
import fs from 'fs';
|
|
|
|
| 15 |
import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
|
| 16 |
import { ExpressServer } from 'civkit/civ-rpc/express';
|
| 17 |
import http2 from 'http2';
|
| 18 |
+
import { SearcherHost } from '../cloud-functions/searcher';
|
| 19 |
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
|
| 20 |
import path from 'path';
|
| 21 |
import fs from 'fs';
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit b5e688359eaa87538ef5f43c1323ab92eca8ea33
|