Spaces:
Build error
Build error
fix: search descriptions
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -870,6 +870,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 870 |
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
|
| 871 |
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
|
| 872 |
this.threadLocal.set('userAgent', opts.userAgent);
|
|
|
|
|
|
|
|
|
|
| 873 |
|
| 874 |
const crawlOpts: ExtraScrappingOptions = {
|
| 875 |
proxyUrl: opts.proxyUrl,
|
|
@@ -878,6 +881,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 878 |
waitForSelector: opts.waitForSelector,
|
| 879 |
targetSelector: opts.targetSelector,
|
| 880 |
overrideUserAgent: opts.userAgent,
|
|
|
|
| 881 |
};
|
| 882 |
|
| 883 |
return crawlOpts;
|
|
|
|
| 870 |
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
|
| 871 |
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
|
| 872 |
this.threadLocal.set('userAgent', opts.userAgent);
|
| 873 |
+
if (opts.timeout) {
|
| 874 |
+
this.threadLocal.set('timeout', opts.timeout * 1000);
|
| 875 |
+
}
|
| 876 |
|
| 877 |
const crawlOpts: ExtraScrappingOptions = {
|
| 878 |
proxyUrl: opts.proxyUrl,
|
|
|
|
| 881 |
waitForSelector: opts.waitForSelector,
|
| 882 |
targetSelector: opts.targetSelector,
|
| 883 |
overrideUserAgent: opts.userAgent,
|
| 884 |
+
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
|
| 885 |
};
|
| 886 |
|
| 887 |
return crawlOpts;
|
backend/functions/src/cloud-functions/searcher.ts
CHANGED
|
@@ -30,7 +30,7 @@ export class SearcherHost extends RPCHost {
|
|
| 30 |
cacheValidMs = 1000 * 3600;
|
| 31 |
pageCacheToleranceMs = 1000 * 3600 * 24;
|
| 32 |
|
| 33 |
-
reasonableDelayMs =
|
| 34 |
|
| 35 |
targetResultCount = 5;
|
| 36 |
|
|
@@ -163,6 +163,10 @@ export class SearcherHost extends RPCHost {
|
|
| 163 |
throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
|
| 164 |
}
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
|
| 167 |
crawlerOptions.cacheTolerance || this.pageCacheToleranceMs
|
| 168 |
);
|
|
@@ -213,7 +217,7 @@ export class SearcherHost extends RPCHost {
|
|
| 213 |
chargeAmount = this.getChargeAmount(lastScrapped);
|
| 214 |
rpcReflect.return(lastScrapped);
|
| 215 |
earlyReturn = true;
|
| 216 |
-
}, this.reasonableDelayMs);
|
| 217 |
};
|
| 218 |
|
| 219 |
for await (const scrapped of it) {
|
|
@@ -259,7 +263,7 @@ export class SearcherHost extends RPCHost {
|
|
| 259 |
chargeAmount = this.getChargeAmount(lastScrapped);
|
| 260 |
rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
|
| 261 |
earlyReturn = true;
|
| 262 |
-
}, this.reasonableDelayMs);
|
| 263 |
};
|
| 264 |
|
| 265 |
for await (const scrapped of it) {
|
|
@@ -317,7 +321,12 @@ export class SearcherHost extends RPCHost {
|
|
| 317 |
description: upstreamSearchResult.description,
|
| 318 |
};
|
| 319 |
}
|
| 320 |
-
return this.crawler.formatSnapshot(mode, x, urls[i])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
});
|
| 322 |
|
| 323 |
const resultArray = await Promise.all(mapped) as FormattedPage[];
|
|
@@ -343,7 +352,7 @@ export class SearcherHost extends RPCHost {
|
|
| 343 |
return {
|
| 344 |
...x,
|
| 345 |
toString(this: any) {
|
| 346 |
-
if (this.description) {
|
| 347 |
if (this.title) {
|
| 348 |
return `[${i + 1}] Title: ${this.title}
|
| 349 |
[${i + 1}] URL Source: ${this.url}
|
|
@@ -355,6 +364,9 @@ export class SearcherHost extends RPCHost {
|
|
| 355 |
}
|
| 356 |
|
| 357 |
const mixins = [];
|
|
|
|
|
|
|
|
|
|
| 358 |
if (this.publishedTime) {
|
| 359 |
mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
|
| 360 |
}
|
|
|
|
| 30 |
cacheValidMs = 1000 * 3600;
|
| 31 |
pageCacheToleranceMs = 1000 * 3600 * 24;
|
| 32 |
|
| 33 |
+
reasonableDelayMs = 15_000;
|
| 34 |
|
| 35 |
targetResultCount = 5;
|
| 36 |
|
|
|
|
| 163 |
throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
|
| 164 |
}
|
| 165 |
|
| 166 |
+
if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) {
|
| 167 |
+
delete crawlOpts.timeoutMs;
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
|
| 171 |
crawlerOptions.cacheTolerance || this.pageCacheToleranceMs
|
| 172 |
);
|
|
|
|
| 217 |
chargeAmount = this.getChargeAmount(lastScrapped);
|
| 218 |
rpcReflect.return(lastScrapped);
|
| 219 |
earlyReturn = true;
|
| 220 |
+
}, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
|
| 221 |
};
|
| 222 |
|
| 223 |
for await (const scrapped of it) {
|
|
|
|
| 263 |
chargeAmount = this.getChargeAmount(lastScrapped);
|
| 264 |
rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
|
| 265 |
earlyReturn = true;
|
| 266 |
+
}, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
|
| 267 |
};
|
| 268 |
|
| 269 |
for await (const scrapped of it) {
|
|
|
|
| 321 |
description: upstreamSearchResult.description,
|
| 322 |
};
|
| 323 |
}
|
| 324 |
+
return this.crawler.formatSnapshot(mode, x, urls[i]).then((r) => {
|
| 325 |
+
r.title ??= upstreamSearchResult.title;
|
| 326 |
+
r.description = upstreamSearchResult.description;
|
| 327 |
+
|
| 328 |
+
return r;
|
| 329 |
+
});
|
| 330 |
});
|
| 331 |
|
| 332 |
const resultArray = await Promise.all(mapped) as FormattedPage[];
|
|
|
|
| 352 |
return {
|
| 353 |
...x,
|
| 354 |
toString(this: any) {
|
| 355 |
+
if (!this.content && this.description) {
|
| 356 |
if (this.title) {
|
| 357 |
return `[${i + 1}] Title: ${this.title}
|
| 358 |
[${i + 1}] URL Source: ${this.url}
|
|
|
|
| 364 |
}
|
| 365 |
|
| 366 |
const mixins = [];
|
| 367 |
+
if (this.description) {
|
| 368 |
+
mixins.push(`[${i + 1}] Description: ${this.description}`);
|
| 369 |
+
}
|
| 370 |
if (this.publishedTime) {
|
| 371 |
mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
|
| 372 |
}
|
backend/functions/src/dto/scrapping-options.ts
CHANGED
|
@@ -91,6 +91,11 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
|
| 91 |
in: 'header',
|
| 92 |
schema: { type: 'string' }
|
| 93 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
}
|
| 95 |
}
|
| 96 |
}
|
|
@@ -142,6 +147,11 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 142 |
@Prop()
|
| 143 |
userAgent?: string;
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
static override from(input: any) {
|
| 146 |
const instance = super.from(input) as CrawlerOptions;
|
| 147 |
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
|
@@ -178,6 +188,11 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 178 |
instance.cacheTolerance = cacheTolerance;
|
| 179 |
}
|
| 180 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
const targetSelector = ctx?.req.get('x-target-selector');
|
| 182 |
instance.targetSelector ??= targetSelector;
|
| 183 |
const waitForSelector = ctx?.req.get('x-wait-for-selector');
|
|
|
|
| 91 |
in: 'header',
|
| 92 |
schema: { type: 'string' }
|
| 93 |
},
|
| 94 |
+
'X-Timeout': {
|
| 95 |
+
description: `Specify timeout in seconds. Max 180.`,
|
| 96 |
+
in: 'header',
|
| 97 |
+
schema: { type: 'string' }
|
| 98 |
+
},
|
| 99 |
}
|
| 100 |
}
|
| 101 |
}
|
|
|
|
| 147 |
@Prop()
|
| 148 |
userAgent?: string;
|
| 149 |
|
| 150 |
+
@Prop({
|
| 151 |
+
validate: (v: number) => v > 0 && v <= 180,
|
| 152 |
+
})
|
| 153 |
+
timeout?: number;
|
| 154 |
+
|
| 155 |
static override from(input: any) {
|
| 156 |
const instance = super.from(input) as CrawlerOptions;
|
| 157 |
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
|
|
|
| 188 |
instance.cacheTolerance = cacheTolerance;
|
| 189 |
}
|
| 190 |
|
| 191 |
+
let timeoutSeconds = parseInt(ctx?.req.get('x-timeout') || '');
|
| 192 |
+
if (!isNaN(timeoutSeconds)) {
|
| 193 |
+
instance.timeout = timeoutSeconds;
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
const targetSelector = ctx?.req.get('x-target-selector');
|
| 197 |
instance.targetSelector ??= targetSelector;
|
| 198 |
const waitForSelector = ctx?.req.get('x-wait-for-selector');
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -66,6 +66,7 @@ export interface ScrappingOptions {
|
|
| 66 |
waitForSelector?: string;
|
| 67 |
minIntervalMs?: number;
|
| 68 |
overrideUserAgent?: string;
|
|
|
|
| 69 |
}
|
| 70 |
|
| 71 |
|
|
@@ -449,7 +450,10 @@ document.addEventListener('load', handlePageLoad);
|
|
| 449 |
);
|
| 450 |
});
|
| 451 |
|
| 452 |
-
const gotoPromise = page.goto(url, {
|
|
|
|
|
|
|
|
|
|
| 453 |
.catch((err) => {
|
| 454 |
this.logger.warn(`Page ${sn}: Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
|
| 455 |
return Promise.reject(new AssertionFailureError({
|
|
|
|
| 66 |
waitForSelector?: string;
|
| 67 |
minIntervalMs?: number;
|
| 68 |
overrideUserAgent?: string;
|
| 69 |
+
timeoutMs?: number;
|
| 70 |
}
|
| 71 |
|
| 72 |
|
|
|
|
| 450 |
);
|
| 451 |
});
|
| 452 |
|
| 453 |
+
const gotoPromise = page.goto(url, {
|
| 454 |
+
waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
|
| 455 |
+
timeout: options?.timeoutMs || 30_000
|
| 456 |
+
})
|
| 457 |
.catch((err) => {
|
| 458 |
this.logger.warn(`Page ${sn}: Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
|
| 459 |
return Promise.reject(new AssertionFailureError({
|