Spaces:
Build error
Build error
fix: timeout respect
Browse files
- src/api/crawler.ts +8 -4
- src/api/searcher-serper.ts +0 -5
- src/dto/crawler-options.ts +4 -1
- src/services/puppeteer.ts +51 -418
src/api/crawler.ts
CHANGED
|
@@ -116,6 +116,10 @@ export class CrawlerHost extends RPCHost {
|
|
| 116 |
if (snapshot.isIntermediate) {
|
| 117 |
return;
|
| 118 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
if (options.locale) {
|
| 120 |
Reflect.set(snapshot, 'locale', options.locale);
|
| 121 |
}
|
|
@@ -313,7 +317,6 @@ export class CrawlerHost extends RPCHost {
|
|
| 313 |
throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
|
| 314 |
}
|
| 315 |
}
|
| 316 |
-
|
| 317 |
const crawlOpts = await this.configure(crawlerOptions);
|
| 318 |
if (crawlerOptions.robotsTxt) {
|
| 319 |
await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
|
|
@@ -461,7 +464,6 @@ export class CrawlerHost extends RPCHost {
|
|
| 461 |
}
|
| 462 |
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
| 463 |
}
|
| 464 |
-
|
| 465 |
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 466 |
chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
|
| 467 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
|
@@ -798,6 +800,8 @@ export class CrawlerHost extends RPCHost {
|
|
| 798 |
}
|
| 799 |
|
| 800 |
if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && !this.knownUrlThatSideLoadingWouldCrashTheBrowser(urlToCrawl)) {
|
|
|
|
|
|
|
| 801 |
try {
|
| 802 |
const altOpts = { ...crawlOpts };
|
| 803 |
let sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
|
|
@@ -832,7 +836,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 832 |
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
|
| 833 |
draftSnapshot.title ??= analyzed.title;
|
| 834 |
draftSnapshot.isIntermediate = true;
|
| 835 |
-
if (
|
| 836 |
yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
|
| 837 |
}
|
| 838 |
let fallbackProxyIsUsed = false;
|
|
@@ -858,7 +862,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 858 |
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
|
| 859 |
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
|
| 860 |
proxySnapshot.isIntermediate = true;
|
| 861 |
-
if (
|
| 862 |
yield this.jsdomControl.narrowSnapshot(proxySnapshot, crawlOpts);
|
| 863 |
}
|
| 864 |
sideLoaded = proxyLoaded;
|
|
|
|
| 116 |
if (snapshot.isIntermediate) {
|
| 117 |
return;
|
| 118 |
}
|
| 119 |
+
if (!snapshot.lastMutationIdle) {
|
| 120 |
+
// Never reached mutationIdle, presumably too short timeout
|
| 121 |
+
return;
|
| 122 |
+
}
|
| 123 |
if (options.locale) {
|
| 124 |
Reflect.set(snapshot, 'locale', options.locale);
|
| 125 |
}
|
|
|
|
| 317 |
throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
|
| 318 |
}
|
| 319 |
}
|
|
|
|
| 320 |
const crawlOpts = await this.configure(crawlerOptions);
|
| 321 |
if (crawlerOptions.robotsTxt) {
|
| 322 |
await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
|
|
|
|
| 464 |
}
|
| 465 |
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
| 466 |
}
|
|
|
|
| 467 |
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 468 |
chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
|
| 469 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
|
|
|
| 800 |
}
|
| 801 |
|
| 802 |
if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && !this.knownUrlThatSideLoadingWouldCrashTheBrowser(urlToCrawl)) {
|
| 803 |
+
const sideLoadSnapshotPermitted = crawlerOpts?.browserIsNotRequired() &&
|
| 804 |
+
[RESPOND_TIMING.HTML, RESPOND_TIMING.VISIBLE_CONTENT].includes(crawlerOpts.presumedRespondTiming);
|
| 805 |
try {
|
| 806 |
const altOpts = { ...crawlOpts };
|
| 807 |
let sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
|
|
|
|
| 836 |
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
|
| 837 |
draftSnapshot.title ??= analyzed.title;
|
| 838 |
draftSnapshot.isIntermediate = true;
|
| 839 |
+
if (sideLoadSnapshotPermitted) {
|
| 840 |
yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
|
| 841 |
}
|
| 842 |
let fallbackProxyIsUsed = false;
|
|
|
|
| 862 |
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
|
| 863 |
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
|
| 864 |
proxySnapshot.isIntermediate = true;
|
| 865 |
+
if (sideLoadSnapshotPermitted) {
|
| 866 |
yield this.jsdomControl.narrowSnapshot(proxySnapshot, crawlOpts);
|
| 867 |
}
|
| 868 |
sideLoaded = proxyLoaded;
|
src/api/searcher-serper.ts
CHANGED
|
@@ -318,11 +318,6 @@ export class SearcherHost extends RPCHost {
|
|
| 318 |
throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
|
| 319 |
}
|
| 320 |
|
| 321 |
-
if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) {
|
| 322 |
-
delete crawlOpts.timeoutMs;
|
| 323 |
-
}
|
| 324 |
-
|
| 325 |
-
|
| 326 |
let lastScrapped: any[] | undefined;
|
| 327 |
const targetResultCount = crawlWithoutContent ? count : count + 2;
|
| 328 |
const trimmedResults = results.filter((x) => Boolean(x.link)).slice(0, targetResultCount).map((x) => this.mapToFinalResults(x));
|
|
|
|
| 318 |
throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
|
| 319 |
}
|
| 320 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
let lastScrapped: any[] | undefined;
|
| 322 |
const targetResultCount = crawlWithoutContent ? count : count + 2;
|
| 323 |
const trimmedResults = results.filter((x) => Boolean(x.link)).slice(0, targetResultCount).map((x) => this.mapToFinalResults(x));
|
src/dto/crawler-options.ts
CHANGED
|
@@ -655,8 +655,11 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 655 |
if (this.respondWith.includes('lm')) {
|
| 656 |
return false;
|
| 657 |
}
|
|
|
|
|
|
|
|
|
|
| 658 |
|
| 659 |
-
return
|
| 660 |
}
|
| 661 |
|
| 662 |
isCacheQueryApplicable() {
|
|
|
|
| 655 |
if (this.respondWith.includes('lm')) {
|
| 656 |
return false;
|
| 657 |
}
|
| 658 |
+
if (this.withIframe) {
|
| 659 |
+
return false;
|
| 660 |
+
}
|
| 661 |
|
| 662 |
+
return !snapshot.isIntermediate;
|
| 663 |
}
|
| 664 |
|
| 665 |
isCacheQueryApplicable() {
|
src/services/puppeteer.ts
CHANGED
|
@@ -846,7 +846,6 @@ export class PuppeteerControl extends AsyncService {
|
|
| 846 |
async *scrap(parsedUrl: URL, options: ScrappingOptions = {}): AsyncGenerator<PageSnapshot | undefined> {
|
| 847 |
// parsedUrl.search = '';
|
| 848 |
const url = parsedUrl.toString();
|
| 849 |
-
|
| 850 |
let snapshot: PageSnapshot | undefined;
|
| 851 |
let screenshot: Buffer | undefined;
|
| 852 |
let pageshot: Buffer | undefined;
|
|
@@ -1097,7 +1096,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 1097 |
nextSnapshotDeferred.promise.finally(() => {
|
| 1098 |
this.off('crippled', crippleListener);
|
| 1099 |
});
|
| 1100 |
-
let finalized = false;
|
| 1101 |
const hdl = (s: any) => {
|
| 1102 |
if (snapshot === s) {
|
| 1103 |
return;
|
|
@@ -1143,6 +1142,39 @@ export class PuppeteerControl extends AsyncService {
|
|
| 1143 |
goToOptions.referer = options.referer;
|
| 1144 |
}
|
| 1145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1146 |
const delayPromise = delay(timeout);
|
| 1147 |
const gotoPromise = page.goto(url, goToOptions)
|
| 1148 |
.catch((err) => {
|
|
@@ -1170,50 +1202,14 @@ export class PuppeteerControl extends AsyncService {
|
|
| 1170 |
// Calling evaluate directly may stall the process.
|
| 1171 |
if (!snapshot) {
|
| 1172 |
if (stuff instanceof Error) {
|
| 1173 |
-
finalized = true;
|
| 1174 |
throw stuff;
|
| 1175 |
}
|
| 1176 |
}
|
| 1177 |
await Promise.race([Promise.allSettled([...pageScriptEvaluations, ...frameScriptEvaluations]), delayPromise])
|
| 1178 |
.catch(() => void 0);
|
| 1179 |
-
|
| 1180 |
-
|
| 1181 |
-
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 1182 |
-
screenshot = (await this.takeScreenShot(page)) || screenshot;
|
| 1183 |
-
pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
|
| 1184 |
-
if (snapshot) {
|
| 1185 |
-
snapshot.childFrames = await pSubFrameSnapshots;
|
| 1186 |
-
}
|
| 1187 |
-
} catch (err: any) {
|
| 1188 |
-
this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
|
| 1189 |
-
if (stuff instanceof Error) {
|
| 1190 |
-
finalized = true;
|
| 1191 |
-
throw stuff;
|
| 1192 |
-
}
|
| 1193 |
-
}
|
| 1194 |
-
if (!snapshot?.html) {
|
| 1195 |
-
if (stuff instanceof Error) {
|
| 1196 |
-
finalized = true;
|
| 1197 |
-
throw stuff;
|
| 1198 |
-
}
|
| 1199 |
-
}
|
| 1200 |
-
|
| 1201 |
-
finalized = true;
|
| 1202 |
-
if (snapshot?.html) {
|
| 1203 |
-
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
| 1204 |
-
this.emit(
|
| 1205 |
-
'crawled',
|
| 1206 |
-
{
|
| 1207 |
-
...snapshot,
|
| 1208 |
-
status: navigationResponse?.status(),
|
| 1209 |
-
statusText: navigationResponse?.statusText(),
|
| 1210 |
-
pdfs: _.uniq(pdfUrls), screenshot, pageshot,
|
| 1211 |
-
},
|
| 1212 |
-
{ ...options, url: parsedUrl }
|
| 1213 |
-
);
|
| 1214 |
-
}
|
| 1215 |
});
|
| 1216 |
-
let waitForPromise: Promise<any> | undefined;
|
| 1217 |
if (options.waitForSelector) {
|
| 1218 |
const t0 = Date.now();
|
| 1219 |
waitForPromise = nextSnapshotDeferred.promise.then(() => {
|
|
@@ -1224,19 +1220,12 @@ export class PuppeteerControl extends AsyncService {
|
|
| 1224 |
const p = (Array.isArray(options.waitForSelector) ?
|
| 1225 |
Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
|
| 1226 |
page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
|
| 1227 |
-
.then(
|
| 1228 |
-
|
| 1229 |
-
|
| 1230 |
-
screenshot = (await this.takeScreenShot(page)) || screenshot;
|
| 1231 |
-
pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
|
| 1232 |
-
if (snapshot) {
|
| 1233 |
-
snapshot.childFrames = await pSubFrameSnapshots;
|
| 1234 |
-
}
|
| 1235 |
-
finalized = true;
|
| 1236 |
})
|
| 1237 |
.catch((err) => {
|
| 1238 |
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err });
|
| 1239 |
-
waitForPromise = undefined;
|
| 1240 |
});
|
| 1241 |
return p as any;
|
| 1242 |
});
|
|
@@ -1254,11 +1243,8 @@ export class PuppeteerControl extends AsyncService {
|
|
| 1254 |
}
|
| 1255 |
let error;
|
| 1256 |
await Promise.race(ckpt).catch((err) => error = err);
|
| 1257 |
-
if (finalized && !error) {
|
| 1258 |
if (!snapshot && !screenshot) {
|
| 1259 |
-
if (error) {
|
| 1260 |
-
throw error;
|
| 1261 |
-
}
|
| 1262 |
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
| 1263 |
}
|
| 1264 |
yield {
|
|
@@ -1286,10 +1272,20 @@ export class PuppeteerControl extends AsyncService {
|
|
| 1286 |
if (error) {
|
| 1287 |
throw error;
|
| 1288 |
}
|
|
|
|
|
|
|
|
|
|
| 1289 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1290 |
} finally {
|
| 1291 |
this.pagePhase.set(page, 'background');
|
| 1292 |
-
|
| 1293 |
page.off('snapshot', hdl);
|
| 1294 |
this.ditchPage(page);
|
| 1295 |
});
|
|
@@ -1329,369 +1325,6 @@ export class PuppeteerControl extends AsyncService {
|
|
| 1329 |
return r.filter(Boolean);
|
| 1330 |
}
|
| 1331 |
|
| 1332 |
-
async simpleScrap(parsedUrl: URL, options: ScrappingOptions = {}): Promise<PageSnapshot> {
|
| 1333 |
-
// parsedUrl.search = '';
|
| 1334 |
-
const url = parsedUrl.toString();
|
| 1335 |
-
let snapshot: PageSnapshot | undefined;
|
| 1336 |
-
let navigationResponse: HTTPResponse | undefined;
|
| 1337 |
-
const page = await this.getNextPage();
|
| 1338 |
-
this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx);
|
| 1339 |
-
this.pagePhase.set(page, 'active');
|
| 1340 |
-
page.on('response', (resp) => {
|
| 1341 |
-
this.blackHoleDetector.itWorked();
|
| 1342 |
-
const req = resp.request();
|
| 1343 |
-
if (req.frame() === page.mainFrame() && req.isNavigationRequest()) {
|
| 1344 |
-
navigationResponse = resp;
|
| 1345 |
-
}
|
| 1346 |
-
if (!resp.ok()) {
|
| 1347 |
-
return;
|
| 1348 |
-
}
|
| 1349 |
-
});
|
| 1350 |
-
page.on('request', async (req) => {
|
| 1351 |
-
if (req.isInterceptResolutionHandled()) {
|
| 1352 |
-
return;
|
| 1353 |
-
};
|
| 1354 |
-
const reqUrlParsed = new URL(req.url());
|
| 1355 |
-
if (!reqUrlParsed.protocol.startsWith('http')) {
|
| 1356 |
-
const overrides = req.continueRequestOverrides();
|
| 1357 |
-
|
| 1358 |
-
return req.continue(overrides, 0);
|
| 1359 |
-
}
|
| 1360 |
-
const typ = req.resourceType();
|
| 1361 |
-
if (typ === 'media') {
|
| 1362 |
-
// Non-cooperative answer to block all media requests.
|
| 1363 |
-
return req.abort('blockedbyclient');
|
| 1364 |
-
}
|
| 1365 |
-
if (!options.proxyResources) {
|
| 1366 |
-
const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
|
| 1367 |
-
if (!isDocRequest) {
|
| 1368 |
-
if (options.extraHeaders) {
|
| 1369 |
-
const overrides = req.continueRequestOverrides();
|
| 1370 |
-
const continueArgs = [{
|
| 1371 |
-
...overrides,
|
| 1372 |
-
headers: {
|
| 1373 |
-
...req.headers(),
|
| 1374 |
-
...overrides?.headers,
|
| 1375 |
-
...options.extraHeaders,
|
| 1376 |
-
}
|
| 1377 |
-
}, 1] as const;
|
| 1378 |
-
|
| 1379 |
-
return req.continue(continueArgs[0], continueArgs[1]);
|
| 1380 |
-
}
|
| 1381 |
-
const overrides = req.continueRequestOverrides();
|
| 1382 |
-
|
| 1383 |
-
return req.continue(overrides, 0);
|
| 1384 |
-
}
|
| 1385 |
-
}
|
| 1386 |
-
const sideload = options.sideLoad;
|
| 1387 |
-
|
| 1388 |
-
const impersonate = sideload?.impersonate[reqUrlParsed.href];
|
| 1389 |
-
if (impersonate) {
|
| 1390 |
-
let body;
|
| 1391 |
-
if (impersonate.body) {
|
| 1392 |
-
body = await readFile(await impersonate.body.filePath);
|
| 1393 |
-
if (req.isInterceptResolutionHandled()) {
|
| 1394 |
-
return;
|
| 1395 |
-
}
|
| 1396 |
-
}
|
| 1397 |
-
return req.respond({
|
| 1398 |
-
status: impersonate.status,
|
| 1399 |
-
headers: impersonate.headers,
|
| 1400 |
-
contentType: impersonate.contentType,
|
| 1401 |
-
body: body ? Uint8Array.from(body) : undefined,
|
| 1402 |
-
}, 999);
|
| 1403 |
-
}
|
| 1404 |
-
|
| 1405 |
-
const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
|
| 1406 |
-
const ctx = this.lifeCycleTrack.get(page);
|
| 1407 |
-
if (proxy && ctx) {
|
| 1408 |
-
return await this.asyncLocalContext.bridge(ctx, async () => {
|
| 1409 |
-
try {
|
| 1410 |
-
const curled = await this.curlControl.sideLoad(reqUrlParsed, {
|
| 1411 |
-
...options,
|
| 1412 |
-
method: req.method(),
|
| 1413 |
-
body: req.postData(),
|
| 1414 |
-
extraHeaders: {
|
| 1415 |
-
...req.headers(),
|
| 1416 |
-
...options.extraHeaders,
|
| 1417 |
-
},
|
| 1418 |
-
proxyUrl: proxy
|
| 1419 |
-
});
|
| 1420 |
-
if (req.isInterceptResolutionHandled()) {
|
| 1421 |
-
return;
|
| 1422 |
-
};
|
| 1423 |
-
|
| 1424 |
-
if (curled.chain.length === 1) {
|
| 1425 |
-
if (!curled.file) {
|
| 1426 |
-
return req.respond({
|
| 1427 |
-
status: curled.status,
|
| 1428 |
-
headers: _.omit(curled.headers, 'result'),
|
| 1429 |
-
contentType: curled.contentType,
|
| 1430 |
-
}, 3);
|
| 1431 |
-
}
|
| 1432 |
-
const body = await readFile(await curled.file.filePath);
|
| 1433 |
-
if (req.isInterceptResolutionHandled()) {
|
| 1434 |
-
return;
|
| 1435 |
-
};
|
| 1436 |
-
return req.respond({
|
| 1437 |
-
status: curled.status,
|
| 1438 |
-
headers: _.omit(curled.headers, 'result'),
|
| 1439 |
-
contentType: curled.contentType,
|
| 1440 |
-
body: Uint8Array.from(body),
|
| 1441 |
-
}, 3);
|
| 1442 |
-
}
|
| 1443 |
-
options.sideLoad ??= curled.sideLoadOpts;
|
| 1444 |
-
_.merge(options.sideLoad, curled.sideLoadOpts);
|
| 1445 |
-
const firstReq = curled.chain[0];
|
| 1446 |
-
|
| 1447 |
-
return req.respond({
|
| 1448 |
-
status: firstReq.result!.code,
|
| 1449 |
-
headers: _.omit(firstReq, 'result'),
|
| 1450 |
-
}, 3);
|
| 1451 |
-
} catch (err: any) {
|
| 1452 |
-
this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
|
| 1453 |
-
}
|
| 1454 |
-
if (req.isInterceptResolutionHandled()) {
|
| 1455 |
-
return;
|
| 1456 |
-
};
|
| 1457 |
-
const overrides = req.continueRequestOverrides();
|
| 1458 |
-
const continueArgs = [{
|
| 1459 |
-
...overrides,
|
| 1460 |
-
headers: {
|
| 1461 |
-
...req.headers(),
|
| 1462 |
-
...overrides?.headers,
|
| 1463 |
-
...options.extraHeaders,
|
| 1464 |
-
}
|
| 1465 |
-
}, 1] as const;
|
| 1466 |
-
|
| 1467 |
-
return req.continue(continueArgs[0], continueArgs[1]);
|
| 1468 |
-
});
|
| 1469 |
-
}
|
| 1470 |
-
|
| 1471 |
-
if (req.isInterceptResolutionHandled()) {
|
| 1472 |
-
return;
|
| 1473 |
-
};
|
| 1474 |
-
const overrides = req.continueRequestOverrides();
|
| 1475 |
-
const continueArgs = [{
|
| 1476 |
-
...overrides,
|
| 1477 |
-
headers: {
|
| 1478 |
-
...req.headers(),
|
| 1479 |
-
...overrides?.headers,
|
| 1480 |
-
...options.extraHeaders,
|
| 1481 |
-
}
|
| 1482 |
-
}, 1] as const;
|
| 1483 |
-
|
| 1484 |
-
return req.continue(continueArgs[0], continueArgs[1]);
|
| 1485 |
-
});
|
| 1486 |
-
|
| 1487 |
-
const sn = this.snMap.get(page);
|
| 1488 |
-
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
| 1489 |
-
if (options.locale) {
|
| 1490 |
-
// Add headers via request interception to walk around this bug
|
| 1491 |
-
// https://github.com/puppeteer/puppeteer/issues/10235
|
| 1492 |
-
// await page.setExtraHTTPHeaders({
|
| 1493 |
-
// 'Accept-Language': options.locale
|
| 1494 |
-
// });
|
| 1495 |
-
|
| 1496 |
-
await page.evaluateOnNewDocument(() => {
|
| 1497 |
-
Object.defineProperty(navigator, "language", {
|
| 1498 |
-
get: function () {
|
| 1499 |
-
return options.locale;
|
| 1500 |
-
}
|
| 1501 |
-
});
|
| 1502 |
-
Object.defineProperty(navigator, "languages", {
|
| 1503 |
-
get: function () {
|
| 1504 |
-
return [options.locale];
|
| 1505 |
-
}
|
| 1506 |
-
});
|
| 1507 |
-
});
|
| 1508 |
-
}
|
| 1509 |
-
|
| 1510 |
-
if (options.cookies) {
|
| 1511 |
-
const mapped = options.cookies.map((x) => {
|
| 1512 |
-
const draft: CookieParam = {
|
| 1513 |
-
name: x.name,
|
| 1514 |
-
value: encodeURIComponent(x.value),
|
| 1515 |
-
secure: x.secure,
|
| 1516 |
-
domain: x.domain,
|
| 1517 |
-
path: x.path,
|
| 1518 |
-
expires: x.expires ? Math.floor(x.expires.valueOf() / 1000) : undefined,
|
| 1519 |
-
sameSite: x.sameSite as any,
|
| 1520 |
-
};
|
| 1521 |
-
if (!draft.expires && x.maxAge) {
|
| 1522 |
-
draft.expires = Math.floor(Date.now() / 1000) + x.maxAge;
|
| 1523 |
-
}
|
| 1524 |
-
if (!draft.domain) {
|
| 1525 |
-
draft.url = parsedUrl.toString();
|
| 1526 |
-
}
|
| 1527 |
-
|
| 1528 |
-
return draft;
|
| 1529 |
-
});
|
| 1530 |
-
try {
|
| 1531 |
-
await page.setCookie(...mapped);
|
| 1532 |
-
} catch (err: any) {
|
| 1533 |
-
this.logger.warn(`Page ${sn}: Failed to set cookies`, { err });
|
| 1534 |
-
throw new ParamValidationError({
|
| 1535 |
-
path: 'cookies',
|
| 1536 |
-
message: `Failed to set cookies: ${err?.message}`
|
| 1537 |
-
});
|
| 1538 |
-
}
|
| 1539 |
-
}
|
| 1540 |
-
if (options.overrideUserAgent) {
|
| 1541 |
-
await page.setUserAgent(options.overrideUserAgent);
|
| 1542 |
-
}
|
| 1543 |
-
if (options.viewport) {
|
| 1544 |
-
await page.setViewport(options.viewport);
|
| 1545 |
-
}
|
| 1546 |
-
|
| 1547 |
-
let nextSnapshotDeferred = Defer();
|
| 1548 |
-
const crippleListener = () => nextSnapshotDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` }));
|
| 1549 |
-
this.once('crippled', crippleListener);
|
| 1550 |
-
nextSnapshotDeferred.promise.finally(() => {
|
| 1551 |
-
this.off('crippled', crippleListener);
|
| 1552 |
-
});
|
| 1553 |
-
let finalized = false;
|
| 1554 |
-
const hdl = (s: any) => {
|
| 1555 |
-
if (snapshot === s) {
|
| 1556 |
-
return;
|
| 1557 |
-
}
|
| 1558 |
-
snapshot = s;
|
| 1559 |
-
if (snapshot) {
|
| 1560 |
-
const kit = this.pageReqCtrl.get(page);
|
| 1561 |
-
snapshot.lastContentResourceLoaded = kit?.lastContentResourceLoadedAt;
|
| 1562 |
-
snapshot.lastMediaResourceLoaded = kit?.lastMediaResourceLoadedAt;
|
| 1563 |
-
}
|
| 1564 |
-
if (s?.maxElemDepth && s.maxElemDepth > 256) {
|
| 1565 |
-
return;
|
| 1566 |
-
}
|
| 1567 |
-
if (s?.elemCount && s.elemCount > 10_000) {
|
| 1568 |
-
return;
|
| 1569 |
-
}
|
| 1570 |
-
nextSnapshotDeferred.resolve(s);
|
| 1571 |
-
nextSnapshotDeferred = Defer();
|
| 1572 |
-
this.once('crippled', crippleListener);
|
| 1573 |
-
nextSnapshotDeferred.promise.finally(() => {
|
| 1574 |
-
this.off('crippled', crippleListener);
|
| 1575 |
-
});
|
| 1576 |
-
};
|
| 1577 |
-
page.on('snapshot', hdl);
|
| 1578 |
-
page.once('abuse', (event: any) => {
|
| 1579 |
-
this.emit('abuse', { ...event, url: parsedUrl });
|
| 1580 |
-
if (snapshot?.href && parsedUrl.href !== snapshot.href) {
|
| 1581 |
-
this.emit('abuse', { ...event, url: snapshot.href });
|
| 1582 |
-
}
|
| 1583 |
-
|
| 1584 |
-
nextSnapshotDeferred.reject(
|
| 1585 |
-
new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
|
| 1586 |
-
);
|
| 1587 |
-
});
|
| 1588 |
-
|
| 1589 |
-
const timeout = options.timeoutMs || 30_000;
|
| 1590 |
-
const goToOptions: GoToOptions = {
|
| 1591 |
-
waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
|
| 1592 |
-
timeout,
|
| 1593 |
-
};
|
| 1594 |
-
|
| 1595 |
-
if (options.referer) {
|
| 1596 |
-
goToOptions.referer = options.referer;
|
| 1597 |
-
}
|
| 1598 |
-
|
| 1599 |
-
const gotoPromise = page.goto(url, goToOptions)
|
| 1600 |
-
.catch((err) => {
|
| 1601 |
-
if (err instanceof TimeoutError) {
|
| 1602 |
-
this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err });
|
| 1603 |
-
return new AssertionFailureError({
|
| 1604 |
-
message: `Failed to goto ${url}: ${err}`,
|
| 1605 |
-
cause: err,
|
| 1606 |
-
});
|
| 1607 |
-
}
|
| 1608 |
-
|
| 1609 |
-
this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err });
|
| 1610 |
-
return new AssertionFailureError({
|
| 1611 |
-
message: `Failed to goto ${url}: ${err}`,
|
| 1612 |
-
cause: err,
|
| 1613 |
-
});
|
| 1614 |
-
}).then(async (stuff) => {
|
| 1615 |
-
// This check is necessary because without snapshot, the condition of the page is unclear
|
| 1616 |
-
// Calling evaluate directly may stall the process.
|
| 1617 |
-
if (!snapshot) {
|
| 1618 |
-
if (stuff instanceof Error) {
|
| 1619 |
-
finalized = true;
|
| 1620 |
-
throw stuff;
|
| 1621 |
-
}
|
| 1622 |
-
}
|
| 1623 |
-
try {
|
| 1624 |
-
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 1625 |
-
} catch (err: any) {
|
| 1626 |
-
this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
|
| 1627 |
-
if (stuff instanceof Error) {
|
| 1628 |
-
finalized = true;
|
| 1629 |
-
throw stuff;
|
| 1630 |
-
}
|
| 1631 |
-
}
|
| 1632 |
-
if (!snapshot?.html) {
|
| 1633 |
-
if (stuff instanceof Error) {
|
| 1634 |
-
finalized = true;
|
| 1635 |
-
throw stuff;
|
| 1636 |
-
}
|
| 1637 |
-
}
|
| 1638 |
-
|
| 1639 |
-
finalized = true;
|
| 1640 |
-
if (snapshot?.html) {
|
| 1641 |
-
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
| 1642 |
-
this.emit(
|
| 1643 |
-
'crawled',
|
| 1644 |
-
{
|
| 1645 |
-
...snapshot,
|
| 1646 |
-
status: navigationResponse?.status(),
|
| 1647 |
-
statusText: navigationResponse?.statusText(),
|
| 1648 |
-
},
|
| 1649 |
-
{ ...options, url: parsedUrl }
|
| 1650 |
-
);
|
| 1651 |
-
}
|
| 1652 |
-
});
|
| 1653 |
-
|
| 1654 |
-
try {
|
| 1655 |
-
while (true) {
|
| 1656 |
-
const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
|
| 1657 |
-
if (options.minIntervalMs) {
|
| 1658 |
-
ckpt.push(delay(options.minIntervalMs));
|
| 1659 |
-
}
|
| 1660 |
-
let error;
|
| 1661 |
-
await Promise.race(ckpt).catch((err) => error = err);
|
| 1662 |
-
if (finalized && !error) {
|
| 1663 |
-
if (!snapshot) {
|
| 1664 |
-
if (error) {
|
| 1665 |
-
throw error;
|
| 1666 |
-
}
|
| 1667 |
-
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
| 1668 |
-
}
|
| 1669 |
-
return {
|
| 1670 |
-
...snapshot,
|
| 1671 |
-
status: navigationResponse?.status(),
|
| 1672 |
-
statusText: navigationResponse?.statusText(),
|
| 1673 |
-
} as PageSnapshot;
|
| 1674 |
-
}
|
| 1675 |
-
|
| 1676 |
-
if (snapshot?.lastMutationIdle) {
|
| 1677 |
-
return {
|
| 1678 |
-
...snapshot,
|
| 1679 |
-
status: navigationResponse?.status(),
|
| 1680 |
-
statusText: navigationResponse?.statusText(),
|
| 1681 |
-
} as PageSnapshot;
|
| 1682 |
-
}
|
| 1683 |
-
if (error) {
|
| 1684 |
-
throw error;
|
| 1685 |
-
}
|
| 1686 |
-
}
|
| 1687 |
-
} finally {
|
| 1688 |
-
this.pagePhase.set(page, 'background');
|
| 1689 |
-
page.off('snapshot', hdl);
|
| 1690 |
-
this.ditchPage(page);
|
| 1691 |
-
nextSnapshotDeferred.resolve();
|
| 1692 |
-
}
|
| 1693 |
-
}
|
| 1694 |
-
|
| 1695 |
}
|
| 1696 |
|
| 1697 |
const puppeteerControl = container.resolve(PuppeteerControl);
|
|
|
|
| 846 |
async *scrap(parsedUrl: URL, options: ScrappingOptions = {}): AsyncGenerator<PageSnapshot | undefined> {
|
| 847 |
// parsedUrl.search = '';
|
| 848 |
const url = parsedUrl.toString();
|
|
|
|
| 849 |
let snapshot: PageSnapshot | undefined;
|
| 850 |
let screenshot: Buffer | undefined;
|
| 851 |
let pageshot: Buffer | undefined;
|
|
|
|
| 1096 |
nextSnapshotDeferred.promise.finally(() => {
|
| 1097 |
this.off('crippled', crippleListener);
|
| 1098 |
});
|
| 1099 |
+
let successfullyDone = false;
|
| 1100 |
const hdl = (s: any) => {
|
| 1101 |
if (snapshot === s) {
|
| 1102 |
return;
|
|
|
|
| 1142 |
goToOptions.referer = options.referer;
|
| 1143 |
}
|
| 1144 |
|
| 1145 |
+
let waitForPromise: Promise<any> | undefined;
|
| 1146 |
+
let finalizationPromise: Promise<any> | undefined;
|
| 1147 |
+
const doFinalization = async () => {
|
| 1148 |
+
if (!waitForPromise) {
|
| 1149 |
+
successfullyDone = true;
|
| 1150 |
+
}
|
| 1151 |
+
try {
|
| 1152 |
+
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 1153 |
+
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 1154 |
+
screenshot = (await this.takeScreenShot(page)) || screenshot;
|
| 1155 |
+
pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
|
| 1156 |
+
if (snapshot) {
|
| 1157 |
+
snapshot.childFrames = await pSubFrameSnapshots;
|
| 1158 |
+
}
|
| 1159 |
+
} catch (err: any) {
|
| 1160 |
+
this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
|
| 1161 |
+
}
|
| 1162 |
+
if (!snapshot?.html) {
|
| 1163 |
+
return;
|
| 1164 |
+
}
|
| 1165 |
+
|
| 1166 |
+
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
| 1167 |
+
this.emit(
|
| 1168 |
+
'crawled',
|
| 1169 |
+
{
|
| 1170 |
+
...snapshot,
|
| 1171 |
+
status: navigationResponse?.status(),
|
| 1172 |
+
statusText: navigationResponse?.statusText(),
|
| 1173 |
+
pdfs: _.uniq(pdfUrls), screenshot, pageshot,
|
| 1174 |
+
},
|
| 1175 |
+
{ ...options, url: parsedUrl }
|
| 1176 |
+
);
|
| 1177 |
+
};
|
| 1178 |
const delayPromise = delay(timeout);
|
| 1179 |
const gotoPromise = page.goto(url, goToOptions)
|
| 1180 |
.catch((err) => {
|
|
|
|
| 1202 |
// Calling evaluate directly may stall the process.
|
| 1203 |
if (!snapshot) {
|
| 1204 |
if (stuff instanceof Error) {
|
|
|
|
| 1205 |
throw stuff;
|
| 1206 |
}
|
| 1207 |
}
|
| 1208 |
await Promise.race([Promise.allSettled([...pageScriptEvaluations, ...frameScriptEvaluations]), delayPromise])
|
| 1209 |
.catch(() => void 0);
|
| 1210 |
+
finalizationPromise = doFinalization();
|
| 1211 |
+
return stuff;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1212 |
});
|
|
|
|
| 1213 |
if (options.waitForSelector) {
|
| 1214 |
const t0 = Date.now();
|
| 1215 |
waitForPromise = nextSnapshotDeferred.promise.then(() => {
|
|
|
|
| 1220 |
const p = (Array.isArray(options.waitForSelector) ?
|
| 1221 |
Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
|
| 1222 |
page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
|
| 1223 |
+
.then(() => {
|
| 1224 |
+
successfullyDone = true;
|
| 1225 |
+
finalizationPromise = doFinalization();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1226 |
})
|
| 1227 |
.catch((err) => {
|
| 1228 |
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err });
|
|
|
|
| 1229 |
});
|
| 1230 |
return p as any;
|
| 1231 |
});
|
|
|
|
| 1243 |
}
|
| 1244 |
let error;
|
| 1245 |
await Promise.race(ckpt).catch((err) => error = err);
|
| 1246 |
+
if (successfullyDone && !error) {
|
| 1247 |
if (!snapshot && !screenshot) {
|
|
|
|
|
|
|
|
|
|
| 1248 |
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
| 1249 |
}
|
| 1250 |
yield {
|
|
|
|
| 1272 |
if (error) {
|
| 1273 |
throw error;
|
| 1274 |
}
|
| 1275 |
+
if (successfullyDone) {
|
| 1276 |
+
break;
|
| 1277 |
+
}
|
| 1278 |
}
|
| 1279 |
+
await finalizationPromise;
|
| 1280 |
+
yield {
|
| 1281 |
+
...snapshot,
|
| 1282 |
+
status: navigationResponse?.status(),
|
| 1283 |
+
statusText: navigationResponse?.statusText(),
|
| 1284 |
+
pdfs: _.uniq(pdfUrls), screenshot, pageshot
|
| 1285 |
+
} as PageSnapshot;
|
| 1286 |
} finally {
|
| 1287 |
this.pagePhase.set(page, 'background');
|
| 1288 |
+
Promise.allSettled([gotoPromise, waitForPromise, finalizationPromise]).finally(() => {
|
| 1289 |
page.off('snapshot', hdl);
|
| 1290 |
this.ditchPage(page);
|
| 1291 |
});
|
|
|
|
| 1325 |
return r.filter(Boolean);
|
| 1326 |
}
|
| 1327 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1328 |
}
|
| 1329 |
|
| 1330 |
const puppeteerControl = container.resolve(PuppeteerControl);
|