nomagick committed on
Commit
d2afa9d
·
unverified ·
1 Parent(s): c795cdb

fix: timeout respect

Browse files
src/api/crawler.ts CHANGED
@@ -116,6 +116,10 @@ export class CrawlerHost extends RPCHost {
116
  if (snapshot.isIntermediate) {
117
  return;
118
  }
 
 
 
 
119
  if (options.locale) {
120
  Reflect.set(snapshot, 'locale', options.locale);
121
  }
@@ -313,7 +317,6 @@ export class CrawlerHost extends RPCHost {
313
  throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
314
  }
315
  }
316
-
317
  const crawlOpts = await this.configure(crawlerOptions);
318
  if (crawlerOptions.robotsTxt) {
319
  await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
@@ -461,7 +464,6 @@ export class CrawlerHost extends RPCHost {
461
  }
462
  throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
463
  }
464
-
465
  const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
466
  chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
467
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
@@ -798,6 +800,8 @@ export class CrawlerHost extends RPCHost {
798
  }
799
 
800
  if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && !this.knownUrlThatSideLoadingWouldCrashTheBrowser(urlToCrawl)) {
 
 
801
  try {
802
  const altOpts = { ...crawlOpts };
803
  let sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
@@ -832,7 +836,7 @@ export class CrawlerHost extends RPCHost {
832
  let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
833
  draftSnapshot.title ??= analyzed.title;
834
  draftSnapshot.isIntermediate = true;
835
- if (crawlerOpts?.browserIsNotRequired()) {
836
  yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
837
  }
838
  let fallbackProxyIsUsed = false;
@@ -858,7 +862,7 @@ export class CrawlerHost extends RPCHost {
858
  analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
859
  if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
860
  proxySnapshot.isIntermediate = true;
861
- if (crawlerOpts?.browserIsNotRequired()) {
862
  yield this.jsdomControl.narrowSnapshot(proxySnapshot, crawlOpts);
863
  }
864
  sideLoaded = proxyLoaded;
 
116
  if (snapshot.isIntermediate) {
117
  return;
118
  }
119
+ if (!snapshot.lastMutationIdle) {
120
+ // Never reached mutationIdle, presumably too short timeout
121
+ return;
122
+ }
123
  if (options.locale) {
124
  Reflect.set(snapshot, 'locale', options.locale);
125
  }
 
317
  throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
318
  }
319
  }
 
320
  const crawlOpts = await this.configure(crawlerOptions);
321
  if (crawlerOptions.robotsTxt) {
322
  await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
 
464
  }
465
  throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
466
  }
 
467
  const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
468
  chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
469
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
 
800
  }
801
 
802
  if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && !this.knownUrlThatSideLoadingWouldCrashTheBrowser(urlToCrawl)) {
803
+ const sideLoadSnapshotPermitted = crawlerOpts?.browserIsNotRequired() &&
804
+ [RESPOND_TIMING.HTML, RESPOND_TIMING.VISIBLE_CONTENT].includes(crawlerOpts.presumedRespondTiming);
805
  try {
806
  const altOpts = { ...crawlOpts };
807
  let sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
 
836
  let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
837
  draftSnapshot.title ??= analyzed.title;
838
  draftSnapshot.isIntermediate = true;
839
+ if (sideLoadSnapshotPermitted) {
840
  yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
841
  }
842
  let fallbackProxyIsUsed = false;
 
862
  analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
863
  if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
864
  proxySnapshot.isIntermediate = true;
865
+ if (sideLoadSnapshotPermitted) {
866
  yield this.jsdomControl.narrowSnapshot(proxySnapshot, crawlOpts);
867
  }
868
  sideLoaded = proxyLoaded;
src/api/searcher-serper.ts CHANGED
@@ -318,11 +318,6 @@ export class SearcherHost extends RPCHost {
318
  throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
319
  }
320
 
321
- if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) {
322
- delete crawlOpts.timeoutMs;
323
- }
324
-
325
-
326
  let lastScrapped: any[] | undefined;
327
  const targetResultCount = crawlWithoutContent ? count : count + 2;
328
  const trimmedResults = results.filter((x) => Boolean(x.link)).slice(0, targetResultCount).map((x) => this.mapToFinalResults(x));
 
318
  throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
319
  }
320
 
 
 
 
 
 
321
  let lastScrapped: any[] | undefined;
322
  const targetResultCount = crawlWithoutContent ? count : count + 2;
323
  const trimmedResults = results.filter((x) => Boolean(x.link)).slice(0, targetResultCount).map((x) => this.mapToFinalResults(x));
src/dto/crawler-options.ts CHANGED
@@ -655,8 +655,11 @@ export class CrawlerOptions extends AutoCastable {
655
  if (this.respondWith.includes('lm')) {
656
  return false;
657
  }
 
 
 
658
 
659
- return false;
660
  }
661
 
662
  isCacheQueryApplicable() {
 
655
  if (this.respondWith.includes('lm')) {
656
  return false;
657
  }
658
+ if (this.withIframe) {
659
+ return false;
660
+ }
661
 
662
+ return !snapshot.isIntermediate;
663
  }
664
 
665
  isCacheQueryApplicable() {
src/services/puppeteer.ts CHANGED
@@ -846,7 +846,6 @@ export class PuppeteerControl extends AsyncService {
846
  async *scrap(parsedUrl: URL, options: ScrappingOptions = {}): AsyncGenerator<PageSnapshot | undefined> {
847
  // parsedUrl.search = '';
848
  const url = parsedUrl.toString();
849
-
850
  let snapshot: PageSnapshot | undefined;
851
  let screenshot: Buffer | undefined;
852
  let pageshot: Buffer | undefined;
@@ -1097,7 +1096,7 @@ export class PuppeteerControl extends AsyncService {
1097
  nextSnapshotDeferred.promise.finally(() => {
1098
  this.off('crippled', crippleListener);
1099
  });
1100
- let finalized = false;
1101
  const hdl = (s: any) => {
1102
  if (snapshot === s) {
1103
  return;
@@ -1143,6 +1142,39 @@ export class PuppeteerControl extends AsyncService {
1143
  goToOptions.referer = options.referer;
1144
  }
1145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1146
  const delayPromise = delay(timeout);
1147
  const gotoPromise = page.goto(url, goToOptions)
1148
  .catch((err) => {
@@ -1170,50 +1202,14 @@ export class PuppeteerControl extends AsyncService {
1170
  // Calling evaluate directly may stall the process.
1171
  if (!snapshot) {
1172
  if (stuff instanceof Error) {
1173
- finalized = true;
1174
  throw stuff;
1175
  }
1176
  }
1177
  await Promise.race([Promise.allSettled([...pageScriptEvaluations, ...frameScriptEvaluations]), delayPromise])
1178
  .catch(() => void 0);
1179
- try {
1180
- const pSubFrameSnapshots = this.snapshotChildFrames(page);
1181
- snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
1182
- screenshot = (await this.takeScreenShot(page)) || screenshot;
1183
- pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
1184
- if (snapshot) {
1185
- snapshot.childFrames = await pSubFrameSnapshots;
1186
- }
1187
- } catch (err: any) {
1188
- this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
1189
- if (stuff instanceof Error) {
1190
- finalized = true;
1191
- throw stuff;
1192
- }
1193
- }
1194
- if (!snapshot?.html) {
1195
- if (stuff instanceof Error) {
1196
- finalized = true;
1197
- throw stuff;
1198
- }
1199
- }
1200
-
1201
- finalized = true;
1202
- if (snapshot?.html) {
1203
- this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
1204
- this.emit(
1205
- 'crawled',
1206
- {
1207
- ...snapshot,
1208
- status: navigationResponse?.status(),
1209
- statusText: navigationResponse?.statusText(),
1210
- pdfs: _.uniq(pdfUrls), screenshot, pageshot,
1211
- },
1212
- { ...options, url: parsedUrl }
1213
- );
1214
- }
1215
  });
1216
- let waitForPromise: Promise<any> | undefined;
1217
  if (options.waitForSelector) {
1218
  const t0 = Date.now();
1219
  waitForPromise = nextSnapshotDeferred.promise.then(() => {
@@ -1224,19 +1220,12 @@ export class PuppeteerControl extends AsyncService {
1224
  const p = (Array.isArray(options.waitForSelector) ?
1225
  Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
1226
  page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
1227
- .then(async () => {
1228
- const pSubFrameSnapshots = this.snapshotChildFrames(page);
1229
- snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
1230
- screenshot = (await this.takeScreenShot(page)) || screenshot;
1231
- pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
1232
- if (snapshot) {
1233
- snapshot.childFrames = await pSubFrameSnapshots;
1234
- }
1235
- finalized = true;
1236
  })
1237
  .catch((err) => {
1238
  this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err });
1239
- waitForPromise = undefined;
1240
  });
1241
  return p as any;
1242
  });
@@ -1254,11 +1243,8 @@ export class PuppeteerControl extends AsyncService {
1254
  }
1255
  let error;
1256
  await Promise.race(ckpt).catch((err) => error = err);
1257
- if (finalized && !error) {
1258
  if (!snapshot && !screenshot) {
1259
- if (error) {
1260
- throw error;
1261
- }
1262
  throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
1263
  }
1264
  yield {
@@ -1286,10 +1272,20 @@ export class PuppeteerControl extends AsyncService {
1286
  if (error) {
1287
  throw error;
1288
  }
 
 
 
1289
  }
 
 
 
 
 
 
 
1290
  } finally {
1291
  this.pagePhase.set(page, 'background');
1292
- (waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
1293
  page.off('snapshot', hdl);
1294
  this.ditchPage(page);
1295
  });
@@ -1329,369 +1325,6 @@ export class PuppeteerControl extends AsyncService {
1329
  return r.filter(Boolean);
1330
  }
1331
 
1332
- async simpleScrap(parsedUrl: URL, options: ScrappingOptions = {}): Promise<PageSnapshot> {
1333
- // parsedUrl.search = '';
1334
- const url = parsedUrl.toString();
1335
- let snapshot: PageSnapshot | undefined;
1336
- let navigationResponse: HTTPResponse | undefined;
1337
- const page = await this.getNextPage();
1338
- this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx);
1339
- this.pagePhase.set(page, 'active');
1340
- page.on('response', (resp) => {
1341
- this.blackHoleDetector.itWorked();
1342
- const req = resp.request();
1343
- if (req.frame() === page.mainFrame() && req.isNavigationRequest()) {
1344
- navigationResponse = resp;
1345
- }
1346
- if (!resp.ok()) {
1347
- return;
1348
- }
1349
- });
1350
- page.on('request', async (req) => {
1351
- if (req.isInterceptResolutionHandled()) {
1352
- return;
1353
- };
1354
- const reqUrlParsed = new URL(req.url());
1355
- if (!reqUrlParsed.protocol.startsWith('http')) {
1356
- const overrides = req.continueRequestOverrides();
1357
-
1358
- return req.continue(overrides, 0);
1359
- }
1360
- const typ = req.resourceType();
1361
- if (typ === 'media') {
1362
- // Non-cooperative answer to block all media requests.
1363
- return req.abort('blockedbyclient');
1364
- }
1365
- if (!options.proxyResources) {
1366
- const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
1367
- if (!isDocRequest) {
1368
- if (options.extraHeaders) {
1369
- const overrides = req.continueRequestOverrides();
1370
- const continueArgs = [{
1371
- ...overrides,
1372
- headers: {
1373
- ...req.headers(),
1374
- ...overrides?.headers,
1375
- ...options.extraHeaders,
1376
- }
1377
- }, 1] as const;
1378
-
1379
- return req.continue(continueArgs[0], continueArgs[1]);
1380
- }
1381
- const overrides = req.continueRequestOverrides();
1382
-
1383
- return req.continue(overrides, 0);
1384
- }
1385
- }
1386
- const sideload = options.sideLoad;
1387
-
1388
- const impersonate = sideload?.impersonate[reqUrlParsed.href];
1389
- if (impersonate) {
1390
- let body;
1391
- if (impersonate.body) {
1392
- body = await readFile(await impersonate.body.filePath);
1393
- if (req.isInterceptResolutionHandled()) {
1394
- return;
1395
- }
1396
- }
1397
- return req.respond({
1398
- status: impersonate.status,
1399
- headers: impersonate.headers,
1400
- contentType: impersonate.contentType,
1401
- body: body ? Uint8Array.from(body) : undefined,
1402
- }, 999);
1403
- }
1404
-
1405
- const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
1406
- const ctx = this.lifeCycleTrack.get(page);
1407
- if (proxy && ctx) {
1408
- return await this.asyncLocalContext.bridge(ctx, async () => {
1409
- try {
1410
- const curled = await this.curlControl.sideLoad(reqUrlParsed, {
1411
- ...options,
1412
- method: req.method(),
1413
- body: req.postData(),
1414
- extraHeaders: {
1415
- ...req.headers(),
1416
- ...options.extraHeaders,
1417
- },
1418
- proxyUrl: proxy
1419
- });
1420
- if (req.isInterceptResolutionHandled()) {
1421
- return;
1422
- };
1423
-
1424
- if (curled.chain.length === 1) {
1425
- if (!curled.file) {
1426
- return req.respond({
1427
- status: curled.status,
1428
- headers: _.omit(curled.headers, 'result'),
1429
- contentType: curled.contentType,
1430
- }, 3);
1431
- }
1432
- const body = await readFile(await curled.file.filePath);
1433
- if (req.isInterceptResolutionHandled()) {
1434
- return;
1435
- };
1436
- return req.respond({
1437
- status: curled.status,
1438
- headers: _.omit(curled.headers, 'result'),
1439
- contentType: curled.contentType,
1440
- body: Uint8Array.from(body),
1441
- }, 3);
1442
- }
1443
- options.sideLoad ??= curled.sideLoadOpts;
1444
- _.merge(options.sideLoad, curled.sideLoadOpts);
1445
- const firstReq = curled.chain[0];
1446
-
1447
- return req.respond({
1448
- status: firstReq.result!.code,
1449
- headers: _.omit(firstReq, 'result'),
1450
- }, 3);
1451
- } catch (err: any) {
1452
- this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
1453
- }
1454
- if (req.isInterceptResolutionHandled()) {
1455
- return;
1456
- };
1457
- const overrides = req.continueRequestOverrides();
1458
- const continueArgs = [{
1459
- ...overrides,
1460
- headers: {
1461
- ...req.headers(),
1462
- ...overrides?.headers,
1463
- ...options.extraHeaders,
1464
- }
1465
- }, 1] as const;
1466
-
1467
- return req.continue(continueArgs[0], continueArgs[1]);
1468
- });
1469
- }
1470
-
1471
- if (req.isInterceptResolutionHandled()) {
1472
- return;
1473
- };
1474
- const overrides = req.continueRequestOverrides();
1475
- const continueArgs = [{
1476
- ...overrides,
1477
- headers: {
1478
- ...req.headers(),
1479
- ...overrides?.headers,
1480
- ...options.extraHeaders,
1481
- }
1482
- }, 1] as const;
1483
-
1484
- return req.continue(continueArgs[0], continueArgs[1]);
1485
- });
1486
-
1487
- const sn = this.snMap.get(page);
1488
- this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
1489
- if (options.locale) {
1490
- // Add headers via request interception to walk around this bug
1491
- // https://github.com/puppeteer/puppeteer/issues/10235
1492
- // await page.setExtraHTTPHeaders({
1493
- // 'Accept-Language': options.locale
1494
- // });
1495
-
1496
- await page.evaluateOnNewDocument(() => {
1497
- Object.defineProperty(navigator, "language", {
1498
- get: function () {
1499
- return options.locale;
1500
- }
1501
- });
1502
- Object.defineProperty(navigator, "languages", {
1503
- get: function () {
1504
- return [options.locale];
1505
- }
1506
- });
1507
- });
1508
- }
1509
-
1510
- if (options.cookies) {
1511
- const mapped = options.cookies.map((x) => {
1512
- const draft: CookieParam = {
1513
- name: x.name,
1514
- value: encodeURIComponent(x.value),
1515
- secure: x.secure,
1516
- domain: x.domain,
1517
- path: x.path,
1518
- expires: x.expires ? Math.floor(x.expires.valueOf() / 1000) : undefined,
1519
- sameSite: x.sameSite as any,
1520
- };
1521
- if (!draft.expires && x.maxAge) {
1522
- draft.expires = Math.floor(Date.now() / 1000) + x.maxAge;
1523
- }
1524
- if (!draft.domain) {
1525
- draft.url = parsedUrl.toString();
1526
- }
1527
-
1528
- return draft;
1529
- });
1530
- try {
1531
- await page.setCookie(...mapped);
1532
- } catch (err: any) {
1533
- this.logger.warn(`Page ${sn}: Failed to set cookies`, { err });
1534
- throw new ParamValidationError({
1535
- path: 'cookies',
1536
- message: `Failed to set cookies: ${err?.message}`
1537
- });
1538
- }
1539
- }
1540
- if (options.overrideUserAgent) {
1541
- await page.setUserAgent(options.overrideUserAgent);
1542
- }
1543
- if (options.viewport) {
1544
- await page.setViewport(options.viewport);
1545
- }
1546
-
1547
- let nextSnapshotDeferred = Defer();
1548
- const crippleListener = () => nextSnapshotDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` }));
1549
- this.once('crippled', crippleListener);
1550
- nextSnapshotDeferred.promise.finally(() => {
1551
- this.off('crippled', crippleListener);
1552
- });
1553
- let finalized = false;
1554
- const hdl = (s: any) => {
1555
- if (snapshot === s) {
1556
- return;
1557
- }
1558
- snapshot = s;
1559
- if (snapshot) {
1560
- const kit = this.pageReqCtrl.get(page);
1561
- snapshot.lastContentResourceLoaded = kit?.lastContentResourceLoadedAt;
1562
- snapshot.lastMediaResourceLoaded = kit?.lastMediaResourceLoadedAt;
1563
- }
1564
- if (s?.maxElemDepth && s.maxElemDepth > 256) {
1565
- return;
1566
- }
1567
- if (s?.elemCount && s.elemCount > 10_000) {
1568
- return;
1569
- }
1570
- nextSnapshotDeferred.resolve(s);
1571
- nextSnapshotDeferred = Defer();
1572
- this.once('crippled', crippleListener);
1573
- nextSnapshotDeferred.promise.finally(() => {
1574
- this.off('crippled', crippleListener);
1575
- });
1576
- };
1577
- page.on('snapshot', hdl);
1578
- page.once('abuse', (event: any) => {
1579
- this.emit('abuse', { ...event, url: parsedUrl });
1580
- if (snapshot?.href && parsedUrl.href !== snapshot.href) {
1581
- this.emit('abuse', { ...event, url: snapshot.href });
1582
- }
1583
-
1584
- nextSnapshotDeferred.reject(
1585
- new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
1586
- );
1587
- });
1588
-
1589
- const timeout = options.timeoutMs || 30_000;
1590
- const goToOptions: GoToOptions = {
1591
- waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
1592
- timeout,
1593
- };
1594
-
1595
- if (options.referer) {
1596
- goToOptions.referer = options.referer;
1597
- }
1598
-
1599
- const gotoPromise = page.goto(url, goToOptions)
1600
- .catch((err) => {
1601
- if (err instanceof TimeoutError) {
1602
- this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err });
1603
- return new AssertionFailureError({
1604
- message: `Failed to goto ${url}: ${err}`,
1605
- cause: err,
1606
- });
1607
- }
1608
-
1609
- this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err });
1610
- return new AssertionFailureError({
1611
- message: `Failed to goto ${url}: ${err}`,
1612
- cause: err,
1613
- });
1614
- }).then(async (stuff) => {
1615
- // This check is necessary because without snapshot, the condition of the page is unclear
1616
- // Calling evaluate directly may stall the process.
1617
- if (!snapshot) {
1618
- if (stuff instanceof Error) {
1619
- finalized = true;
1620
- throw stuff;
1621
- }
1622
- }
1623
- try {
1624
- snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
1625
- } catch (err: any) {
1626
- this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
1627
- if (stuff instanceof Error) {
1628
- finalized = true;
1629
- throw stuff;
1630
- }
1631
- }
1632
- if (!snapshot?.html) {
1633
- if (stuff instanceof Error) {
1634
- finalized = true;
1635
- throw stuff;
1636
- }
1637
- }
1638
-
1639
- finalized = true;
1640
- if (snapshot?.html) {
1641
- this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
1642
- this.emit(
1643
- 'crawled',
1644
- {
1645
- ...snapshot,
1646
- status: navigationResponse?.status(),
1647
- statusText: navigationResponse?.statusText(),
1648
- },
1649
- { ...options, url: parsedUrl }
1650
- );
1651
- }
1652
- });
1653
-
1654
- try {
1655
- while (true) {
1656
- const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
1657
- if (options.minIntervalMs) {
1658
- ckpt.push(delay(options.minIntervalMs));
1659
- }
1660
- let error;
1661
- await Promise.race(ckpt).catch((err) => error = err);
1662
- if (finalized && !error) {
1663
- if (!snapshot) {
1664
- if (error) {
1665
- throw error;
1666
- }
1667
- throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
1668
- }
1669
- return {
1670
- ...snapshot,
1671
- status: navigationResponse?.status(),
1672
- statusText: navigationResponse?.statusText(),
1673
- } as PageSnapshot;
1674
- }
1675
-
1676
- if (snapshot?.lastMutationIdle) {
1677
- return {
1678
- ...snapshot,
1679
- status: navigationResponse?.status(),
1680
- statusText: navigationResponse?.statusText(),
1681
- } as PageSnapshot;
1682
- }
1683
- if (error) {
1684
- throw error;
1685
- }
1686
- }
1687
- } finally {
1688
- this.pagePhase.set(page, 'background');
1689
- page.off('snapshot', hdl);
1690
- this.ditchPage(page);
1691
- nextSnapshotDeferred.resolve();
1692
- }
1693
- }
1694
-
1695
  }
1696
 
1697
  const puppeteerControl = container.resolve(PuppeteerControl);
 
846
  async *scrap(parsedUrl: URL, options: ScrappingOptions = {}): AsyncGenerator<PageSnapshot | undefined> {
847
  // parsedUrl.search = '';
848
  const url = parsedUrl.toString();
 
849
  let snapshot: PageSnapshot | undefined;
850
  let screenshot: Buffer | undefined;
851
  let pageshot: Buffer | undefined;
 
1096
  nextSnapshotDeferred.promise.finally(() => {
1097
  this.off('crippled', crippleListener);
1098
  });
1099
+ let successfullyDone = false;
1100
  const hdl = (s: any) => {
1101
  if (snapshot === s) {
1102
  return;
 
1142
  goToOptions.referer = options.referer;
1143
  }
1144
 
1145
+ let waitForPromise: Promise<any> | undefined;
1146
+ let finalizationPromise: Promise<any> | undefined;
1147
+ const doFinalization = async () => {
1148
+ if (!waitForPromise) {
1149
+ successfullyDone = true;
1150
+ }
1151
+ try {
1152
+ const pSubFrameSnapshots = this.snapshotChildFrames(page);
1153
+ snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
1154
+ screenshot = (await this.takeScreenShot(page)) || screenshot;
1155
+ pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
1156
+ if (snapshot) {
1157
+ snapshot.childFrames = await pSubFrameSnapshots;
1158
+ }
1159
+ } catch (err: any) {
1160
+ this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
1161
+ }
1162
+ if (!snapshot?.html) {
1163
+ return;
1164
+ }
1165
+
1166
+ this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
1167
+ this.emit(
1168
+ 'crawled',
1169
+ {
1170
+ ...snapshot,
1171
+ status: navigationResponse?.status(),
1172
+ statusText: navigationResponse?.statusText(),
1173
+ pdfs: _.uniq(pdfUrls), screenshot, pageshot,
1174
+ },
1175
+ { ...options, url: parsedUrl }
1176
+ );
1177
+ };
1178
  const delayPromise = delay(timeout);
1179
  const gotoPromise = page.goto(url, goToOptions)
1180
  .catch((err) => {
 
1202
  // Calling evaluate directly may stall the process.
1203
  if (!snapshot) {
1204
  if (stuff instanceof Error) {
 
1205
  throw stuff;
1206
  }
1207
  }
1208
  await Promise.race([Promise.allSettled([...pageScriptEvaluations, ...frameScriptEvaluations]), delayPromise])
1209
  .catch(() => void 0);
1210
+ finalizationPromise = doFinalization();
1211
+ return stuff;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1212
  });
 
1213
  if (options.waitForSelector) {
1214
  const t0 = Date.now();
1215
  waitForPromise = nextSnapshotDeferred.promise.then(() => {
 
1220
  const p = (Array.isArray(options.waitForSelector) ?
1221
  Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
1222
  page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
1223
+ .then(() => {
1224
+ successfullyDone = true;
1225
+ finalizationPromise = doFinalization();
 
 
 
 
 
 
1226
  })
1227
  .catch((err) => {
1228
  this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err });
 
1229
  });
1230
  return p as any;
1231
  });
 
1243
  }
1244
  let error;
1245
  await Promise.race(ckpt).catch((err) => error = err);
1246
+ if (successfullyDone && !error) {
1247
  if (!snapshot && !screenshot) {
 
 
 
1248
  throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
1249
  }
1250
  yield {
 
1272
  if (error) {
1273
  throw error;
1274
  }
1275
+ if (successfullyDone) {
1276
+ break;
1277
+ }
1278
  }
1279
+ await finalizationPromise;
1280
+ yield {
1281
+ ...snapshot,
1282
+ status: navigationResponse?.status(),
1283
+ statusText: navigationResponse?.statusText(),
1284
+ pdfs: _.uniq(pdfUrls), screenshot, pageshot
1285
+ } as PageSnapshot;
1286
  } finally {
1287
  this.pagePhase.set(page, 'background');
1288
+ Promise.allSettled([gotoPromise, waitForPromise, finalizationPromise]).finally(() => {
1289
  page.off('snapshot', hdl);
1290
  this.ditchPage(page);
1291
  });
 
1325
  return r.filter(Boolean);
1326
  }
1327
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1328
  }
1329
 
1330
  const puppeteerControl = container.resolve(PuppeteerControl);