nomagick committed on
Commit
9fbd751
·
unverified ·
1 Parent(s): 3dc902e

fix: expose status code/text from curl

Browse files
src/api/crawler.ts CHANGED
@@ -9,6 +9,7 @@ import {
9
  RawString,
10
  ApplicationError,
11
  DataStreamBrokenError,
 
12
  } from 'civkit/civ-rpc';
13
  import { marshalErrorLike } from 'civkit/lang';
14
  import { Defer } from 'civkit/defer';
@@ -755,6 +756,8 @@ export class CrawlerHost extends RPCHost {
755
  throw new AssertionFailureError(`Remote server did not return a body: ${urlToCrawl}`);
756
  }
757
  const draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName);
 
 
758
  yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
759
  return;
760
  }
@@ -822,6 +825,8 @@ export class CrawlerHost extends RPCHost {
822
  }
823
  return Promise.reject(err);
824
  });
 
 
825
  if (sideLoaded.status == 200 && !sideLoaded.contentType.startsWith('text/html')) {
826
  yield draftSnapshot;
827
  return;
@@ -849,6 +854,8 @@ export class CrawlerHost extends RPCHost {
849
  }
850
  return Promise.reject(err);
851
  });
 
 
852
  if (proxyLoaded.status === 200 && crawlerOpts?.browserIsNotRequired()) {
853
  }
854
  analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
@@ -931,6 +938,7 @@ export class CrawlerHost extends RPCHost {
931
  }
932
 
933
  Object.assign(formatted, { usage: { tokens: amount } });
 
934
 
935
  return amount;
936
  }
 
9
  RawString,
10
  ApplicationError,
11
  DataStreamBrokenError,
12
+ assignMeta,
13
  } from 'civkit/civ-rpc';
14
  import { marshalErrorLike } from 'civkit/lang';
15
  import { Defer } from 'civkit/defer';
 
756
  throw new AssertionFailureError(`Remote server did not return a body: ${urlToCrawl}`);
757
  }
758
  const draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName);
759
+ draftSnapshot.status = sideLoaded.status;
760
+ draftSnapshot.statusText = sideLoaded.statusText;
761
  yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
762
  return;
763
  }
 
825
  }
826
  return Promise.reject(err);
827
  });
828
+ draftSnapshot.status = sideLoaded.status;
829
+ draftSnapshot.statusText = sideLoaded.statusText;
830
  if (sideLoaded.status == 200 && !sideLoaded.contentType.startsWith('text/html')) {
831
  yield draftSnapshot;
832
  return;
 
854
  }
855
  return Promise.reject(err);
856
  });
857
+ proxySnapshot.status = proxyLoaded.status;
858
+ proxySnapshot.statusText = proxyLoaded.statusText;
859
  if (proxyLoaded.status === 200 && crawlerOpts?.browserIsNotRequired()) {
860
  }
861
  analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
 
938
  }
939
 
940
  Object.assign(formatted, { usage: { tokens: amount } });
941
+ assignMeta(formatted, { usage: { tokens: amount } });
942
 
943
  return amount;
944
  }
src/services/curl.ts CHANGED
@@ -98,6 +98,7 @@ export class CurlControl extends AsyncService {
98
  urlToFile1Shot(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
99
  return new Promise<{
100
  statusCode: number,
 
101
  data?: FancyFile,
102
  headers: HeaderInfo[],
103
  }>((resolve, reject) => {
@@ -179,6 +180,7 @@ export class CurlControl extends AsyncService {
179
  });
180
  curl.setOpt(Curl.option.MAXFILESIZE, 4 * 1024 * 1024 * 1024); // 4GB
181
  let status = -1;
 
182
  let contentEncoding = '';
183
  curl.once('end', () => {
184
  if (curlStream) {
@@ -208,6 +210,7 @@ export class CurlControl extends AsyncService {
208
  }
209
  }
210
  const lastResHeaders = headers[headers.length - 1];
 
211
  for (const [k, v] of Object.entries(lastResHeaders)) {
212
  const kl = k.toLowerCase();
213
  if (kl === 'content-type') {
@@ -227,6 +230,7 @@ export class CurlControl extends AsyncService {
227
  }
228
  resolve({
229
  statusCode: status,
 
230
  data: undefined,
231
  headers: headers as HeaderInfo[],
232
  });
@@ -236,6 +240,7 @@ export class CurlControl extends AsyncService {
236
  if (!stream) {
237
  resolve({
238
  statusCode: status,
 
239
  data: undefined,
240
  headers: headers as HeaderInfo[],
241
  });
@@ -289,6 +294,7 @@ export class CurlControl extends AsyncService {
289
  this.tempFileManager.bindPathTo(fancyFile, fpath);
290
  resolve({
291
  statusCode: status,
 
292
  data: fancyFile,
293
  headers: headers as HeaderInfo[],
294
  });
@@ -343,6 +349,7 @@ export class CurlControl extends AsyncService {
343
 
344
  return {
345
  statusCode: r.statusCode,
 
346
  data: r.data,
347
  headers: fakeHeaderInfos.concat(r.headers),
348
  };
@@ -392,6 +399,7 @@ export class CurlControl extends AsyncService {
392
  sideLoadOpts,
393
  chain: curlResult.headers,
394
  status: curlResult.statusCode,
 
395
  headers: lastHeaders,
396
  contentType,
397
  contentDisposition,
 
98
  urlToFile1Shot(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
99
  return new Promise<{
100
  statusCode: number,
101
+ statusText?: string,
102
  data?: FancyFile,
103
  headers: HeaderInfo[],
104
  }>((resolve, reject) => {
 
180
  });
181
  curl.setOpt(Curl.option.MAXFILESIZE, 4 * 1024 * 1024 * 1024); // 4GB
182
  let status = -1;
183
+ let statusText: string|undefined;
184
  let contentEncoding = '';
185
  curl.once('end', () => {
186
  if (curlStream) {
 
210
  }
211
  }
212
  const lastResHeaders = headers[headers.length - 1];
213
+ statusText = (lastResHeaders as HeaderInfo).result?.reason;
214
  for (const [k, v] of Object.entries(lastResHeaders)) {
215
  const kl = k.toLowerCase();
216
  if (kl === 'content-type') {
 
230
  }
231
  resolve({
232
  statusCode: status,
233
+ statusText,
234
  data: undefined,
235
  headers: headers as HeaderInfo[],
236
  });
 
240
  if (!stream) {
241
  resolve({
242
  statusCode: status,
243
+ statusText,
244
  data: undefined,
245
  headers: headers as HeaderInfo[],
246
  });
 
294
  this.tempFileManager.bindPathTo(fancyFile, fpath);
295
  resolve({
296
  statusCode: status,
297
+ statusText,
298
  data: fancyFile,
299
  headers: headers as HeaderInfo[],
300
  });
 
349
 
350
  return {
351
  statusCode: r.statusCode,
352
+ statusText: r.statusText,
353
  data: r.data,
354
  headers: fakeHeaderInfos.concat(r.headers),
355
  };
 
399
  sideLoadOpts,
400
  chain: curlResult.headers,
401
  status: curlResult.statusCode,
402
+ statusText: curlResult.statusText,
403
  headers: lastHeaders,
404
  contentType,
405
  contentDisposition,
src/services/puppeteer.ts CHANGED
@@ -1176,8 +1176,8 @@ export class PuppeteerControl extends AsyncService {
1176
  try {
1177
  const pSubFrameSnapshots = this.snapshotChildFrames(page);
1178
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
1179
- screenshot = await this.takeScreenShot(page);
1180
- pageshot = await this.takeScreenShot(page, { fullPage: true });
1181
  if (snapshot) {
1182
  snapshot.childFrames = await pSubFrameSnapshots;
1183
  }
@@ -1224,8 +1224,8 @@ export class PuppeteerControl extends AsyncService {
1224
  .then(async () => {
1225
  const pSubFrameSnapshots = this.snapshotChildFrames(page);
1226
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
1227
- screenshot = await this.takeScreenShot(page);
1228
- pageshot = await this.takeScreenShot(page, { fullPage: true });
1229
  if (snapshot) {
1230
  snapshot.childFrames = await pSubFrameSnapshots;
1231
  }
@@ -1267,8 +1267,8 @@ export class PuppeteerControl extends AsyncService {
1267
  break;
1268
  }
1269
  if (options.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
1270
- screenshot = await this.takeScreenShot(page);
1271
- pageshot = await this.takeScreenShot(page, { fullPage: true });
1272
  lastHTML = snapshot.html;
1273
  }
1274
  if (snapshot || screenshot) {
@@ -1326,6 +1326,373 @@ export class PuppeteerControl extends AsyncService {
1326
  return r.filter(Boolean);
1327
  }
1328
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1329
  }
1330
 
1331
  const puppeteerControl = container.resolve(PuppeteerControl);
 
1176
  try {
1177
  const pSubFrameSnapshots = this.snapshotChildFrames(page);
1178
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
1179
+ screenshot = (await this.takeScreenShot(page)) || screenshot;
1180
+ pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
1181
  if (snapshot) {
1182
  snapshot.childFrames = await pSubFrameSnapshots;
1183
  }
 
1224
  .then(async () => {
1225
  const pSubFrameSnapshots = this.snapshotChildFrames(page);
1226
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
1227
+ screenshot = (await this.takeScreenShot(page)) || screenshot;
1228
+ pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
1229
  if (snapshot) {
1230
  snapshot.childFrames = await pSubFrameSnapshots;
1231
  }
 
1267
  break;
1268
  }
1269
  if (options.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
1270
+ screenshot = (await this.takeScreenShot(page)) || screenshot;
1271
+ pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
1272
  lastHTML = snapshot.html;
1273
  }
1274
  if (snapshot || screenshot) {
 
1326
  return r.filter(Boolean);
1327
  }
1328
 
1329
+ async simpleScrap(parsedUrl: URL, options: ScrappingOptions = {}): Promise<PageSnapshot> {
1330
+ // parsedUrl.search = '';
1331
+ const url = parsedUrl.toString();
1332
+ let snapshot: PageSnapshot | undefined;
1333
+ let navigationResponse: HTTPResponse | undefined;
1334
+ const page = await this.getNextPage();
1335
+ this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx);
1336
+ this.pagePhase.set(page, 'active');
1337
+ page.on('response', (resp) => {
1338
+ this.blackHoleDetector.itWorked();
1339
+ const req = resp.request();
1340
+ if (req.frame() === page.mainFrame() && req.isNavigationRequest()) {
1341
+ navigationResponse = resp;
1342
+ }
1343
+ if (!resp.ok()) {
1344
+ return;
1345
+ }
1346
+ });
1347
+ page.on('request', async (req) => {
1348
+ if (req.isInterceptResolutionHandled()) {
1349
+ return;
1350
+ };
1351
+ const reqUrlParsed = new URL(req.url());
1352
+ if (!reqUrlParsed.protocol.startsWith('http')) {
1353
+ const overrides = req.continueRequestOverrides();
1354
+
1355
+ return req.continue(overrides, 0);
1356
+ }
1357
+ const typ = req.resourceType();
1358
+ if (typ === 'media') {
1359
+ // Non-cooperative answer to block all media requests.
1360
+ return req.abort('blockedbyclient');
1361
+ }
1362
+ if (!options.proxyResources) {
1363
+ const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
1364
+ if (!isDocRequest) {
1365
+ if (options.extraHeaders) {
1366
+ const overrides = req.continueRequestOverrides();
1367
+ const continueArgs = [{
1368
+ ...overrides,
1369
+ headers: {
1370
+ ...req.headers(),
1371
+ ...overrides?.headers,
1372
+ ...options.extraHeaders,
1373
+ }
1374
+ }, 1] as const;
1375
+
1376
+ return req.continue(continueArgs[0], continueArgs[1]);
1377
+ }
1378
+ const overrides = req.continueRequestOverrides();
1379
+
1380
+ return req.continue(overrides, 0);
1381
+ }
1382
+ }
1383
+ const sideload = options.sideLoad;
1384
+
1385
+ const impersonate = sideload?.impersonate[reqUrlParsed.href];
1386
+ if (impersonate) {
1387
+ let body;
1388
+ if (impersonate.body) {
1389
+ body = await readFile(await impersonate.body.filePath);
1390
+ if (req.isInterceptResolutionHandled()) {
1391
+ return;
1392
+ }
1393
+ }
1394
+ return req.respond({
1395
+ status: impersonate.status,
1396
+ headers: impersonate.headers,
1397
+ contentType: impersonate.contentType,
1398
+ body: body ? Uint8Array.from(body) : undefined,
1399
+ }, 999);
1400
+ }
1401
+
1402
+ const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
1403
+ const ctx = this.lifeCycleTrack.get(page);
1404
+ if (proxy && ctx) {
1405
+ return await this.asyncLocalContext.bridge(ctx, async () => {
1406
+ try {
1407
+ const curled = await this.curlControl.sideLoad(reqUrlParsed, {
1408
+ ...options,
1409
+ method: req.method(),
1410
+ body: req.postData(),
1411
+ extraHeaders: {
1412
+ ...req.headers(),
1413
+ ...options.extraHeaders,
1414
+ },
1415
+ proxyUrl: proxy
1416
+ });
1417
+ if (req.isInterceptResolutionHandled()) {
1418
+ return;
1419
+ };
1420
+
1421
+ if (curled.chain.length === 1) {
1422
+ if (!curled.file) {
1423
+ return req.respond({
1424
+ status: curled.status,
1425
+ headers: _.omit(curled.headers, 'result'),
1426
+ contentType: curled.contentType,
1427
+ }, 3);
1428
+ }
1429
+ const body = await readFile(await curled.file.filePath);
1430
+ if (req.isInterceptResolutionHandled()) {
1431
+ return;
1432
+ };
1433
+ return req.respond({
1434
+ status: curled.status,
1435
+ headers: _.omit(curled.headers, 'result'),
1436
+ contentType: curled.contentType,
1437
+ body: Uint8Array.from(body),
1438
+ }, 3);
1439
+ }
1440
+ options.sideLoad ??= curled.sideLoadOpts;
1441
+ _.merge(options.sideLoad, curled.sideLoadOpts);
1442
+ const firstReq = curled.chain[0];
1443
+
1444
+ return req.respond({
1445
+ status: firstReq.result!.code,
1446
+ headers: _.omit(firstReq, 'result'),
1447
+ }, 3);
1448
+ } catch (err: any) {
1449
+ this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
1450
+ }
1451
+ if (req.isInterceptResolutionHandled()) {
1452
+ return;
1453
+ };
1454
+ const overrides = req.continueRequestOverrides();
1455
+ const continueArgs = [{
1456
+ ...overrides,
1457
+ headers: {
1458
+ ...req.headers(),
1459
+ ...overrides?.headers,
1460
+ ...options.extraHeaders,
1461
+ }
1462
+ }, 1] as const;
1463
+
1464
+ return req.continue(continueArgs[0], continueArgs[1]);
1465
+ });
1466
+ }
1467
+
1468
+ if (req.isInterceptResolutionHandled()) {
1469
+ return;
1470
+ };
1471
+ const overrides = req.continueRequestOverrides();
1472
+ const continueArgs = [{
1473
+ ...overrides,
1474
+ headers: {
1475
+ ...req.headers(),
1476
+ ...overrides?.headers,
1477
+ ...options.extraHeaders,
1478
+ }
1479
+ }, 1] as const;
1480
+
1481
+ return req.continue(continueArgs[0], continueArgs[1]);
1482
+ });
1483
+
1484
+ const sn = this.snMap.get(page);
1485
+ this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
1486
+ if (options.locale) {
1487
+ // Add headers via request interception to work around this bug
1488
+ // https://github.com/puppeteer/puppeteer/issues/10235
1489
+ // await page.setExtraHTTPHeaders({
1490
+ // 'Accept-Language': options.locale
1491
+ // });
1492
+
1493
+ await page.evaluateOnNewDocument(() => {
1494
+ Object.defineProperty(navigator, "language", {
1495
+ get: function () {
1496
+ return options.locale;
1497
+ }
1498
+ });
1499
+ Object.defineProperty(navigator, "languages", {
1500
+ get: function () {
1501
+ return [options.locale];
1502
+ }
1503
+ });
1504
+ });
1505
+ }
1506
+
1507
+ if (options.cookies) {
1508
+ const mapped = options.cookies.map((x) => {
1509
+ const draft: CookieParam = {
1510
+ name: x.name,
1511
+ value: encodeURIComponent(x.value),
1512
+ secure: x.secure,
1513
+ domain: x.domain,
1514
+ path: x.path,
1515
+ expires: x.expires ? Math.floor(x.expires.valueOf() / 1000) : undefined,
1516
+ sameSite: x.sameSite as any,
1517
+ };
1518
+ if (!draft.expires && x.maxAge) {
1519
+ draft.expires = Math.floor(Date.now() / 1000) + x.maxAge;
1520
+ }
1521
+ if (!draft.domain) {
1522
+ draft.url = parsedUrl.toString();
1523
+ }
1524
+
1525
+ return draft;
1526
+ });
1527
+ try {
1528
+ await page.setCookie(...mapped);
1529
+ } catch (err: any) {
1530
+ this.logger.warn(`Page ${sn}: Failed to set cookies`, { err });
1531
+ throw new ParamValidationError({
1532
+ path: 'cookies',
1533
+ message: `Failed to set cookies: ${err?.message}`
1534
+ });
1535
+ }
1536
+ }
1537
+ if (options.overrideUserAgent) {
1538
+ await page.setUserAgent(options.overrideUserAgent);
1539
+ }
1540
+ if (options.viewport) {
1541
+ await page.setViewport(options.viewport);
1542
+ }
1543
+
1544
+ let nextSnapshotDeferred = Defer();
1545
+ const crippleListener = () => nextSnapshotDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` }));
1546
+ this.once('crippled', crippleListener);
1547
+ nextSnapshotDeferred.promise.finally(() => {
1548
+ this.off('crippled', crippleListener);
1549
+ });
1550
+ let finalized = false;
1551
+ const hdl = (s: any) => {
1552
+ if (snapshot === s) {
1553
+ return;
1554
+ }
1555
+ snapshot = s;
1556
+ if (snapshot) {
1557
+ const kit = this.pageReqCtrl.get(page);
1558
+ snapshot.lastContentResourceLoaded = kit?.lastContentResourceLoadedAt;
1559
+ snapshot.lastMediaResourceLoaded = kit?.lastMediaResourceLoadedAt;
1560
+ }
1561
+ if (s?.maxElemDepth && s.maxElemDepth > 256) {
1562
+ return;
1563
+ }
1564
+ if (s?.elemCount && s.elemCount > 10_000) {
1565
+ return;
1566
+ }
1567
+ nextSnapshotDeferred.resolve(s);
1568
+ nextSnapshotDeferred = Defer();
1569
+ this.once('crippled', crippleListener);
1570
+ nextSnapshotDeferred.promise.finally(() => {
1571
+ this.off('crippled', crippleListener);
1572
+ });
1573
+ };
1574
+ page.on('snapshot', hdl);
1575
+ page.once('abuse', (event: any) => {
1576
+ this.emit('abuse', { ...event, url: parsedUrl });
1577
+ if (snapshot?.href && parsedUrl.href !== snapshot.href) {
1578
+ this.emit('abuse', { ...event, url: snapshot.href });
1579
+ }
1580
+
1581
+ nextSnapshotDeferred.reject(
1582
+ new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
1583
+ );
1584
+ });
1585
+
1586
+ const timeout = options.timeoutMs || 30_000;
1587
+ const goToOptions: GoToOptions = {
1588
+ waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
1589
+ timeout,
1590
+ };
1591
+
1592
+ if (options.referer) {
1593
+ goToOptions.referer = options.referer;
1594
+ }
1595
+
1596
+ const gotoPromise = page.goto(url, goToOptions)
1597
+ .catch((err) => {
1598
+ if (err instanceof TimeoutError) {
1599
+ this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err });
1600
+ return new AssertionFailureError({
1601
+ message: `Failed to goto ${url}: ${err}`,
1602
+ cause: err,
1603
+ });
1604
+ }
1605
+
1606
+ this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err });
1607
+ return new AssertionFailureError({
1608
+ message: `Failed to goto ${url}: ${err}`,
1609
+ cause: err,
1610
+ });
1611
+ }).then(async (stuff) => {
1612
+ // This check is necessary because without snapshot, the condition of the page is unclear
1613
+ // Calling evaluate directly may stall the process.
1614
+ if (!snapshot) {
1615
+ if (stuff instanceof Error) {
1616
+ finalized = true;
1617
+ throw stuff;
1618
+ }
1619
+ }
1620
+ try {
1621
+ const pSubFrameSnapshots = this.snapshotChildFrames(page);
1622
+ snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
1623
+ if (snapshot) {
1624
+ snapshot.childFrames = await pSubFrameSnapshots;
1625
+ }
1626
+ } catch (err: any) {
1627
+ this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
1628
+ if (stuff instanceof Error) {
1629
+ finalized = true;
1630
+ throw stuff;
1631
+ }
1632
+ }
1633
+ if (!snapshot?.html) {
1634
+ if (stuff instanceof Error) {
1635
+ finalized = true;
1636
+ throw stuff;
1637
+ }
1638
+ }
1639
+
1640
+ finalized = true;
1641
+ if (snapshot?.html) {
1642
+ this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
1643
+ this.emit(
1644
+ 'crawled',
1645
+ {
1646
+ ...snapshot,
1647
+ status: navigationResponse?.status(),
1648
+ statusText: navigationResponse?.statusText(),
1649
+ },
1650
+ { ...options, url: parsedUrl }
1651
+ );
1652
+ }
1653
+ });
1654
+
1655
+ try {
1656
+ while (true) {
1657
+ const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
1658
+ if (options.minIntervalMs) {
1659
+ ckpt.push(delay(options.minIntervalMs));
1660
+ }
1661
+ let error;
1662
+ await Promise.race(ckpt).catch((err) => error = err);
1663
+ if (finalized && !error) {
1664
+ if (!snapshot) {
1665
+ if (error) {
1666
+ throw error;
1667
+ }
1668
+ throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
1669
+ }
1670
+ return {
1671
+ ...snapshot,
1672
+ status: navigationResponse?.status(),
1673
+ statusText: navigationResponse?.statusText(),
1674
+ } as PageSnapshot;
1675
+ }
1676
+
1677
+ if (snapshot?.lastMutationIdle) {
1678
+ return {
1679
+ ...snapshot,
1680
+ status: navigationResponse?.status(),
1681
+ statusText: navigationResponse?.statusText(),
1682
+ } as PageSnapshot;
1683
+ }
1684
+ if (error) {
1685
+ throw error;
1686
+ }
1687
+ }
1688
+ } finally {
1689
+ this.pagePhase.set(page, 'background');
1690
+ page.off('snapshot', hdl);
1691
+ this.ditchPage(page);
1692
+ nextSnapshotDeferred.resolve();
1693
+ }
1694
+ }
1695
+
1696
  }
1697
 
1698
  const puppeteerControl = container.resolve(PuppeteerControl);
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 8c31e85dc52dfcc7d1d86df0328df3a94319b534
 
1
+ Subproject commit 07d23193d85b1d3c8bbd5d0b024a6884ecfe17fd