Spaces:
Build error
Build error
fix: expose status code/text from curl
Browse files- src/api/crawler.ts +8 -0
- src/services/curl.ts +8 -0
- src/services/puppeteer.ts +373 -6
- thinapps-shared +1 -1
src/api/crawler.ts
CHANGED
|
@@ -9,6 +9,7 @@ import {
|
|
| 9 |
RawString,
|
| 10 |
ApplicationError,
|
| 11 |
DataStreamBrokenError,
|
|
|
|
| 12 |
} from 'civkit/civ-rpc';
|
| 13 |
import { marshalErrorLike } from 'civkit/lang';
|
| 14 |
import { Defer } from 'civkit/defer';
|
|
@@ -755,6 +756,8 @@ export class CrawlerHost extends RPCHost {
|
|
| 755 |
throw new AssertionFailureError(`Remote server did not return a body: ${urlToCrawl}`);
|
| 756 |
}
|
| 757 |
const draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName);
|
|
|
|
|
|
|
| 758 |
yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
|
| 759 |
return;
|
| 760 |
}
|
|
@@ -822,6 +825,8 @@ export class CrawlerHost extends RPCHost {
|
|
| 822 |
}
|
| 823 |
return Promise.reject(err);
|
| 824 |
});
|
|
|
|
|
|
|
| 825 |
if (sideLoaded.status == 200 && !sideLoaded.contentType.startsWith('text/html')) {
|
| 826 |
yield draftSnapshot;
|
| 827 |
return;
|
|
@@ -849,6 +854,8 @@ export class CrawlerHost extends RPCHost {
|
|
| 849 |
}
|
| 850 |
return Promise.reject(err);
|
| 851 |
});
|
|
|
|
|
|
|
| 852 |
if (proxyLoaded.status === 200 && crawlerOpts?.browserIsNotRequired()) {
|
| 853 |
}
|
| 854 |
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
|
|
@@ -931,6 +938,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 931 |
}
|
| 932 |
|
| 933 |
Object.assign(formatted, { usage: { tokens: amount } });
|
|
|
|
| 934 |
|
| 935 |
return amount;
|
| 936 |
}
|
|
|
|
| 9 |
RawString,
|
| 10 |
ApplicationError,
|
| 11 |
DataStreamBrokenError,
|
| 12 |
+
assignMeta,
|
| 13 |
} from 'civkit/civ-rpc';
|
| 14 |
import { marshalErrorLike } from 'civkit/lang';
|
| 15 |
import { Defer } from 'civkit/defer';
|
|
|
|
| 756 |
throw new AssertionFailureError(`Remote server did not return a body: ${urlToCrawl}`);
|
| 757 |
}
|
| 758 |
const draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName);
|
| 759 |
+
draftSnapshot.status = sideLoaded.status;
|
| 760 |
+
draftSnapshot.statusText = sideLoaded.statusText;
|
| 761 |
yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
|
| 762 |
return;
|
| 763 |
}
|
|
|
|
| 825 |
}
|
| 826 |
return Promise.reject(err);
|
| 827 |
});
|
| 828 |
+
draftSnapshot.status = sideLoaded.status;
|
| 829 |
+
draftSnapshot.statusText = sideLoaded.statusText;
|
| 830 |
if (sideLoaded.status == 200 && !sideLoaded.contentType.startsWith('text/html')) {
|
| 831 |
yield draftSnapshot;
|
| 832 |
return;
|
|
|
|
| 854 |
}
|
| 855 |
return Promise.reject(err);
|
| 856 |
});
|
| 857 |
+
proxySnapshot.status = proxyLoaded.status;
|
| 858 |
+
proxySnapshot.statusText = proxyLoaded.statusText;
|
| 859 |
if (proxyLoaded.status === 200 && crawlerOpts?.browserIsNotRequired()) {
|
| 860 |
}
|
| 861 |
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
|
|
|
|
| 938 |
}
|
| 939 |
|
| 940 |
Object.assign(formatted, { usage: { tokens: amount } });
|
| 941 |
+
assignMeta(formatted, { usage: { tokens: amount } });
|
| 942 |
|
| 943 |
return amount;
|
| 944 |
}
|
src/services/curl.ts
CHANGED
|
@@ -98,6 +98,7 @@ export class CurlControl extends AsyncService {
|
|
| 98 |
urlToFile1Shot(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
|
| 99 |
return new Promise<{
|
| 100 |
statusCode: number,
|
|
|
|
| 101 |
data?: FancyFile,
|
| 102 |
headers: HeaderInfo[],
|
| 103 |
}>((resolve, reject) => {
|
|
@@ -179,6 +180,7 @@ export class CurlControl extends AsyncService {
|
|
| 179 |
});
|
| 180 |
curl.setOpt(Curl.option.MAXFILESIZE, 4 * 1024 * 1024 * 1024); // 4GB
|
| 181 |
let status = -1;
|
|
|
|
| 182 |
let contentEncoding = '';
|
| 183 |
curl.once('end', () => {
|
| 184 |
if (curlStream) {
|
|
@@ -208,6 +210,7 @@ export class CurlControl extends AsyncService {
|
|
| 208 |
}
|
| 209 |
}
|
| 210 |
const lastResHeaders = headers[headers.length - 1];
|
|
|
|
| 211 |
for (const [k, v] of Object.entries(lastResHeaders)) {
|
| 212 |
const kl = k.toLowerCase();
|
| 213 |
if (kl === 'content-type') {
|
|
@@ -227,6 +230,7 @@ export class CurlControl extends AsyncService {
|
|
| 227 |
}
|
| 228 |
resolve({
|
| 229 |
statusCode: status,
|
|
|
|
| 230 |
data: undefined,
|
| 231 |
headers: headers as HeaderInfo[],
|
| 232 |
});
|
|
@@ -236,6 +240,7 @@ export class CurlControl extends AsyncService {
|
|
| 236 |
if (!stream) {
|
| 237 |
resolve({
|
| 238 |
statusCode: status,
|
|
|
|
| 239 |
data: undefined,
|
| 240 |
headers: headers as HeaderInfo[],
|
| 241 |
});
|
|
@@ -289,6 +294,7 @@ export class CurlControl extends AsyncService {
|
|
| 289 |
this.tempFileManager.bindPathTo(fancyFile, fpath);
|
| 290 |
resolve({
|
| 291 |
statusCode: status,
|
|
|
|
| 292 |
data: fancyFile,
|
| 293 |
headers: headers as HeaderInfo[],
|
| 294 |
});
|
|
@@ -343,6 +349,7 @@ export class CurlControl extends AsyncService {
|
|
| 343 |
|
| 344 |
return {
|
| 345 |
statusCode: r.statusCode,
|
|
|
|
| 346 |
data: r.data,
|
| 347 |
headers: fakeHeaderInfos.concat(r.headers),
|
| 348 |
};
|
|
@@ -392,6 +399,7 @@ export class CurlControl extends AsyncService {
|
|
| 392 |
sideLoadOpts,
|
| 393 |
chain: curlResult.headers,
|
| 394 |
status: curlResult.statusCode,
|
|
|
|
| 395 |
headers: lastHeaders,
|
| 396 |
contentType,
|
| 397 |
contentDisposition,
|
|
|
|
| 98 |
urlToFile1Shot(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
|
| 99 |
return new Promise<{
|
| 100 |
statusCode: number,
|
| 101 |
+
statusText?: string,
|
| 102 |
data?: FancyFile,
|
| 103 |
headers: HeaderInfo[],
|
| 104 |
}>((resolve, reject) => {
|
|
|
|
| 180 |
});
|
| 181 |
curl.setOpt(Curl.option.MAXFILESIZE, 4 * 1024 * 1024 * 1024); // 4GB
|
| 182 |
let status = -1;
|
| 183 |
+
let statusText: string|undefined;
|
| 184 |
let contentEncoding = '';
|
| 185 |
curl.once('end', () => {
|
| 186 |
if (curlStream) {
|
|
|
|
| 210 |
}
|
| 211 |
}
|
| 212 |
const lastResHeaders = headers[headers.length - 1];
|
| 213 |
+
statusText = (lastResHeaders as HeaderInfo).result?.reason;
|
| 214 |
for (const [k, v] of Object.entries(lastResHeaders)) {
|
| 215 |
const kl = k.toLowerCase();
|
| 216 |
if (kl === 'content-type') {
|
|
|
|
| 230 |
}
|
| 231 |
resolve({
|
| 232 |
statusCode: status,
|
| 233 |
+
statusText,
|
| 234 |
data: undefined,
|
| 235 |
headers: headers as HeaderInfo[],
|
| 236 |
});
|
|
|
|
| 240 |
if (!stream) {
|
| 241 |
resolve({
|
| 242 |
statusCode: status,
|
| 243 |
+
statusText,
|
| 244 |
data: undefined,
|
| 245 |
headers: headers as HeaderInfo[],
|
| 246 |
});
|
|
|
|
| 294 |
this.tempFileManager.bindPathTo(fancyFile, fpath);
|
| 295 |
resolve({
|
| 296 |
statusCode: status,
|
| 297 |
+
statusText,
|
| 298 |
data: fancyFile,
|
| 299 |
headers: headers as HeaderInfo[],
|
| 300 |
});
|
|
|
|
| 349 |
|
| 350 |
return {
|
| 351 |
statusCode: r.statusCode,
|
| 352 |
+
statusText: r.statusText,
|
| 353 |
data: r.data,
|
| 354 |
headers: fakeHeaderInfos.concat(r.headers),
|
| 355 |
};
|
|
|
|
| 399 |
sideLoadOpts,
|
| 400 |
chain: curlResult.headers,
|
| 401 |
status: curlResult.statusCode,
|
| 402 |
+
statusText: curlResult.statusText,
|
| 403 |
headers: lastHeaders,
|
| 404 |
contentType,
|
| 405 |
contentDisposition,
|
src/services/puppeteer.ts
CHANGED
|
@@ -1176,8 +1176,8 @@ export class PuppeteerControl extends AsyncService {
|
|
| 1176 |
try {
|
| 1177 |
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 1178 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 1179 |
-
screenshot = await this.takeScreenShot(page);
|
| 1180 |
-
pageshot = await this.takeScreenShot(page, { fullPage: true });
|
| 1181 |
if (snapshot) {
|
| 1182 |
snapshot.childFrames = await pSubFrameSnapshots;
|
| 1183 |
}
|
|
@@ -1224,8 +1224,8 @@ export class PuppeteerControl extends AsyncService {
|
|
| 1224 |
.then(async () => {
|
| 1225 |
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 1226 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 1227 |
-
screenshot = await this.takeScreenShot(page);
|
| 1228 |
-
pageshot = await this.takeScreenShot(page, { fullPage: true });
|
| 1229 |
if (snapshot) {
|
| 1230 |
snapshot.childFrames = await pSubFrameSnapshots;
|
| 1231 |
}
|
|
@@ -1267,8 +1267,8 @@ export class PuppeteerControl extends AsyncService {
|
|
| 1267 |
break;
|
| 1268 |
}
|
| 1269 |
if (options.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
| 1270 |
-
screenshot = await this.takeScreenShot(page);
|
| 1271 |
-
pageshot = await this.takeScreenShot(page, { fullPage: true });
|
| 1272 |
lastHTML = snapshot.html;
|
| 1273 |
}
|
| 1274 |
if (snapshot || screenshot) {
|
|
@@ -1326,6 +1326,373 @@ export class PuppeteerControl extends AsyncService {
|
|
| 1326 |
return r.filter(Boolean);
|
| 1327 |
}
|
| 1328 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1329 |
}
|
| 1330 |
|
| 1331 |
const puppeteerControl = container.resolve(PuppeteerControl);
|
|
|
|
| 1176 |
try {
|
| 1177 |
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 1178 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 1179 |
+
screenshot = (await this.takeScreenShot(page)) || screenshot;
|
| 1180 |
+
pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
|
| 1181 |
if (snapshot) {
|
| 1182 |
snapshot.childFrames = await pSubFrameSnapshots;
|
| 1183 |
}
|
|
|
|
| 1224 |
.then(async () => {
|
| 1225 |
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 1226 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 1227 |
+
screenshot = (await this.takeScreenShot(page)) || screenshot;
|
| 1228 |
+
pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
|
| 1229 |
if (snapshot) {
|
| 1230 |
snapshot.childFrames = await pSubFrameSnapshots;
|
| 1231 |
}
|
|
|
|
| 1267 |
break;
|
| 1268 |
}
|
| 1269 |
if (options.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
| 1270 |
+
screenshot = (await this.takeScreenShot(page)) || screenshot;
|
| 1271 |
+
pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
|
| 1272 |
lastHTML = snapshot.html;
|
| 1273 |
}
|
| 1274 |
if (snapshot || screenshot) {
|
|
|
|
| 1326 |
return r.filter(Boolean);
|
| 1327 |
}
|
| 1328 |
|
| 1329 |
+
/**
 * Load `parsedUrl` in a pooled browser page and return one snapshot of it.
 *
 * The returned snapshot is annotated with the `status` / `statusText` of the
 * main-frame navigation response (both `undefined` if no such response was seen).
 *
 * Resolution order inside the wait loop:
 *  1. navigation has been finalized without error -> return the final snapshot;
 *  2. the latest snapshot reports `lastMutationIdle` -> return it early;
 *  3. otherwise rethrow any captured error, or keep waiting.
 *
 * @param parsedUrl - Target URL to navigate to.
 * @param options - Scrapping options (locale, cookies, proxy/side-load settings, timeouts, ...).
 * @returns The page snapshot merged with the navigation response status fields.
 * @throws ParamValidationError when the supplied cookies cannot be set on the page.
 * @throws AssertionFailureError when navigation fails without a usable snapshot,
 *         or when the page finalizes with no snapshot at all.
 * @throws ServiceCrashedError when this service emits `crippled` while waiting.
 * @throws SecurityCompromiseError when the page emits `abuse`.
 */
async simpleScrap(parsedUrl: URL, options: ScrappingOptions = {}): Promise<PageSnapshot> {
    // parsedUrl.search = '';
    const url = parsedUrl.toString();
    let snapshot: PageSnapshot | undefined;
    let navigationResponse: HTTPResponse | undefined;
    const page = await this.getNextPage();
    this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx);
    this.pagePhase.set(page, 'active');

    // Snapshot + HTTP status of the main-frame navigation, evaluated at call time.
    const snapshotWithStatus = () => ({
        ...snapshot,
        status: navigationResponse?.status(),
        statusText: navigationResponse?.statusText(),
    }) as PageSnapshot;

    page.on('response', (resp) => {
        this.blackHoleDetector.itWorked();
        const req = resp.request();
        // Remember the response that answered the top-level navigation.
        if (req.frame() === page.mainFrame() && req.isNavigationRequest()) {
            navigationResponse = resp;
        }
    });

    page.on('request', async (req) => {
        if (req.isInterceptResolutionHandled()) {
            return;
        }

        // Continue the request with request headers, prior overrides and extraHeaders merged (priority 1).
        const continueWithExtraHeaders = () => {
            const overrides = req.continueRequestOverrides();

            return req.continue({
                ...overrides,
                headers: {
                    ...req.headers(),
                    ...overrides?.headers,
                    ...options.extraHeaders,
                },
            }, 1);
        };

        const reqUrlParsed = new URL(req.url());
        if (!reqUrlParsed.protocol.startsWith('http')) {
            return req.continue(req.continueRequestOverrides(), 0);
        }

        const typ = req.resourceType();
        if (typ === 'media') {
            // Non-cooperative answer to block all media requests.
            return req.abort('blockedbyclient');
        }

        if (!options.proxyResources) {
            const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
            if (!isDocRequest) {
                // Sub-resources are never side-loaded unless proxyResources is set.
                if (options.extraHeaders) {
                    return continueWithExtraHeaders();
                }

                return req.continue(req.continueRequestOverrides(), 0);
            }
        }

        const sideload = options.sideLoad;

        // A pre-recorded response for this exact URL wins over everything else.
        const impersonate = sideload?.impersonate[reqUrlParsed.href];
        if (impersonate) {
            let body;
            if (impersonate.body) {
                body = await readFile(await impersonate.body.filePath);
                // The request may have been resolved by another handler while we were reading.
                if (req.isInterceptResolutionHandled()) {
                    return;
                }
            }

            return req.respond({
                status: impersonate.status,
                headers: impersonate.headers,
                contentType: impersonate.contentType,
                body: body ? Uint8Array.from(body) : undefined,
            }, 999);
        }

        const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
        const ctx = this.lifeCycleTrack.get(page);
        if (proxy && ctx) {
            return await this.asyncLocalContext.bridge(ctx, async () => {
                try {
                    const curled = await this.curlControl.sideLoad(reqUrlParsed, {
                        ...options,
                        method: req.method(),
                        body: req.postData(),
                        extraHeaders: {
                            ...req.headers(),
                            ...options.extraHeaders,
                        },
                        proxyUrl: proxy
                    });
                    if (req.isInterceptResolutionHandled()) {
                        return;
                    }

                    if (curled.chain.length === 1) {
                        // Single hop: answer the browser with the side-loaded response itself.
                        if (!curled.file) {
                            return req.respond({
                                status: curled.status,
                                headers: _.omit(curled.headers, 'result'),
                                contentType: curled.contentType,
                            }, 3);
                        }
                        const body = await readFile(await curled.file.filePath);
                        if (req.isInterceptResolutionHandled()) {
                            return;
                        }

                        return req.respond({
                            status: curled.status,
                            headers: _.omit(curled.headers, 'result'),
                            contentType: curled.contentType,
                            body: Uint8Array.from(body),
                        }, 3);
                    }

                    // Multi-hop chain: record the side-load options, then replay only the first hop.
                    options.sideLoad ??= curled.sideLoadOpts;
                    _.merge(options.sideLoad, curled.sideLoadOpts);
                    const firstReq = curled.chain[0];

                    return req.respond({
                        status: firstReq.result!.code,
                        headers: _.omit(firstReq, 'result'),
                    }, 3);
                } catch (err: any) {
                    // Best effort: fall through and let the browser perform the request itself.
                    this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
                }
                if (req.isInterceptResolutionHandled()) {
                    return;
                }

                return continueWithExtraHeaders();
            });
        }

        if (req.isInterceptResolutionHandled()) {
            return;
        }

        return continueWithExtraHeaders();
    });

    const sn = this.snMap.get(page);
    this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
    if (options.locale) {
        // Add headers via request interception to walk around this bug
        // https://github.com/puppeteer/puppeteer/issues/10235
        // await page.setExtraHTTPHeaders({
        //     'Accept-Language': options.locale
        // });

        // The callback is serialized and executed inside the browser, where `options`
        // does not exist; the locale therefore has to be handed over as an argument.
        await page.evaluateOnNewDocument((locale: string) => {
            Object.defineProperty(navigator, "language", {
                get: function () {
                    return locale;
                }
            });
            Object.defineProperty(navigator, "languages", {
                get: function () {
                    return [locale];
                }
            });
        }, options.locale);
    }

    if (options.cookies) {
        const mapped = options.cookies.map((x) => {
            const draft: CookieParam = {
                name: x.name,
                value: encodeURIComponent(x.value),
                secure: x.secure,
                domain: x.domain,
                path: x.path,
                expires: x.expires ? Math.floor(x.expires.valueOf() / 1000) : undefined,
                sameSite: x.sameSite as any,
            };
            // Fall back to maxAge (seconds from now) when no explicit expiry was given.
            if (!draft.expires && x.maxAge) {
                draft.expires = Math.floor(Date.now() / 1000) + x.maxAge;
            }
            // Without a domain the cookie is scoped to the target URL.
            if (!draft.domain) {
                draft.url = parsedUrl.toString();
            }

            return draft;
        });
        try {
            await page.setCookie(...mapped);
        } catch (err: any) {
            this.logger.warn(`Page ${sn}: Failed to set cookies`, { err });
            throw new ParamValidationError({
                path: 'cookies',
                message: `Failed to set cookies: ${err?.message}`
            });
        }
    }
    if (options.overrideUserAgent) {
        await page.setUserAgent(options.overrideUserAgent);
    }
    if (options.viewport) {
        await page.setViewport(options.viewport);
    }

    let nextSnapshotDeferred = Defer();
    const crippleListener = () => nextSnapshotDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` }));
    // Reject the currently pending deferred on 'crippled'; unsubscribe once that deferred settles.
    const watchForCrash = () => {
        this.once('crippled', crippleListener);
        nextSnapshotDeferred.promise.finally(() => {
            this.off('crippled', crippleListener);
        });
    };
    watchForCrash();

    let finalized = false;
    const hdl = (s: any) => {
        if (snapshot === s) {
            return;
        }
        snapshot = s;
        if (snapshot) {
            const kit = this.pageReqCtrl.get(page);
            snapshot.lastContentResourceLoaded = kit?.lastContentResourceLoadedAt;
            snapshot.lastMediaResourceLoaded = kit?.lastMediaResourceLoadedAt;
        }
        // Very deep or very large DOMs are recorded but do not wake up the wait loop.
        if (s?.maxElemDepth && s.maxElemDepth > 256) {
            return;
        }
        if (s?.elemCount && s.elemCount > 10_000) {
            return;
        }
        nextSnapshotDeferred.resolve(s);
        nextSnapshotDeferred = Defer();
        watchForCrash();
    };
    page.on('snapshot', hdl);
    page.once('abuse', (event: any) => {
        this.emit('abuse', { ...event, url: parsedUrl });
        if (snapshot?.href && parsedUrl.href !== snapshot.href) {
            this.emit('abuse', { ...event, url: snapshot.href });
        }

        nextSnapshotDeferred.reject(
            new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
        );
    });

    const timeout = options.timeoutMs || 30_000;
    const goToOptions: GoToOptions = {
        waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
        timeout,
    };

    if (options.referer) {
        goToOptions.referer = options.referer;
    }

    const gotoPromise = page.goto(url, goToOptions)
        .catch((err) => {
            // Navigation failures are turned into a value so the next step can still
            // try to salvage a snapshot before deciding whether to throw.
            const outcome = err instanceof TimeoutError ? 'timed out' : 'failed';
            this.logger.warn(`Page ${sn}: Browsing of ${url} ${outcome}`, { err });

            return new AssertionFailureError({
                message: `Failed to goto ${url}: ${err}`,
                cause: err,
            });
        }).then(async (stuff) => {
            // This check is necessary because without snapshot, the condition of the page is unclear
            // Calling evaluate directly may stall the process.
            if (!snapshot && stuff instanceof Error) {
                finalized = true;
                throw stuff;
            }
            try {
                const pSubFrameSnapshots = this.snapshotChildFrames(page);
                snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
                if (snapshot) {
                    snapshot.childFrames = await pSubFrameSnapshots;
                }
            } catch (err: any) {
                this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
                if (stuff instanceof Error) {
                    finalized = true;
                    throw stuff;
                }
            }
            if (!snapshot?.html && stuff instanceof Error) {
                finalized = true;
                throw stuff;
            }

            finalized = true;
            if (snapshot?.html) {
                this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
                this.emit(
                    'crawled',
                    snapshotWithStatus(),
                    { ...options, url: parsedUrl }
                );
            }
        });

    try {
        while (true) {
            const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
            if (options.minIntervalMs) {
                ckpt.push(delay(options.minIntervalMs));
            }
            let error;
            await Promise.race(ckpt).catch((err) => error = err);
            if (finalized && !error) {
                if (!snapshot) {
                    throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
                }

                return snapshotWithStatus();
            }

            // A snapshot flagged with lastMutationIdle is returned even if an error was captured.
            if (snapshot?.lastMutationIdle) {
                return snapshotWithStatus();
            }
            if (error) {
                throw error;
            }
        }
    } finally {
        this.pagePhase.set(page, 'background');
        page.off('snapshot', hdl);
        this.ditchPage(page);
        nextSnapshotDeferred.resolve();
    }
}
|
| 1695 |
+
|
| 1696 |
}
|
| 1697 |
|
| 1698 |
const puppeteerControl = container.resolve(PuppeteerControl);
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit 07d23193d85b1d3c8bbd5d0b024a6884ecfe17fd
|