"use strict"; var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) { var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d; if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc); else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r; return c > 3 && r && Object.defineProperty(target, key, r), r; }; var __metadata = (this && this.__metadata) || function (k, v) { if (typeof Reflect === "object" && typeof Reflect.metadata === "function") return Reflect.metadata(k, v); }; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.CurlControl = void 0; const async_service_1 = require("civkit/async-service"); const tsyringe_1 = require("tsyringe"); const node_libcurl_1 = require("node-libcurl"); const set_cookie_parser_1 = require("set-cookie-parser"); const logger_1 = require("./logger"); const civkit_1 = require("civkit"); const errors_1 = require("./errors"); const temp_file_1 = require("../services/temp-file"); const zlib_1 = require("zlib"); const simple_zstd_1 = require("simple-zstd"); const lodash_1 = __importDefault(require("lodash")); const async_context_1 = require("./async-context"); const blackhole_detector_1 = require("./blackhole-detector"); let CurlControl = class CurlControl extends async_service_1.AsyncService { constructor(globalLogger, tempFileManager, asyncLocalContext, blackHoleDetector) { super(...arguments); this.globalLogger = globalLogger; this.tempFileManager = tempFileManager; this.asyncLocalContext = asyncLocalContext; this.blackHoleDetector = blackHoleDetector; this.logger = this.globalLogger.child({ service: this.constructor.name }); this.chromeVersion = `132`; this.safariVersion = `537.36`; this.platform = `Linux`; this.ua = `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/${this.safariVersion} (KHTML, like Gecko) Chrome/${this.chromeVersion}.0.0.0 Safari/${this.safariVersion}`; this.lifeCycleTrack = new WeakMap(); } async init() { await this.dependencyReady(); if (process.platform === 'darwin') { this.platform = `macOS`; } else if (process.platform === 'win32') { this.platform = `Windows`; } this.emit('ready'); } impersonateChrome(ua) { this.chromeVersion = ua.match(/Chrome\/(\d+)/)[1]; this.safariVersion = ua.match(/AppleWebKit\/([\d\.]+)/)[1]; this.ua = ua; } curlImpersonateHeader(curl, headers) { let uaPlatform = this.platform; if (this.ua.includes('Windows')) { uaPlatform = 'Windows'; } else if (this.ua.includes('Android')) { uaPlatform = 'Android'; } else if (this.ua.includes('iPhone') || this.ua.includes('iPad') || this.ua.includes('iPod')) { uaPlatform = 'iOS'; } else if (this.ua.includes('CrOS')) { uaPlatform = 'Chrome OS'; } else if (this.ua.includes('Macintosh')) { uaPlatform = 'macOS'; } const mixinHeaders = { 'Sec-Ch-Ua': `Not A(Brand";v="8", "Chromium";v="${this.chromeVersion}", "Google Chrome";v="${this.chromeVersion}"`, 'Sec-Ch-Ua-Mobile': '?0', 'Sec-Ch-Ua-Platform': `"${uaPlatform}"`, 'Upgrade-Insecure-Requests': '1', 'User-Agent': this.ua, 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Sec-Fetch-Site': 'none', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-User': '?1', 'Sec-Fetch-Dest': 'document', 'Accept-Encoding': 'gzip, deflate, br, zstd', 'Accept-Language': 'en-US,en;q=0.9', }; const headersCopy = { ...headers }; for (const k of Object.keys(mixinHeaders)) { const lowerK = k.toLowerCase(); if (headersCopy[lowerK]) { mixinHeaders[k] = headersCopy[lowerK]; delete headersCopy[lowerK]; } } Object.assign(mixinHeaders, headersCopy); curl.setOpt(node_libcurl_1.Curl.option.HTTPHEADER, Object.entries(mixinHeaders).flatMap(([k, v]) => { if (Array.isArray(v) && v.length) { return v.map((v2) => `${k}: ${v2}`); } return [`${k}: ${v}`]; })); return curl; } urlToFile1Shot(urlToCrawl, crawlOpts) { return new Promise((resolve, reject) => { let contentType = ''; const curl = new node_libcurl_1.Curl(); curl.enable(node_libcurl_1.CurlFeature.StreamResponse); curl.setOpt('URL', urlToCrawl.toString()); curl.setOpt(node_libcurl_1.Curl.option.FOLLOWLOCATION, false); curl.setOpt(node_libcurl_1.Curl.option.SSL_VERIFYPEER, false); curl.setOpt(node_libcurl_1.Curl.option.TIMEOUT_MS, crawlOpts?.timeoutMs || 30_000); curl.setOpt(node_libcurl_1.Curl.option.CONNECTTIMEOUT_MS, 3_000); curl.setOpt(node_libcurl_1.Curl.option.LOW_SPEED_LIMIT, 32768); curl.setOpt(node_libcurl_1.Curl.option.LOW_SPEED_TIME, 5_000); if (crawlOpts?.method) { curl.setOpt(node_libcurl_1.Curl.option.CUSTOMREQUEST, crawlOpts.method.toUpperCase()); } if (crawlOpts?.body) { curl.setOpt(node_libcurl_1.Curl.option.POSTFIELDS, crawlOpts.body.toString()); } const headersToSet = { ...crawlOpts?.extraHeaders }; if (crawlOpts?.cookies?.length) { const cookieKv = {}; for (const cookie of crawlOpts.cookies) { cookieKv[cookie.name] = cookie.value; } for (const cookie of crawlOpts.cookies) { if (cookie.maxAge && cookie.maxAge < 0) { delete cookieKv[cookie.name]; continue; } if (cookie.expires && cookie.expires < new Date()) { delete cookieKv[cookie.name]; continue; } if (cookie.secure && urlToCrawl.protocol !== 'https:') { delete cookieKv[cookie.name]; continue; } if (cookie.domain && !urlToCrawl.hostname.endsWith(cookie.domain)) { delete cookieKv[cookie.name]; continue; } if (cookie.path && !urlToCrawl.pathname.startsWith(cookie.path)) { delete cookieKv[cookie.name]; continue; } } const cookieChunks = Object.entries(cookieKv).map(([k, v]) => `${k}=${encodeURIComponent(v)}`); headersToSet.cookie ??= cookieChunks.join('; '); } if (crawlOpts?.referer) { headersToSet.referer ??= crawlOpts.referer; } if (crawlOpts?.overrideUserAgent) { headersToSet['user-agent'] ??= crawlOpts.overrideUserAgent; } this.curlImpersonateHeader(curl, headersToSet); if (crawlOpts?.proxyUrl) { const proxyUrlCopy = new URL(crawlOpts.proxyUrl); curl.setOpt(node_libcurl_1.Curl.option.PROXY, proxyUrlCopy.href); } let curlStream; curl.on('error', (err, errCode) => { curl.close(); this.logger.warn(`Curl ${urlToCrawl.origin}: ${err}`, { err, urlToCrawl }); const err2 = this.digestCurlCode(errCode, err.message) || new civkit_1.AssertionFailureError(`Failed to access ${urlToCrawl.origin}: ${err.message}`); err2.cause ??= err; if (curlStream) { // For some reason, manually emitting error event is required for curlStream. curlStream.emit('error', err2); curlStream.destroy(err2); } reject(err2); }); curl.setOpt(node_libcurl_1.Curl.option.MAXFILESIZE, 4 * 1024 * 1024 * 1024); // 4GB let status = -1; let statusText; let contentEncoding = ''; curl.once('end', () => { if (curlStream) { curlStream.once('end', () => curl.close()); return; } curl.close(); }); curl.on('stream', (stream, statusCode, headers) => { this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl.origin}`, { statusCode }); status = statusCode; curlStream = stream; for (const headerSet of headers) { for (const [k, v] of Object.entries(headerSet)) { if (k.trim().endsWith(':')) { Reflect.set(headerSet, k.slice(0, k.indexOf(':')), v || ''); Reflect.deleteProperty(headerSet, k); continue; } if (v === undefined) { Reflect.set(headerSet, k, ''); continue; } if (k.toLowerCase() === 'content-type' && typeof v === 'string') { contentType = v.toLowerCase(); } } } const lastResHeaders = headers[headers.length - 1]; statusText = lastResHeaders.result?.reason; for (const [k, v] of Object.entries(lastResHeaders)) { const kl = k.toLowerCase(); if (kl === 'content-type') { contentType = (v || '').toLowerCase(); } if (kl === 'content-encoding') { contentEncoding = (v || '').toLowerCase(); } if (contentType && contentEncoding) { break; } } if ([301, 302, 303, 307, 308].includes(statusCode)) { if (stream) { stream.resume(); } resolve({ statusCode: status, statusText, data: undefined, headers: headers, }); return; } if (!stream) { resolve({ statusCode: status, statusText, data: undefined, headers: headers, }); return; } switch (contentEncoding) { case 'gzip': { const decompressed = (0, zlib_1.createGunzip)(); stream.pipe(decompressed); stream.once('error', (err) => { decompressed.destroy(err); }); stream = decompressed; break; } case 'deflate': { const decompressed = (0, zlib_1.createInflate)(); stream.pipe(decompressed); stream.once('error', (err) => { decompressed.destroy(err); }); stream = decompressed; break; } case 'br': { const decompressed = (0, zlib_1.createBrotliDecompress)(); stream.pipe(decompressed); stream.once('error', (err) => { decompressed.destroy(err); }); stream = decompressed; break; } case 'zstd': { const decompressed = (0, simple_zstd_1.ZSTDDecompress)(); stream.pipe(decompressed); stream.once('error', (err) => { decompressed.destroy(err); }); stream = decompressed; break; } default: { break; } } const fpath = this.tempFileManager.alloc(); const fancyFile = civkit_1.FancyFile.auto(stream, fpath); this.tempFileManager.bindPathTo(fancyFile, fpath); resolve({ statusCode: status, statusText, data: fancyFile, headers: headers, }); }); curl.perform(); }); } async urlToFile(urlToCrawl, crawlOpts) { let leftRedirection = 6; let cookieRedirects = 0; let opts = { ...crawlOpts }; let nextHopUrl = urlToCrawl; const fakeHeaderInfos = []; do { const r = await this.urlToFile1Shot(nextHopUrl, opts); if ([301, 302, 303, 307, 308].includes(r.statusCode)) { fakeHeaderInfos.push(...r.headers); const headers = r.headers[r.headers.length - 1]; const location = headers.Location || headers.location; const setCookieHeader = headers['Set-Cookie'] || headers['set-cookie']; if (setCookieHeader) { const cookieAssignments = Array.isArray(setCookieHeader) ? setCookieHeader : [setCookieHeader]; const parsed = cookieAssignments.filter(Boolean).map((x) => (0, set_cookie_parser_1.parseString)(x, { decodeValues: true })); if (parsed.length) { opts.cookies = [...(opts.cookies || []), ...parsed]; } if (!location) { cookieRedirects += 1; } } if (!location && !setCookieHeader) { // Follow curl behavior return { statusCode: r.statusCode, data: r.data, headers: fakeHeaderInfos.concat(r.headers), }; } if (!location && cookieRedirects > 1) { throw new errors_1.ServiceBadApproachError(`Failed to access ${urlToCrawl}: Browser required to solve complex cookie preconditions.`); } nextHopUrl = new URL(location || '', nextHopUrl); leftRedirection -= 1; continue; } return { statusCode: r.statusCode, statusText: r.statusText, data: r.data, headers: fakeHeaderInfos.concat(r.headers), }; } while (leftRedirection > 0); throw new errors_1.ServiceBadAttemptError(`Failed to access ${urlToCrawl}: Too many redirections.`); } async sideLoad(targetUrl, crawlOpts) { const curlResult = await this.urlToFile(targetUrl, crawlOpts); this.blackHoleDetector.itWorked(); let finalURL = targetUrl; const sideLoadOpts = { impersonate: {}, proxyOrigin: {}, }; for (const headers of curlResult.headers) { sideLoadOpts.impersonate[finalURL.href] = { status: headers.result?.code || -1, headers: lodash_1.default.omit(headers, 'result'), contentType: headers['Content-Type'] || headers['content-type'], }; if (crawlOpts?.proxyUrl) { sideLoadOpts.proxyOrigin[finalURL.origin] = crawlOpts.proxyUrl; } if (headers.result?.code && [301, 302, 307, 308].includes(headers.result.code)) { const location = headers.Location || headers.location; if (location) { finalURL = new URL(location, finalURL); } } } const lastHeaders = curlResult.headers[curlResult.headers.length - 1]; const contentType = (lastHeaders['Content-Type'] || lastHeaders['content-type'])?.toLowerCase() || (await curlResult.data?.mimeType) || 'application/octet-stream'; const contentDisposition = lastHeaders['Content-Disposition'] || lastHeaders['content-disposition']; const fileName = contentDisposition?.match(/filename="([^"]+)"/i)?.[1] || finalURL.pathname.split('/').pop(); if (sideLoadOpts.impersonate[finalURL.href] && (await curlResult.data?.size)) { sideLoadOpts.impersonate[finalURL.href].body = curlResult.data; } // This should keep the file from being garbage collected and deleted until this asyncContext/request is done. this.lifeCycleTrack.set(this.asyncLocalContext.ctx, curlResult.data); return { finalURL, sideLoadOpts, chain: curlResult.headers, status: curlResult.statusCode, statusText: curlResult.statusText, headers: lastHeaders, contentType, contentDisposition, fileName, file: curlResult.data }; } digestCurlCode(code, msg) { switch (code) { // 400 User errors case node_libcurl_1.CurlCode.CURLE_COULDNT_RESOLVE_HOST: { return new civkit_1.AssertionFailureError(msg); } // Maybe retry but dont retry with curl again case node_libcurl_1.CurlCode.CURLE_OPERATION_TIMEDOUT: case node_libcurl_1.CurlCode.CURLE_UNSUPPORTED_PROTOCOL: case node_libcurl_1.CurlCode.CURLE_PEER_FAILED_VERIFICATION: { return new errors_1.ServiceBadApproachError(msg); } // Retryable errors case node_libcurl_1.CurlCode.CURLE_REMOTE_ACCESS_DENIED: case node_libcurl_1.CurlCode.CURLE_SEND_ERROR: case node_libcurl_1.CurlCode.CURLE_RECV_ERROR: case node_libcurl_1.CurlCode.CURLE_GOT_NOTHING: case node_libcurl_1.CurlCode.CURLE_SSL_CONNECT_ERROR: case node_libcurl_1.CurlCode.CURLE_QUIC_CONNECT_ERROR: case node_libcurl_1.CurlCode.CURLE_COULDNT_RESOLVE_PROXY: case node_libcurl_1.CurlCode.CURLE_COULDNT_CONNECT: case node_libcurl_1.CurlCode.CURLE_PARTIAL_FILE: { return new errors_1.ServiceBadAttemptError(msg); } default: { return undefined; } } } }; exports.CurlControl = CurlControl; exports.CurlControl = CurlControl = __decorate([ (0, tsyringe_1.singleton)(), __metadata("design:paramtypes", [logger_1.GlobalLogger, temp_file_1.TempFileManager, async_context_1.AsyncLocalContext, blackhole_detector_1.BlackHoleDetector]) ], CurlControl); //# sourceMappingURL=curl.js.map