Spaces:
Build error
Build error
| ; | |
/**
 * Decorator-application helper. An existing `this.__decorate` is reused when present.
 *
 * The number of arguments selects the calling convention:
 *   - fewer than 3: each decorator is called as `decorator(currentResult)`, starting from `target`;
 *   - exactly 3:    each decorator is called as `decorator(target, key)`;
 *   - more than 3:  each decorator is called as `decorator(target, key, descriptor)`, and the
 *                   resulting descriptor (if truthy) is defined on `target[key]`.
 * Decorators run from last to first; a falsy decorator return keeps the previous result.
 *
 * @param {Function[]} decorators - Decorators to apply; falsy entries are skipped.
 * @param {object} target - Object being decorated.
 * @param {string|symbol} [key] - Member name.
 * @param {object|null} [desc] - Descriptor; `null` means "look up the own descriptor of target[key]".
 * @returns {*} The final result produced by the decorator chain.
 */
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
    const argCount = arguments.length;
    let result;
    if (argCount < 3) {
        result = target;
    }
    else {
        if (desc === null) {
            desc = Object.getOwnPropertyDescriptor(target, key);
        }
        result = desc;
    }
    if (typeof Reflect === "object" && typeof Reflect.decorate === "function") {
        // Delegate entirely when the runtime provides Reflect.decorate.
        result = Reflect.decorate(decorators, target, key, desc);
    }
    else {
        for (let idx = decorators.length - 1; idx >= 0; idx--) {
            const decorator = decorators[idx];
            if (!decorator) {
                continue;
            }
            let produced;
            if (argCount < 3) {
                produced = decorator(result);
            }
            else if (argCount > 3) {
                produced = decorator(target, key, result);
            }
            else {
                produced = decorator(target, key);
            }
            result = produced || result;
        }
    }
    if (argCount > 3 && result) {
        Object.defineProperty(target, key, result);
    }
    return result;
};
/**
 * Metadata helper. An existing `this.__metadata` is reused when present.
 *
 * @param {*} k - Metadata key.
 * @param {*} v - Metadata value.
 * @returns {*} Whatever `Reflect.metadata(k, v)` returns, or `undefined` when the runtime
 *   does not provide a `Reflect.metadata` function.
 */
var __metadata = (this && this.__metadata) || function (k, v) {
    const hasReflectMetadata = typeof Reflect === "object" && typeof Reflect.metadata === "function";
    return hasReflectMetadata ? Reflect.metadata(k, v) : undefined;
};
/**
 * Interop helper for default imports. An existing `this.__importDefault` is reused when present.
 *
 * @param {*} mod - The value returned by `require(...)`.
 * @returns {*} `mod` itself when it is truthy and flagged `__esModule`; otherwise a wrapper
 *   object exposing `mod` under the `default` key.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
| Object.defineProperty(exports, "__esModule", { value: true }); | |
| exports.CurlControl = void 0; | |
| const async_service_1 = require("civkit/async-service"); | |
| const tsyringe_1 = require("tsyringe"); | |
| const node_libcurl_1 = require("node-libcurl"); | |
| const set_cookie_parser_1 = require("set-cookie-parser"); | |
| const logger_1 = require("./logger"); | |
| const civkit_1 = require("civkit"); | |
| const errors_1 = require("./errors"); | |
| const temp_file_1 = require("../services/temp-file"); | |
| const zlib_1 = require("zlib"); | |
| const simple_zstd_1 = require("simple-zstd"); | |
| const lodash_1 = __importDefault(require("lodash")); | |
| const async_context_1 = require("./async-context"); | |
| const blackhole_detector_1 = require("./blackhole-detector"); | |
| let CurlControl = class CurlControl extends async_service_1.AsyncService { | |
| constructor(globalLogger, tempFileManager, asyncLocalContext, blackHoleDetector) { | |
| super(...arguments); | |
| this.globalLogger = globalLogger; | |
| this.tempFileManager = tempFileManager; | |
| this.asyncLocalContext = asyncLocalContext; | |
| this.blackHoleDetector = blackHoleDetector; | |
| this.logger = this.globalLogger.child({ service: this.constructor.name }); | |
| this.chromeVersion = `132`; | |
| this.safariVersion = `537.36`; | |
| this.platform = `Linux`; | |
| this.ua = `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/${this.safariVersion} (KHTML, like Gecko) Chrome/${this.chromeVersion}.0.0.0 Safari/${this.safariVersion}`; | |
| this.lifeCycleTrack = new WeakMap(); | |
| } | |
| async init() { | |
| await this.dependencyReady(); | |
| if (process.platform === 'darwin') { | |
| this.platform = `macOS`; | |
| } | |
| else if (process.platform === 'win32') { | |
| this.platform = `Windows`; | |
| } | |
| this.emit('ready'); | |
| } | |
| impersonateChrome(ua) { | |
| this.chromeVersion = ua.match(/Chrome\/(\d+)/)[1]; | |
| this.safariVersion = ua.match(/AppleWebKit\/([\d\.]+)/)[1]; | |
| this.ua = ua; | |
| } | |
| curlImpersonateHeader(curl, headers) { | |
| let uaPlatform = this.platform; | |
| if (this.ua.includes('Windows')) { | |
| uaPlatform = 'Windows'; | |
| } | |
| else if (this.ua.includes('Android')) { | |
| uaPlatform = 'Android'; | |
| } | |
| else if (this.ua.includes('iPhone') || this.ua.includes('iPad') || this.ua.includes('iPod')) { | |
| uaPlatform = 'iOS'; | |
| } | |
| else if (this.ua.includes('CrOS')) { | |
| uaPlatform = 'Chrome OS'; | |
| } | |
| else if (this.ua.includes('Macintosh')) { | |
| uaPlatform = 'macOS'; | |
| } | |
| const mixinHeaders = { | |
| 'Sec-Ch-Ua': `Not A(Brand";v="8", "Chromium";v="${this.chromeVersion}", "Google Chrome";v="${this.chromeVersion}"`, | |
| 'Sec-Ch-Ua-Mobile': '?0', | |
| 'Sec-Ch-Ua-Platform': `"${uaPlatform}"`, | |
| 'Upgrade-Insecure-Requests': '1', | |
| 'User-Agent': this.ua, | |
| 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', | |
| 'Sec-Fetch-Site': 'none', | |
| 'Sec-Fetch-Mode': 'navigate', | |
| 'Sec-Fetch-User': '?1', | |
| 'Sec-Fetch-Dest': 'document', | |
| 'Accept-Encoding': 'gzip, deflate, br, zstd', | |
| 'Accept-Language': 'en-US,en;q=0.9', | |
| }; | |
| const headersCopy = { ...headers }; | |
| for (const k of Object.keys(mixinHeaders)) { | |
| const lowerK = k.toLowerCase(); | |
| if (headersCopy[lowerK]) { | |
| mixinHeaders[k] = headersCopy[lowerK]; | |
| delete headersCopy[lowerK]; | |
| } | |
| } | |
| Object.assign(mixinHeaders, headersCopy); | |
| curl.setOpt(node_libcurl_1.Curl.option.HTTPHEADER, Object.entries(mixinHeaders).flatMap(([k, v]) => { | |
| if (Array.isArray(v) && v.length) { | |
| return v.map((v2) => `${k}: ${v2}`); | |
| } | |
| return [`${k}: ${v}`]; | |
| })); | |
| return curl; | |
| } | |
| urlToFile1Shot(urlToCrawl, crawlOpts) { | |
| return new Promise((resolve, reject) => { | |
| let contentType = ''; | |
| const curl = new node_libcurl_1.Curl(); | |
| curl.enable(node_libcurl_1.CurlFeature.StreamResponse); | |
| curl.setOpt('URL', urlToCrawl.toString()); | |
| curl.setOpt(node_libcurl_1.Curl.option.FOLLOWLOCATION, false); | |
| curl.setOpt(node_libcurl_1.Curl.option.SSL_VERIFYPEER, false); | |
| curl.setOpt(node_libcurl_1.Curl.option.TIMEOUT_MS, crawlOpts?.timeoutMs || 30_000); | |
| curl.setOpt(node_libcurl_1.Curl.option.CONNECTTIMEOUT_MS, 3_000); | |
| curl.setOpt(node_libcurl_1.Curl.option.LOW_SPEED_LIMIT, 32768); | |
| curl.setOpt(node_libcurl_1.Curl.option.LOW_SPEED_TIME, 5_000); | |
| if (crawlOpts?.method) { | |
| curl.setOpt(node_libcurl_1.Curl.option.CUSTOMREQUEST, crawlOpts.method.toUpperCase()); | |
| } | |
| if (crawlOpts?.body) { | |
| curl.setOpt(node_libcurl_1.Curl.option.POSTFIELDS, crawlOpts.body.toString()); | |
| } | |
| const headersToSet = { ...crawlOpts?.extraHeaders }; | |
| if (crawlOpts?.cookies?.length) { | |
| const cookieKv = {}; | |
| for (const cookie of crawlOpts.cookies) { | |
| cookieKv[cookie.name] = cookie.value; | |
| } | |
| for (const cookie of crawlOpts.cookies) { | |
| if (cookie.maxAge && cookie.maxAge < 0) { | |
| delete cookieKv[cookie.name]; | |
| continue; | |
| } | |
| if (cookie.expires && cookie.expires < new Date()) { | |
| delete cookieKv[cookie.name]; | |
| continue; | |
| } | |
| if (cookie.secure && urlToCrawl.protocol !== 'https:') { | |
| delete cookieKv[cookie.name]; | |
| continue; | |
| } | |
| if (cookie.domain && !urlToCrawl.hostname.endsWith(cookie.domain)) { | |
| delete cookieKv[cookie.name]; | |
| continue; | |
| } | |
| if (cookie.path && !urlToCrawl.pathname.startsWith(cookie.path)) { | |
| delete cookieKv[cookie.name]; | |
| continue; | |
| } | |
| } | |
| const cookieChunks = Object.entries(cookieKv).map(([k, v]) => `${k}=${encodeURIComponent(v)}`); | |
| headersToSet.cookie ??= cookieChunks.join('; '); | |
| } | |
| if (crawlOpts?.referer) { | |
| headersToSet.referer ??= crawlOpts.referer; | |
| } | |
| if (crawlOpts?.overrideUserAgent) { | |
| headersToSet['user-agent'] ??= crawlOpts.overrideUserAgent; | |
| } | |
| this.curlImpersonateHeader(curl, headersToSet); | |
| if (crawlOpts?.proxyUrl) { | |
| const proxyUrlCopy = new URL(crawlOpts.proxyUrl); | |
| curl.setOpt(node_libcurl_1.Curl.option.PROXY, proxyUrlCopy.href); | |
| } | |
| let curlStream; | |
| curl.on('error', (err, errCode) => { | |
| curl.close(); | |
| this.logger.warn(`Curl ${urlToCrawl.origin}: ${err}`, { err, urlToCrawl }); | |
| const err2 = this.digestCurlCode(errCode, err.message) || | |
| new civkit_1.AssertionFailureError(`Failed to access ${urlToCrawl.origin}: ${err.message}`); | |
| err2.cause ??= err; | |
| if (curlStream) { | |
| // For some reason, manually emitting error event is required for curlStream. | |
| curlStream.emit('error', err2); | |
| curlStream.destroy(err2); | |
| } | |
| reject(err2); | |
| }); | |
| curl.setOpt(node_libcurl_1.Curl.option.MAXFILESIZE, 4 * 1024 * 1024 * 1024); // 4GB | |
| let status = -1; | |
| let statusText; | |
| let contentEncoding = ''; | |
| curl.once('end', () => { | |
| if (curlStream) { | |
| curlStream.once('end', () => curl.close()); | |
| return; | |
| } | |
| curl.close(); | |
| }); | |
| curl.on('stream', (stream, statusCode, headers) => { | |
| this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl.origin}`, { statusCode }); | |
| status = statusCode; | |
| curlStream = stream; | |
| for (const headerSet of headers) { | |
| for (const [k, v] of Object.entries(headerSet)) { | |
| if (k.trim().endsWith(':')) { | |
| Reflect.set(headerSet, k.slice(0, k.indexOf(':')), v || ''); | |
| Reflect.deleteProperty(headerSet, k); | |
| continue; | |
| } | |
| if (v === undefined) { | |
| Reflect.set(headerSet, k, ''); | |
| continue; | |
| } | |
| if (k.toLowerCase() === 'content-type' && typeof v === 'string') { | |
| contentType = v.toLowerCase(); | |
| } | |
| } | |
| } | |
| const lastResHeaders = headers[headers.length - 1]; | |
| statusText = lastResHeaders.result?.reason; | |
| for (const [k, v] of Object.entries(lastResHeaders)) { | |
| const kl = k.toLowerCase(); | |
| if (kl === 'content-type') { | |
| contentType = (v || '').toLowerCase(); | |
| } | |
| if (kl === 'content-encoding') { | |
| contentEncoding = (v || '').toLowerCase(); | |
| } | |
| if (contentType && contentEncoding) { | |
| break; | |
| } | |
| } | |
| if ([301, 302, 303, 307, 308].includes(statusCode)) { | |
| if (stream) { | |
| stream.resume(); | |
| } | |
| resolve({ | |
| statusCode: status, | |
| statusText, | |
| data: undefined, | |
| headers: headers, | |
| }); | |
| return; | |
| } | |
| if (!stream) { | |
| resolve({ | |
| statusCode: status, | |
| statusText, | |
| data: undefined, | |
| headers: headers, | |
| }); | |
| return; | |
| } | |
| switch (contentEncoding) { | |
| case 'gzip': { | |
| const decompressed = (0, zlib_1.createGunzip)(); | |
| stream.pipe(decompressed); | |
| stream.once('error', (err) => { | |
| decompressed.destroy(err); | |
| }); | |
| stream = decompressed; | |
| break; | |
| } | |
| case 'deflate': { | |
| const decompressed = (0, zlib_1.createInflate)(); | |
| stream.pipe(decompressed); | |
| stream.once('error', (err) => { | |
| decompressed.destroy(err); | |
| }); | |
| stream = decompressed; | |
| break; | |
| } | |
| case 'br': { | |
| const decompressed = (0, zlib_1.createBrotliDecompress)(); | |
| stream.pipe(decompressed); | |
| stream.once('error', (err) => { | |
| decompressed.destroy(err); | |
| }); | |
| stream = decompressed; | |
| break; | |
| } | |
| case 'zstd': { | |
| const decompressed = (0, simple_zstd_1.ZSTDDecompress)(); | |
| stream.pipe(decompressed); | |
| stream.once('error', (err) => { | |
| decompressed.destroy(err); | |
| }); | |
| stream = decompressed; | |
| break; | |
| } | |
| default: { | |
| break; | |
| } | |
| } | |
| const fpath = this.tempFileManager.alloc(); | |
| const fancyFile = civkit_1.FancyFile.auto(stream, fpath); | |
| this.tempFileManager.bindPathTo(fancyFile, fpath); | |
| resolve({ | |
| statusCode: status, | |
| statusText, | |
| data: fancyFile, | |
| headers: headers, | |
| }); | |
| }); | |
| curl.perform(); | |
| }); | |
| } | |
| async urlToFile(urlToCrawl, crawlOpts) { | |
| let leftRedirection = 6; | |
| let cookieRedirects = 0; | |
| let opts = { ...crawlOpts }; | |
| let nextHopUrl = urlToCrawl; | |
| const fakeHeaderInfos = []; | |
| do { | |
| const r = await this.urlToFile1Shot(nextHopUrl, opts); | |
| if ([301, 302, 303, 307, 308].includes(r.statusCode)) { | |
| fakeHeaderInfos.push(...r.headers); | |
| const headers = r.headers[r.headers.length - 1]; | |
| const location = headers.Location || headers.location; | |
| const setCookieHeader = headers['Set-Cookie'] || headers['set-cookie']; | |
| if (setCookieHeader) { | |
| const cookieAssignments = Array.isArray(setCookieHeader) ? setCookieHeader : [setCookieHeader]; | |
| const parsed = cookieAssignments.filter(Boolean).map((x) => (0, set_cookie_parser_1.parseString)(x, { decodeValues: true })); | |
| if (parsed.length) { | |
| opts.cookies = [...(opts.cookies || []), ...parsed]; | |
| } | |
| if (!location) { | |
| cookieRedirects += 1; | |
| } | |
| } | |
| if (!location && !setCookieHeader) { | |
| // Follow curl behavior | |
| return { | |
| statusCode: r.statusCode, | |
| data: r.data, | |
| headers: fakeHeaderInfos.concat(r.headers), | |
| }; | |
| } | |
| if (!location && cookieRedirects > 1) { | |
| throw new errors_1.ServiceBadApproachError(`Failed to access ${urlToCrawl}: Browser required to solve complex cookie preconditions.`); | |
| } | |
| nextHopUrl = new URL(location || '', nextHopUrl); | |
| leftRedirection -= 1; | |
| continue; | |
| } | |
| return { | |
| statusCode: r.statusCode, | |
| statusText: r.statusText, | |
| data: r.data, | |
| headers: fakeHeaderInfos.concat(r.headers), | |
| }; | |
| } while (leftRedirection > 0); | |
| throw new errors_1.ServiceBadAttemptError(`Failed to access ${urlToCrawl}: Too many redirections.`); | |
| } | |
| async sideLoad(targetUrl, crawlOpts) { | |
| const curlResult = await this.urlToFile(targetUrl, crawlOpts); | |
| this.blackHoleDetector.itWorked(); | |
| let finalURL = targetUrl; | |
| const sideLoadOpts = { | |
| impersonate: {}, | |
| proxyOrigin: {}, | |
| }; | |
| for (const headers of curlResult.headers) { | |
| sideLoadOpts.impersonate[finalURL.href] = { | |
| status: headers.result?.code || -1, | |
| headers: lodash_1.default.omit(headers, 'result'), | |
| contentType: headers['Content-Type'] || headers['content-type'], | |
| }; | |
| if (crawlOpts?.proxyUrl) { | |
| sideLoadOpts.proxyOrigin[finalURL.origin] = crawlOpts.proxyUrl; | |
| } | |
| if (headers.result?.code && [301, 302, 307, 308].includes(headers.result.code)) { | |
| const location = headers.Location || headers.location; | |
| if (location) { | |
| finalURL = new URL(location, finalURL); | |
| } | |
| } | |
| } | |
| const lastHeaders = curlResult.headers[curlResult.headers.length - 1]; | |
| const contentType = (lastHeaders['Content-Type'] || lastHeaders['content-type'])?.toLowerCase() || (await curlResult.data?.mimeType) || 'application/octet-stream'; | |
| const contentDisposition = lastHeaders['Content-Disposition'] || lastHeaders['content-disposition']; | |
| const fileName = contentDisposition?.match(/filename="([^"]+)"/i)?.[1] || finalURL.pathname.split('/').pop(); | |
| if (sideLoadOpts.impersonate[finalURL.href] && (await curlResult.data?.size)) { | |
| sideLoadOpts.impersonate[finalURL.href].body = curlResult.data; | |
| } | |
| // This should keep the file from being garbage collected and deleted until this asyncContext/request is done. | |
| this.lifeCycleTrack.set(this.asyncLocalContext.ctx, curlResult.data); | |
| return { | |
| finalURL, | |
| sideLoadOpts, | |
| chain: curlResult.headers, | |
| status: curlResult.statusCode, | |
| statusText: curlResult.statusText, | |
| headers: lastHeaders, | |
| contentType, | |
| contentDisposition, | |
| fileName, | |
| file: curlResult.data | |
| }; | |
| } | |
| digestCurlCode(code, msg) { | |
| switch (code) { | |
| // 400 User errors | |
| case node_libcurl_1.CurlCode.CURLE_COULDNT_RESOLVE_HOST: { | |
| return new civkit_1.AssertionFailureError(msg); | |
| } | |
| // Maybe retry but dont retry with curl again | |
| case node_libcurl_1.CurlCode.CURLE_OPERATION_TIMEDOUT: | |
| case node_libcurl_1.CurlCode.CURLE_UNSUPPORTED_PROTOCOL: | |
| case node_libcurl_1.CurlCode.CURLE_PEER_FAILED_VERIFICATION: { | |
| return new errors_1.ServiceBadApproachError(msg); | |
| } | |
| // Retryable errors | |
| case node_libcurl_1.CurlCode.CURLE_REMOTE_ACCESS_DENIED: | |
| case node_libcurl_1.CurlCode.CURLE_SEND_ERROR: | |
| case node_libcurl_1.CurlCode.CURLE_RECV_ERROR: | |
| case node_libcurl_1.CurlCode.CURLE_GOT_NOTHING: | |
| case node_libcurl_1.CurlCode.CURLE_SSL_CONNECT_ERROR: | |
| case node_libcurl_1.CurlCode.CURLE_QUIC_CONNECT_ERROR: | |
| case node_libcurl_1.CurlCode.CURLE_COULDNT_RESOLVE_PROXY: | |
| case node_libcurl_1.CurlCode.CURLE_COULDNT_CONNECT: | |
| case node_libcurl_1.CurlCode.CURLE_PARTIAL_FILE: { | |
| return new errors_1.ServiceBadAttemptError(msg); | |
| } | |
| default: { | |
| return undefined; | |
| } | |
| } | |
| } | |
| }; | |
// Publish the undecorated class first, then replace both the export and the local binding
// with the result of applying the decorators: singleton registration plus the constructor
// parameter types recorded under "design:paramtypes".
exports.CurlControl = CurlControl;
const curlControlDecorators = [
    (0, tsyringe_1.singleton)(),
    __metadata("design:paramtypes", [
        logger_1.GlobalLogger,
        temp_file_1.TempFileManager,
        async_context_1.AsyncLocalContext,
        blackhole_detector_1.BlackHoleDetector,
    ]),
];
exports.CurlControl = CurlControl = __decorate(curlControlDecorators, CurlControl);
| //# sourceMappingURL=curl.js.map |