// Mohammad Shahid
// Include pre-built files for HF deployment
// f316cce
"use strict";
// TypeScript decorator runtime helper. Applies `decorators` (last one first) to a
// class (2 arguments), a property (3 arguments) or a member with a descriptor
// (4 arguments; a `null` descriptor is looked up on `target`). Delegates to
// `Reflect.decorate` when a runtime provides it.
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
    var argc = arguments.length;
    var result;
    if (argc < 3) {
        // Class decorator: the value being decorated is the constructor itself.
        result = target;
    }
    else if (desc === null) {
        desc = Object.getOwnPropertyDescriptor(target, key);
        result = desc;
    }
    else {
        result = desc;
    }
    if (typeof Reflect === "object" && typeof Reflect.decorate === "function") {
        result = Reflect.decorate(decorators, target, key, desc);
    }
    else {
        for (var idx = decorators.length - 1; idx >= 0; idx--) {
            var decorator = decorators[idx];
            if (!decorator) {
                continue;
            }
            var produced;
            if (argc < 3) {
                produced = decorator(result);
            }
            else if (argc > 3) {
                produced = decorator(target, key, result);
            }
            else {
                produced = decorator(target, key);
            }
            // A decorator that returns nothing leaves the current value in place.
            result = produced || result;
        }
    }
    if (argc > 3 && result) {
        Object.defineProperty(target, key, result);
    }
    return result;
};
// TypeScript metadata runtime helper: forwards to `Reflect.metadata` when a
// runtime provides it, otherwise yields `undefined`.
var __metadata = (this && this.__metadata) || function (k, v) {
    var canDelegate = typeof Reflect === "object" && typeof Reflect.metadata === "function";
    return canDelegate ? Reflect.metadata(k, v) : undefined;
};
// TypeScript interop helper: ES-module namespaces pass through untouched, anything
// else is wrapped so that it is reachable as `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.CurlControl = void 0;
const async_service_1 = require("civkit/async-service");
const tsyringe_1 = require("tsyringe");
const node_libcurl_1 = require("node-libcurl");
const set_cookie_parser_1 = require("set-cookie-parser");
const logger_1 = require("./logger");
const civkit_1 = require("civkit");
const errors_1 = require("./errors");
const temp_file_1 = require("../services/temp-file");
const zlib_1 = require("zlib");
const simple_zstd_1 = require("simple-zstd");
const lodash_1 = __importDefault(require("lodash"));
const async_context_1 = require("./async-context");
const blackhole_detector_1 = require("./blackhole-detector");
let CurlControl = class CurlControl extends async_service_1.AsyncService {
constructor(globalLogger, tempFileManager, asyncLocalContext, blackHoleDetector) {
super(...arguments);
this.globalLogger = globalLogger;
this.tempFileManager = tempFileManager;
this.asyncLocalContext = asyncLocalContext;
this.blackHoleDetector = blackHoleDetector;
this.logger = this.globalLogger.child({ service: this.constructor.name });
this.chromeVersion = `132`;
this.safariVersion = `537.36`;
this.platform = `Linux`;
this.ua = `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/${this.safariVersion} (KHTML, like Gecko) Chrome/${this.chromeVersion}.0.0.0 Safari/${this.safariVersion}`;
this.lifeCycleTrack = new WeakMap();
}
async init() {
await this.dependencyReady();
if (process.platform === 'darwin') {
this.platform = `macOS`;
}
else if (process.platform === 'win32') {
this.platform = `Windows`;
}
this.emit('ready');
}
impersonateChrome(ua) {
this.chromeVersion = ua.match(/Chrome\/(\d+)/)[1];
this.safariVersion = ua.match(/AppleWebKit\/([\d\.]+)/)[1];
this.ua = ua;
}
curlImpersonateHeader(curl, headers) {
let uaPlatform = this.platform;
if (this.ua.includes('Windows')) {
uaPlatform = 'Windows';
}
else if (this.ua.includes('Android')) {
uaPlatform = 'Android';
}
else if (this.ua.includes('iPhone') || this.ua.includes('iPad') || this.ua.includes('iPod')) {
uaPlatform = 'iOS';
}
else if (this.ua.includes('CrOS')) {
uaPlatform = 'Chrome OS';
}
else if (this.ua.includes('Macintosh')) {
uaPlatform = 'macOS';
}
const mixinHeaders = {
'Sec-Ch-Ua': `Not A(Brand";v="8", "Chromium";v="${this.chromeVersion}", "Google Chrome";v="${this.chromeVersion}"`,
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': `"${uaPlatform}"`,
'Upgrade-Insecure-Requests': '1',
'User-Agent': this.ua,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-User': '?1',
'Sec-Fetch-Dest': 'document',
'Accept-Encoding': 'gzip, deflate, br, zstd',
'Accept-Language': 'en-US,en;q=0.9',
};
const headersCopy = { ...headers };
for (const k of Object.keys(mixinHeaders)) {
const lowerK = k.toLowerCase();
if (headersCopy[lowerK]) {
mixinHeaders[k] = headersCopy[lowerK];
delete headersCopy[lowerK];
}
}
Object.assign(mixinHeaders, headersCopy);
curl.setOpt(node_libcurl_1.Curl.option.HTTPHEADER, Object.entries(mixinHeaders).flatMap(([k, v]) => {
if (Array.isArray(v) && v.length) {
return v.map((v2) => `${k}: ${v2}`);
}
return [`${k}: ${v}`];
}));
return curl;
}
urlToFile1Shot(urlToCrawl, crawlOpts) {
return new Promise((resolve, reject) => {
let contentType = '';
const curl = new node_libcurl_1.Curl();
curl.enable(node_libcurl_1.CurlFeature.StreamResponse);
curl.setOpt('URL', urlToCrawl.toString());
curl.setOpt(node_libcurl_1.Curl.option.FOLLOWLOCATION, false);
curl.setOpt(node_libcurl_1.Curl.option.SSL_VERIFYPEER, false);
curl.setOpt(node_libcurl_1.Curl.option.TIMEOUT_MS, crawlOpts?.timeoutMs || 30_000);
curl.setOpt(node_libcurl_1.Curl.option.CONNECTTIMEOUT_MS, 3_000);
curl.setOpt(node_libcurl_1.Curl.option.LOW_SPEED_LIMIT, 32768);
curl.setOpt(node_libcurl_1.Curl.option.LOW_SPEED_TIME, 5_000);
if (crawlOpts?.method) {
curl.setOpt(node_libcurl_1.Curl.option.CUSTOMREQUEST, crawlOpts.method.toUpperCase());
}
if (crawlOpts?.body) {
curl.setOpt(node_libcurl_1.Curl.option.POSTFIELDS, crawlOpts.body.toString());
}
const headersToSet = { ...crawlOpts?.extraHeaders };
if (crawlOpts?.cookies?.length) {
const cookieKv = {};
for (const cookie of crawlOpts.cookies) {
cookieKv[cookie.name] = cookie.value;
}
for (const cookie of crawlOpts.cookies) {
if (cookie.maxAge && cookie.maxAge < 0) {
delete cookieKv[cookie.name];
continue;
}
if (cookie.expires && cookie.expires < new Date()) {
delete cookieKv[cookie.name];
continue;
}
if (cookie.secure && urlToCrawl.protocol !== 'https:') {
delete cookieKv[cookie.name];
continue;
}
if (cookie.domain && !urlToCrawl.hostname.endsWith(cookie.domain)) {
delete cookieKv[cookie.name];
continue;
}
if (cookie.path && !urlToCrawl.pathname.startsWith(cookie.path)) {
delete cookieKv[cookie.name];
continue;
}
}
const cookieChunks = Object.entries(cookieKv).map(([k, v]) => `${k}=${encodeURIComponent(v)}`);
headersToSet.cookie ??= cookieChunks.join('; ');
}
if (crawlOpts?.referer) {
headersToSet.referer ??= crawlOpts.referer;
}
if (crawlOpts?.overrideUserAgent) {
headersToSet['user-agent'] ??= crawlOpts.overrideUserAgent;
}
this.curlImpersonateHeader(curl, headersToSet);
if (crawlOpts?.proxyUrl) {
const proxyUrlCopy = new URL(crawlOpts.proxyUrl);
curl.setOpt(node_libcurl_1.Curl.option.PROXY, proxyUrlCopy.href);
}
let curlStream;
curl.on('error', (err, errCode) => {
curl.close();
this.logger.warn(`Curl ${urlToCrawl.origin}: ${err}`, { err, urlToCrawl });
const err2 = this.digestCurlCode(errCode, err.message) ||
new civkit_1.AssertionFailureError(`Failed to access ${urlToCrawl.origin}: ${err.message}`);
err2.cause ??= err;
if (curlStream) {
// For some reason, manually emitting error event is required for curlStream.
curlStream.emit('error', err2);
curlStream.destroy(err2);
}
reject(err2);
});
curl.setOpt(node_libcurl_1.Curl.option.MAXFILESIZE, 4 * 1024 * 1024 * 1024); // 4GB
let status = -1;
let statusText;
let contentEncoding = '';
curl.once('end', () => {
if (curlStream) {
curlStream.once('end', () => curl.close());
return;
}
curl.close();
});
curl.on('stream', (stream, statusCode, headers) => {
this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl.origin}`, { statusCode });
status = statusCode;
curlStream = stream;
for (const headerSet of headers) {
for (const [k, v] of Object.entries(headerSet)) {
if (k.trim().endsWith(':')) {
Reflect.set(headerSet, k.slice(0, k.indexOf(':')), v || '');
Reflect.deleteProperty(headerSet, k);
continue;
}
if (v === undefined) {
Reflect.set(headerSet, k, '');
continue;
}
if (k.toLowerCase() === 'content-type' && typeof v === 'string') {
contentType = v.toLowerCase();
}
}
}
const lastResHeaders = headers[headers.length - 1];
statusText = lastResHeaders.result?.reason;
for (const [k, v] of Object.entries(lastResHeaders)) {
const kl = k.toLowerCase();
if (kl === 'content-type') {
contentType = (v || '').toLowerCase();
}
if (kl === 'content-encoding') {
contentEncoding = (v || '').toLowerCase();
}
if (contentType && contentEncoding) {
break;
}
}
if ([301, 302, 303, 307, 308].includes(statusCode)) {
if (stream) {
stream.resume();
}
resolve({
statusCode: status,
statusText,
data: undefined,
headers: headers,
});
return;
}
if (!stream) {
resolve({
statusCode: status,
statusText,
data: undefined,
headers: headers,
});
return;
}
switch (contentEncoding) {
case 'gzip': {
const decompressed = (0, zlib_1.createGunzip)();
stream.pipe(decompressed);
stream.once('error', (err) => {
decompressed.destroy(err);
});
stream = decompressed;
break;
}
case 'deflate': {
const decompressed = (0, zlib_1.createInflate)();
stream.pipe(decompressed);
stream.once('error', (err) => {
decompressed.destroy(err);
});
stream = decompressed;
break;
}
case 'br': {
const decompressed = (0, zlib_1.createBrotliDecompress)();
stream.pipe(decompressed);
stream.once('error', (err) => {
decompressed.destroy(err);
});
stream = decompressed;
break;
}
case 'zstd': {
const decompressed = (0, simple_zstd_1.ZSTDDecompress)();
stream.pipe(decompressed);
stream.once('error', (err) => {
decompressed.destroy(err);
});
stream = decompressed;
break;
}
default: {
break;
}
}
const fpath = this.tempFileManager.alloc();
const fancyFile = civkit_1.FancyFile.auto(stream, fpath);
this.tempFileManager.bindPathTo(fancyFile, fpath);
resolve({
statusCode: status,
statusText,
data: fancyFile,
headers: headers,
});
});
curl.perform();
});
}
async urlToFile(urlToCrawl, crawlOpts) {
let leftRedirection = 6;
let cookieRedirects = 0;
let opts = { ...crawlOpts };
let nextHopUrl = urlToCrawl;
const fakeHeaderInfos = [];
do {
const r = await this.urlToFile1Shot(nextHopUrl, opts);
if ([301, 302, 303, 307, 308].includes(r.statusCode)) {
fakeHeaderInfos.push(...r.headers);
const headers = r.headers[r.headers.length - 1];
const location = headers.Location || headers.location;
const setCookieHeader = headers['Set-Cookie'] || headers['set-cookie'];
if (setCookieHeader) {
const cookieAssignments = Array.isArray(setCookieHeader) ? setCookieHeader : [setCookieHeader];
const parsed = cookieAssignments.filter(Boolean).map((x) => (0, set_cookie_parser_1.parseString)(x, { decodeValues: true }));
if (parsed.length) {
opts.cookies = [...(opts.cookies || []), ...parsed];
}
if (!location) {
cookieRedirects += 1;
}
}
if (!location && !setCookieHeader) {
// Follow curl behavior
return {
statusCode: r.statusCode,
data: r.data,
headers: fakeHeaderInfos.concat(r.headers),
};
}
if (!location && cookieRedirects > 1) {
throw new errors_1.ServiceBadApproachError(`Failed to access ${urlToCrawl}: Browser required to solve complex cookie preconditions.`);
}
nextHopUrl = new URL(location || '', nextHopUrl);
leftRedirection -= 1;
continue;
}
return {
statusCode: r.statusCode,
statusText: r.statusText,
data: r.data,
headers: fakeHeaderInfos.concat(r.headers),
};
} while (leftRedirection > 0);
throw new errors_1.ServiceBadAttemptError(`Failed to access ${urlToCrawl}: Too many redirections.`);
}
async sideLoad(targetUrl, crawlOpts) {
const curlResult = await this.urlToFile(targetUrl, crawlOpts);
this.blackHoleDetector.itWorked();
let finalURL = targetUrl;
const sideLoadOpts = {
impersonate: {},
proxyOrigin: {},
};
for (const headers of curlResult.headers) {
sideLoadOpts.impersonate[finalURL.href] = {
status: headers.result?.code || -1,
headers: lodash_1.default.omit(headers, 'result'),
contentType: headers['Content-Type'] || headers['content-type'],
};
if (crawlOpts?.proxyUrl) {
sideLoadOpts.proxyOrigin[finalURL.origin] = crawlOpts.proxyUrl;
}
if (headers.result?.code && [301, 302, 307, 308].includes(headers.result.code)) {
const location = headers.Location || headers.location;
if (location) {
finalURL = new URL(location, finalURL);
}
}
}
const lastHeaders = curlResult.headers[curlResult.headers.length - 1];
const contentType = (lastHeaders['Content-Type'] || lastHeaders['content-type'])?.toLowerCase() || (await curlResult.data?.mimeType) || 'application/octet-stream';
const contentDisposition = lastHeaders['Content-Disposition'] || lastHeaders['content-disposition'];
const fileName = contentDisposition?.match(/filename="([^"]+)"/i)?.[1] || finalURL.pathname.split('/').pop();
if (sideLoadOpts.impersonate[finalURL.href] && (await curlResult.data?.size)) {
sideLoadOpts.impersonate[finalURL.href].body = curlResult.data;
}
// This should keep the file from being garbage collected and deleted until this asyncContext/request is done.
this.lifeCycleTrack.set(this.asyncLocalContext.ctx, curlResult.data);
return {
finalURL,
sideLoadOpts,
chain: curlResult.headers,
status: curlResult.statusCode,
statusText: curlResult.statusText,
headers: lastHeaders,
contentType,
contentDisposition,
fileName,
file: curlResult.data
};
}
digestCurlCode(code, msg) {
switch (code) {
// 400 User errors
case node_libcurl_1.CurlCode.CURLE_COULDNT_RESOLVE_HOST: {
return new civkit_1.AssertionFailureError(msg);
}
// Maybe retry but dont retry with curl again
case node_libcurl_1.CurlCode.CURLE_OPERATION_TIMEDOUT:
case node_libcurl_1.CurlCode.CURLE_UNSUPPORTED_PROTOCOL:
case node_libcurl_1.CurlCode.CURLE_PEER_FAILED_VERIFICATION: {
return new errors_1.ServiceBadApproachError(msg);
}
// Retryable errors
case node_libcurl_1.CurlCode.CURLE_REMOTE_ACCESS_DENIED:
case node_libcurl_1.CurlCode.CURLE_SEND_ERROR:
case node_libcurl_1.CurlCode.CURLE_RECV_ERROR:
case node_libcurl_1.CurlCode.CURLE_GOT_NOTHING:
case node_libcurl_1.CurlCode.CURLE_SSL_CONNECT_ERROR:
case node_libcurl_1.CurlCode.CURLE_QUIC_CONNECT_ERROR:
case node_libcurl_1.CurlCode.CURLE_COULDNT_RESOLVE_PROXY:
case node_libcurl_1.CurlCode.CURLE_COULDNT_CONNECT:
case node_libcurl_1.CurlCode.CURLE_PARTIAL_FILE: {
return new errors_1.ServiceBadAttemptError(msg);
}
default: {
return undefined;
}
}
}
};
// Export the class as declared.
exports.CurlControl = CurlControl;
// Apply the class decorators - tsyringe's singleton() plus the constructor
// parameter types recorded under "design:paramtypes" - then rebind both the local
// name and the export to whatever the decorators return.
exports.CurlControl = CurlControl = __decorate([
(0, tsyringe_1.singleton)(),
__metadata("design:paramtypes", [logger_1.GlobalLogger,
temp_file_1.TempFileManager,
async_context_1.AsyncLocalContext,
blackhole_detector_1.BlackHoleDetector])
], CurlControl);
//# sourceMappingURL=curl.js.map