// web_reader/src/services/curl.ts
// nomagick's picture
// fix: saas issues
// e6fdd87 unverified
import { AsyncService } from 'civkit/async-service';
import { singleton } from 'tsyringe';
import { Curl, CurlCode, CurlFeature, HeaderInfo } from 'node-libcurl';
import { parseString as parseSetCookieString } from 'set-cookie-parser';
import { ScrappingOptions } from './puppeteer';
import { GlobalLogger } from './logger';
import { AssertionFailureError, FancyFile } from 'civkit';
import { ServiceBadAttemptError, ServiceBadApproachError } from './errors';
import { TempFileManager } from '../services/temp-file';
import { createBrotliDecompress, createInflate, createGunzip } from 'zlib';
import { ZSTDDecompress } from 'simple-zstd';
import _ from 'lodash';
import { Readable } from 'stream';
import { AsyncLocalContext } from './async-context';
import { BlackHoleDetector } from './blackhole-detector';
/**
 * Scrapping options accepted by the curl-based fetcher: everything in
 * {@link ScrappingOptions} plus the HTTP verb and request payload.
 */
export interface CURLScrappingOptions extends ScrappingOptions {
    /** HTTP method to issue; upper-cased before it is handed to curl. */
    method?: string;
    /** Request payload; stringified before it is handed to curl. */
    body?: Buffer | string;
}
/**
 * Fetches URLs through libcurl while sending the request headers of a desktop
 * Chrome browser, follows redirects manually (so that cookies set along the way
 * can be replayed) and spools response bodies into temp-file backed `FancyFile`s.
 *
 * Registered as a tsyringe singleton.
 */
@singleton()
export class CurlControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    /** Chrome major version advertised in `Sec-Ch-Ua` and in the default user agent. */
    chromeVersion: string = `132`;
    /** AppleWebKit/Safari version token used in the default user agent. */
    safariVersion: string = `537.36`;
    /** Fallback `Sec-Ch-Ua-Platform` value, used when the user agent names no known platform. */
    platform: string = `Linux`;
    /** User agent sent when the caller does not override it. */
    ua: string = `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/${this.safariVersion} (KHTML, like Gecko) Chrome/${this.chromeVersion}.0.0.0 Safari/${this.safariVersion}`;

    /** Maps an async-context object to the file most recently downloaded within it (see `sideLoad`). */
    lifeCycleTrack = new WeakMap();

    constructor(
        protected globalLogger: GlobalLogger,
        protected tempFileManager: TempFileManager,
        protected asyncLocalContext: AsyncLocalContext,
        protected blackHoleDetector: BlackHoleDetector,
    ) {
        super(...arguments);
    }

    /**
     * Waits for dependencies, derives the fallback platform name from
     * `process.platform` and signals readiness.
     */
    override async init() {
        await this.dependencyReady();

        if (process.platform === 'darwin') {
            this.platform = `macOS`;
        } else if (process.platform === 'win32') {
            this.platform = `Windows`;
        }

        this.emit('ready');
    }

    /**
     * Adopts `ua` as the default user agent and picks up the Chrome / AppleWebKit
     * versions it advertises.
     *
     * A user agent lacking either token no longer crashes with a `TypeError`;
     * the previously known version is kept for the missing part.
     *
     * @param ua Full user-agent string to impersonate.
     */
    impersonateChrome(ua: string) {
        const chromeVersion = ua.match(/Chrome\/(\d+)/)?.[1];
        const safariVersion = ua.match(/AppleWebKit\/([\d.]+)/)?.[1];

        if (chromeVersion) {
            this.chromeVersion = chromeVersion;
        }
        if (safariVersion) {
            this.safariVersion = safariVersion;
        }
        if (!chromeVersion || !safariVersion) {
            this.logger.warn(`User agent lacks Chrome/AppleWebKit version tokens, keeping previous versions`, { ua });
        }

        this.ua = ua;
    }

    /** Tells whether `code` is one of the redirect statuses this service follows by hand. */
    protected isRedirectStatus(code: number) {
        return [301, 302, 303, 307, 308].includes(code);
    }

    /** Maps a user-agent string to a `Sec-Ch-Ua-Platform` name, falling back to `this.platform`. */
    protected detectUaPlatform(ua: string) {
        if (ua.includes('Windows')) {
            return 'Windows';
        }
        if (ua.includes('Android')) {
            return 'Android';
        }
        if (ua.includes('iPhone') || ua.includes('iPad') || ua.includes('iPod')) {
            return 'iOS';
        }
        if (ua.includes('CrOS')) {
            return 'Chrome OS';
        }
        if (ua.includes('Macintosh')) {
            return 'macOS';
        }

        return this.platform;
    }

    /** Case-insensitive lookup of a response header; returns the raw string or string-array value. */
    protected findHeader(headerInfo: HeaderInfo | undefined, name: string): string | string[] | undefined {
        if (!headerInfo) {
            return undefined;
        }
        const wanted = name.toLowerCase();
        for (const [k, v] of Object.entries(headerInfo)) {
            if (k.toLowerCase() !== wanted) {
                continue;
            }
            if (typeof v === 'string' || Array.isArray(v)) {
                return v;
            }
        }

        return undefined;
    }

    /** Case-insensitive lookup of a response header as a single non-empty string. */
    protected headerText(headerInfo: HeaderInfo | undefined, name: string): string | undefined {
        const found = this.findHeader(headerInfo, name);
        const text = Array.isArray(found) ? found[0] : found;

        return typeof text === 'string' && text ? text : undefined;
    }

    /**
     * Installs Chrome-like request headers on `curl`, letting caller-supplied
     * headers override the defaults.
     *
     * Header names are matched case-insensitively, so `User-Agent` and
     * `user-agent` address the same header and never produce two lines.
     * Entries whose value is `undefined`/`null` are ignored. Array values
     * produce one header line per element.
     *
     * @param curl Handle to configure.
     * @param headers Extra / overriding headers keyed by header name.
     * @returns The same `curl` handle.
     */
    curlImpersonateHeader(curl: Curl, headers?: object) {
        // lower-cased name -> [name as given by the caller, value]
        const overrides = new Map<string, [string, string | string[]]>();
        for (const [name, value] of Object.entries(headers ?? {}) as [string, string | string[] | undefined | null][]) {
            if (value === undefined || value === null) {
                continue;
            }
            overrides.set(name.toLowerCase(), [name, value]);
        }

        // Client hints must describe the user agent that is actually sent, which may be an override.
        const uaOverride = overrides.get('user-agent')?.[1];
        const effectiveUa = typeof uaOverride === 'string' && uaOverride ? uaOverride : this.ua;
        const chromeVersion = effectiveUa.match(/Chrome\/(\d+)/)?.[1] ?? this.chromeVersion;
        const uaPlatform = this.detectUaPlatform(effectiveUa);

        const mixinHeaders: Record<string, string | string[]> = {
            // Each brand is a quoted string; the original value lacked the opening quote.
            'Sec-Ch-Ua': `"Not A(Brand";v="8", "Chromium";v="${chromeVersion}", "Google Chrome";v="${chromeVersion}"`,
            'Sec-Ch-Ua-Mobile': effectiveUa.includes('Mobile') ? '?1' : '?0',
            'Sec-Ch-Ua-Platform': `"${uaPlatform}"`,
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': this.ua,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '?1',
            'Sec-Fetch-Dest': 'document',
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'Accept-Language': 'en-US,en;q=0.9',
        };

        // Overrides of default headers keep the default's position and spelling.
        for (const name of Object.keys(mixinHeaders)) {
            const lowerName = name.toLowerCase();
            const override = overrides.get(lowerName);
            if (override) {
                mixinHeaders[name] = override[1];
                overrides.delete(lowerName);
            }
        }
        // Everything else is appended under the name the caller used.
        for (const [name, value] of overrides.values()) {
            mixinHeaders[name] = value;
        }

        curl.setOpt(Curl.option.HTTPHEADER, Object.entries(mixinHeaders).flatMap(([k, v]) => {
            if (Array.isArray(v) && v.length) {
                return v.map((v2) => `${k}: ${v2}`);
            }

            return [`${k}: ${v}`];
        }));

        return curl;
    }

    /**
     * Builds the `Cookie` header value for a request to `url`.
     *
     * Cookies are processed in order, later entries overriding earlier ones
     * with the same name:
     * - an expired cookie (`maxAge <= 0` or `expires` in the past) removes the name;
     * - a cookie that does not apply to `url` (secure flag on a non-https URL,
     *   domain mismatch, path mismatch) is skipped and leaves earlier entries alone;
     * - domain matching ignores a leading dot and requires an exact host match
     *   or a match on a label boundary;
     * - path matching requires an exact match or a match on a `/` boundary.
     *
     * @returns The `name=value; ...` string, empty when no cookie applies.
     */
    protected buildCookieHeader(url: URL, cookies: NonNullable<CURLScrappingOptions['cookies']>) {
        const jar = new Map<string, string>();
        const now = new Date();
        const host = url.hostname.toLowerCase();
        const requestPath = url.pathname;

        for (const cookie of cookies) {
            const expired = (typeof cookie.maxAge === 'number' && cookie.maxAge <= 0) ||
                Boolean(cookie.expires && cookie.expires < now);
            if (expired) {
                jar.delete(cookie.name);
                continue;
            }
            if (cookie.secure && url.protocol !== 'https:') {
                continue;
            }
            if (cookie.domain) {
                const domain = cookie.domain.replace(/^\./, '').toLowerCase();
                if (host !== domain && !host.endsWith(`.${domain}`)) {
                    continue;
                }
            }
            if (cookie.path) {
                const cookiePath = cookie.path;
                const pathMatches = requestPath === cookiePath || (
                    requestPath.startsWith(cookiePath) &&
                    (cookiePath.endsWith('/') || requestPath.charAt(cookiePath.length) === '/')
                );
                if (!pathMatches) {
                    continue;
                }
            }
            jar.set(cookie.name, cookie.value);
        }

        return Array.from(jar, ([k, v]) => `${k}=${encodeURIComponent(v)}`).join('; ');
    }

    /**
     * Cleans up the header sets handed over by curl, in place: a key that still
     * carries a trailing `:` is re-keyed to the text before the colon (with `''`
     * substituted for a falsy value), and `undefined` values become `''`.
     */
    protected normalizeHeaderInfos(headerInfos: HeaderInfo[]) {
        for (const headerSet of headerInfos) {
            // Object.entries() is a snapshot, so re-keying while iterating is safe.
            for (const [k, v] of Object.entries(headerSet)) {
                if (k.trim().endsWith(':')) {
                    Reflect.set(headerSet, k.slice(0, k.indexOf(':')), v || '');
                    Reflect.deleteProperty(headerSet, k);
                    continue;
                }
                if (v === undefined) {
                    Reflect.set(headerSet, k, '');
                }
            }
        }
    }

    /** Creates the decompression stream for a `Content-Encoding` token, or `undefined` when none is needed/known. */
    protected createDecompressor(contentEncoding: string) {
        switch (contentEncoding) {
            case 'x-gzip':
            case 'gzip': {
                return createGunzip();
            }
            case 'deflate': {
                return createInflate();
            }
            case 'br': {
                return createBrotliDecompress();
            }
            case 'zstd': {
                return ZSTDDecompress();
            }
            default: {
                return undefined;
            }
        }
    }

    /**
     * Performs exactly one HTTP request (redirects are NOT followed) and
     * resolves as soon as the response headers are available.
     *
     * For redirect statuses, or when curl provides no body stream, `data` is
     * `undefined`. Otherwise the (decompressed) body stream is wrapped in a
     * `FancyFile` bound to a freshly allocated temp path.
     *
     * Note that TLS peer verification is disabled for these requests.
     *
     * @param urlToCrawl URL to request.
     * @param crawlOpts Method, body, cookies, headers, proxy and timeout settings.
     * @returns Status, reason phrase, body file and every header set curl reported.
     * @throws Rejects with the error produced by `digestCurlCode`, or an
     *   `AssertionFailureError` when the curl code is not specially mapped.
     */
    urlToFile1Shot(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
        return new Promise<{
            statusCode: number,
            statusText?: string,
            data?: FancyFile,
            headers: HeaderInfo[],
        }>((resolve, reject) => {
            const curl = new Curl();
            curl.enable(CurlFeature.StreamResponse);
            curl.setOpt('URL', urlToCrawl.toString());
            curl.setOpt(Curl.option.FOLLOWLOCATION, false);
            curl.setOpt(Curl.option.SSL_VERIFYPEER, false);
            curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts?.timeoutMs || 30_000);
            curl.setOpt(Curl.option.CONNECTTIMEOUT_MS, 3_000);
            curl.setOpt(Curl.option.LOW_SPEED_LIMIT, 32768);
            // NOTE(review): libcurl documents LOW_SPEED_TIME in seconds, so 5_000 is ~83 minutes and
            // the low-speed abort effectively never triggers before TIMEOUT_MS. Left unchanged because
            // shortening it would also abort responses with a slow first byte - confirm the intent.
            curl.setOpt(Curl.option.LOW_SPEED_TIME, 5_000);
            curl.setOpt(Curl.option.MAXFILESIZE, 4 * 1024 * 1024 * 1024); // 4GB
            if (crawlOpts?.method) {
                curl.setOpt(Curl.option.CUSTOMREQUEST, crawlOpts.method.toUpperCase());
            }
            if (crawlOpts?.body) {
                curl.setOpt(Curl.option.POSTFIELDS, crawlOpts.body.toString());
            }

            const headersToSet = { ...crawlOpts?.extraHeaders };
            // Explicit extra headers win over derived ones regardless of how their name is cased.
            const isHeaderSet = (name: string) => Object.entries(headersToSet)
                .some(([k, v]) => k.toLowerCase() === name && v !== undefined && v !== null);

            if (crawlOpts?.cookies?.length && !isHeaderSet('cookie')) {
                const cookieHeader = this.buildCookieHeader(urlToCrawl, crawlOpts.cookies);
                if (cookieHeader) {
                    headersToSet.cookie = cookieHeader;
                }
            }
            if (crawlOpts?.referer && !isHeaderSet('referer')) {
                headersToSet.referer = crawlOpts.referer;
            }
            if (crawlOpts?.overrideUserAgent && !isHeaderSet('user-agent')) {
                headersToSet['user-agent'] = crawlOpts.overrideUserAgent;
            }

            this.curlImpersonateHeader(curl, headersToSet);

            if (crawlOpts?.proxyUrl) {
                // Round-trip through URL so that a malformed proxy URL fails here.
                const proxyUrlCopy = new URL(crawlOpts.proxyUrl);
                curl.setOpt(Curl.option.PROXY, proxyUrlCopy.href);
            }

            let curlStream: Readable | undefined;

            curl.on('error', (err, errCode) => {
                curl.close();
                this.logger.warn(`Curl ${urlToCrawl.origin}: ${err}`, { err, urlToCrawl });
                const err2 = this.digestCurlCode(errCode, err.message) ||
                    new AssertionFailureError(`Failed to access ${urlToCrawl.origin}: ${err.message}`);
                err2.cause ??= err;
                if (curlStream) {
                    // For some reason, manually emitting error event is required for curlStream.
                    curlStream.emit('error', err2);
                    curlStream.destroy(err2);
                }
                // No-op when the promise already resolved with the (now failing) stream.
                reject(err2);
            });

            curl.once('end', () => {
                if (curlStream) {
                    // Keep the handle open until the body stream has been fully consumed.
                    curlStream.once('end', () => curl.close());

                    return;
                }
                curl.close();
            });

            curl.on('stream', (stream, statusCode, headers) => {
                this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl.origin}`, { statusCode });
                curlStream = stream;

                const headerInfos = headers as HeaderInfo[];
                this.normalizeHeaderInfos(headerInfos);

                const lastResHeaders = headerInfos[headerInfos.length - 1];
                const statusText: string | undefined = lastResHeaders?.result?.reason;
                const contentEncoding = (this.headerText(lastResHeaders, 'content-encoding') || '').trim().toLowerCase();

                if (this.isRedirectStatus(statusCode) || !stream) {
                    if (stream) {
                        // Drain the redirect body so the transfer can finish and the handle gets closed.
                        stream.resume();
                    }
                    resolve({
                        statusCode,
                        statusText,
                        data: undefined,
                        headers: headerInfos,
                    });

                    return;
                }

                let body: Readable = stream;
                const decompressor = this.createDecompressor(contentEncoding);
                if (decompressor) {
                    stream.pipe(decompressor);
                    // pipe() does not forward errors; propagate them by hand.
                    stream.once('error', (err) => {
                        decompressor.destroy(err);
                    });
                    body = decompressor;
                }

                const fpath = this.tempFileManager.alloc();
                const fancyFile = FancyFile.auto(body, fpath);
                this.tempFileManager.bindPathTo(fancyFile, fpath);
                resolve({
                    statusCode,
                    statusText,
                    data: fancyFile,
                    headers: headerInfos,
                });
            });

            curl.perform();
        });
    }

    /**
     * Requests `urlToCrawl`, following up to 6 hops of redirects by hand.
     *
     * On every redirect the `Set-Cookie` headers are parsed and replayed on the
     * next hop. Cookies without a `Domain` attribute are pinned to the host that
     * set them so they are not sent to other hosts later in the chain. A redirect
     * status without `Location` but with `Set-Cookie` is retried once against the
     * same URL; a redirect status with neither is returned as-is.
     *
     * As browsers do, a 303 turns the next request into a body-less GET (unless
     * it already is GET/HEAD), and a 301/302 does the same for POST.
     *
     * @returns Final status, reason phrase, body file and the header sets of all hops in order.
     * @throws ServiceBadApproachError when cookie-only redirects repeat.
     * @throws ServiceBadAttemptError when the redirect budget is exhausted.
     */
    async urlToFile(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
        let leftRedirection = 6;
        let cookieRedirects = 0;
        const opts = { ...crawlOpts };
        let nextHopUrl = urlToCrawl;
        const headerChain: HeaderInfo[] = [];

        while (leftRedirection > 0) {
            const r = await this.urlToFile1Shot(nextHopUrl, opts);
            // Appended exactly once per hop; the original appended the last hop twice on one path.
            headerChain.push(...r.headers);
            const result = {
                statusCode: r.statusCode,
                statusText: r.statusText,
                data: r.data,
                headers: headerChain,
            };

            if (!this.isRedirectStatus(r.statusCode)) {
                return result;
            }

            const headers = r.headers[r.headers.length - 1];
            const location = this.headerText(headers, 'location');
            const setCookieHeader = this.findHeader(headers, 'set-cookie');

            if (setCookieHeader) {
                const hopHost = nextHopUrl.hostname;
                const cookieAssignments = Array.isArray(setCookieHeader) ? setCookieHeader : [setCookieHeader];
                const parsed = cookieAssignments
                    .filter(Boolean)
                    .map((x) => parseSetCookieString(x, { decodeValues: true }))
                    .map((cookie) => cookie.domain ? cookie : { ...cookie, domain: hopHost });
                if (parsed.length) {
                    opts.cookies = [...(opts.cookies || []), ...parsed];
                }
                if (!location) {
                    cookieRedirects += 1;
                }
            }

            if (!location && !setCookieHeader) {
                // Follow curl behavior
                return result;
            }
            if (!location && cookieRedirects > 1) {
                throw new ServiceBadApproachError(`Failed to access ${urlToCrawl}: Browser required to solve complex cookie preconditions.`);
            }

            if (location) {
                const method = (opts.method || (opts.body ? 'POST' : 'GET')).toUpperCase();
                const rewriteToGet = r.statusCode === 303 ?
                    (method !== 'GET' && method !== 'HEAD') :
                    ((r.statusCode === 301 || r.statusCode === 302) && method === 'POST');
                if (rewriteToGet) {
                    opts.method = 'GET';
                    delete opts.body;
                }
            }

            // An absent Location resolves to the current URL, i.e. a retry with the new cookies.
            nextHopUrl = new URL(location || '', nextHopUrl);
            leftRedirection -= 1;
        }

        throw new ServiceBadAttemptError(`Failed to access ${urlToCrawl}: Too many redirections.`);
    }

    /**
     * Downloads `targetUrl` with curl and packages the outcome as side-load
     * options: one impersonation entry per hop keyed by that hop's URL, plus
     * the proxy used per origin.
     *
     * The hop URL advances on every redirect status that `urlToFile` follows
     * (303 included), so the final entry is keyed by the URL that actually
     * produced the body.
     *
     * @returns Final URL, side-load options, header chain, status, content
     *   type / disposition, a file name guess and the downloaded file.
     */
    async sideLoad(targetUrl: URL, crawlOpts?: CURLScrappingOptions) {
        const curlResult = await this.urlToFile(targetUrl, crawlOpts);
        this.blackHoleDetector.itWorked();

        let finalURL = targetUrl;
        const sideLoadOpts: CURLScrappingOptions['sideLoad'] = {
            impersonate: {},
            proxyOrigin: {},
        };
        for (const headers of curlResult.headers) {
            sideLoadOpts.impersonate[finalURL.href] = {
                status: headers.result?.code || -1,
                headers: _.omit(headers, 'result'),
                contentType: this.headerText(headers, 'content-type'),
            };
            if (crawlOpts?.proxyUrl) {
                sideLoadOpts.proxyOrigin[finalURL.origin] = crawlOpts.proxyUrl;
            }
            if (headers.result?.code && this.isRedirectStatus(headers.result.code)) {
                const location = this.headerText(headers, 'location');
                if (location) {
                    finalURL = new URL(location, finalURL);
                }
            }
        }

        const lastHeaders = curlResult.headers[curlResult.headers.length - 1];
        const contentType = this.headerText(lastHeaders, 'content-type')?.toLowerCase() ||
            (await curlResult.data?.mimeType) ||
            'application/octet-stream';
        const contentDisposition = this.headerText(lastHeaders, 'content-disposition');
        // Quoted form first, then the unquoted token form, then the last path segment.
        const fileName = contentDisposition?.match(/filename="([^"]+)"/i)?.[1] ||
            contentDisposition?.match(/filename=([^";\s]+)/i)?.[1] ||
            finalURL.pathname.split('/').pop();

        if (sideLoadOpts.impersonate[finalURL.href] && (await curlResult.data?.size)) {
            sideLoadOpts.impersonate[finalURL.href].body = curlResult.data;
        }

        // This should keep the file from being garbage collected and deleted until this asyncContext/request is done.
        this.lifeCycleTrack.set(this.asyncLocalContext.ctx, curlResult.data);

        return {
            finalURL,
            sideLoadOpts,
            chain: curlResult.headers,
            status: curlResult.statusCode,
            statusText: curlResult.statusText,
            headers: lastHeaders,
            contentType,
            contentDisposition,
            fileName,
            file: curlResult.data
        };
    }

    /**
     * Translates a libcurl error code into one of the service's error classes.
     *
     * @param code libcurl result code.
     * @param msg Message for the created error.
     * @returns The mapped error, or `undefined` for codes without a dedicated mapping.
     */
    digestCurlCode(code: CurlCode, msg: string) {
        switch (code) {
            // 400 User errors
            case CurlCode.CURLE_COULDNT_RESOLVE_HOST: {
                return new AssertionFailureError(msg);
            }

            // Maybe retry but dont retry with curl again
            case CurlCode.CURLE_OPERATION_TIMEDOUT:
            case CurlCode.CURLE_UNSUPPORTED_PROTOCOL:
            case CurlCode.CURLE_PEER_FAILED_VERIFICATION: {
                return new ServiceBadApproachError(msg);
            }

            // Retryable errors
            case CurlCode.CURLE_REMOTE_ACCESS_DENIED:
            case CurlCode.CURLE_SEND_ERROR:
            case CurlCode.CURLE_RECV_ERROR:
            case CurlCode.CURLE_GOT_NOTHING:
            case CurlCode.CURLE_SSL_CONNECT_ERROR:
            case CurlCode.CURLE_QUIC_CONNECT_ERROR:
            case CurlCode.CURLE_COULDNT_RESOLVE_PROXY:
            case CurlCode.CURLE_COULDNT_CONNECT:
            case CurlCode.CURLE_PARTIAL_FILE: {
                return new ServiceBadAttemptError(msg);
            }

            default: {
                return undefined;
            }
        }
    }
}