// File: web_reader/build/services/puppeteer.js
// Author: Mohammad Shahid
// Commit message: Include pre-built files for HF deployment
// Commit: f316cce
"use strict";
// TypeScript interop helper: re-exports `source[key]` on `target` (optionally
// under the name `alias`). When the source property is a plain writable or
// configurable data property (or an accessor on a non-ES module) a live
// getter is installed, so later changes to the source stay visible.
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        const exportedName = alias === undefined ? key : alias;
        let descriptor = Object.getOwnPropertyDescriptor(source, key);
        const needsLiveGetter = !descriptor || ("get" in descriptor
            ? !source.__esModule
            : descriptor.writable || descriptor.configurable);
        if (needsLiveGetter) {
            descriptor = { enumerable: true, get: function () { return source[key]; } };
        }
        Object.defineProperty(target, exportedName, descriptor);
    }
    : function (target, source, key, alias) {
        // Legacy engines without Object.create: plain copy, no live binding.
        const exportedName = alias === undefined ? key : alias;
        target[exportedName] = source[key];
    });
// TypeScript interop helper: attaches `value` as the `default` export of a
// synthesized namespace object.
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (namespace, value) {
            Object.defineProperty(namespace, "default", { enumerable: true, value: value });
        };
    }
    // Legacy engines without Object.create: plain assignment.
    return function (namespace, value) {
        namespace["default"] = value;
    };
})();
// TypeScript decorator helper (compiler-generated).
// Applies `decorators` to `target` (class decorators, fewer than 3 arguments)
// or to the member `key` of `target` (3+ arguments) and returns the result.
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
// c: number of arguments actually passed; r: the value being decorated.
// With fewer than 3 arguments r is the target itself; otherwise r is the
// property descriptor (looked up from the target when `desc` is null).
var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;
// Prefer Reflect.decorate when it is available.
if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc);
// Otherwise apply decorators from last to first; a decorator that returns a
// falsy value leaves the current result unchanged.
else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
// When more than 3 arguments were passed and a descriptor resulted, install it
// on the target. The (possibly replaced) result is returned in all cases.
return c > 3 && r && Object.defineProperty(target, key, r), r;
};
// TypeScript interop helper for `import * as ns from "mod"`.
// ES modules are returned untouched; anything else is wrapped in a fresh
// namespace object that re-exports every own property except "default" and
// exposes the original module as `default`.
var __importStar = (this && this.__importStar) || (function () {
    // Picks the key-listing strategy on first use, then replaces itself.
    let listOwnKeys = function (subject) {
        listOwnKeys = Object.getOwnPropertyNames || function (obj) {
            const names = [];
            for (const name in obj) {
                if (Object.prototype.hasOwnProperty.call(obj, name)) {
                    names[names.length] = name;
                }
            }
            return names;
        };
        return listOwnKeys(subject);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        const namespace = {};
        if (mod != null) {
            const keys = listOwnKeys(mod);
            for (let idx = 0; idx < keys.length; idx++) {
                const key = keys[idx];
                if (key !== "default") {
                    __createBinding(namespace, mod, key);
                }
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
// TypeScript decorator-metadata helper: delegates to Reflect.metadata when a
// reflection polyfill provides it, otherwise yields undefined.
var __metadata = (this && this.__metadata) || function (metadataKey, metadataValue) {
    const reflectMetadataAvailable = typeof Reflect === "object" && typeof Reflect.metadata === "function";
    if (!reflectMetadataAvailable) {
        return undefined;
    }
    return Reflect.metadata(metadataKey, metadataValue);
};
// TypeScript interop helper for `import x from "mod"`: ES modules pass
// through, everything else is wrapped so that `.default` is the module itself.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.PuppeteerControl = void 0;
const lodash_1 = __importDefault(require("lodash"));
const net_1 = require("net");
const promises_1 = require("fs/promises");
const fs_1 = __importDefault(require("fs"));
const tsyringe_1 = require("tsyringe");
const puppeteer_1 = __importStar(require("puppeteer"));
const defer_1 = require("civkit/defer");
const civ_rpc_1 = require("civkit/civ-rpc");
const async_service_1 = require("civkit/async-service");
const timeout_1 = require("civkit/timeout");
const errors_1 = require("../shared/lib/errors");
const curl_1 = require("./curl");
const blackhole_detector_1 = require("./blackhole-detector");
const async_context_1 = require("./async-context");
const logger_1 = require("./logger");
const minimal_stealth_1 = require("./minimal-stealth");
const tldExtract = require('tld-extract');
// Source text of Mozilla's Readability.js, read synchronously once at module
// load. It is spliced into SCRIPT_TO_INJECT_INTO_FRAME so the page-side
// giveSnapshot() can call `new Readability(...)`.
const READABILITY_JS = fs_1.default.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
// Browser-side script (kept as a string; it is concatenated into
// SCRIPT_TO_INJECT_INTO_FRAME and evaluated inside pages).
//
// It replaces window.IntersectionObserver with a subclass that remembers, per
// observer, the user callback, the set of currently observed targets and the
// most recent real entry delivered for a target. It then exposes
// window.simulateScroll(), which:
//   1. calls every live observer's callback with a synthetic entry claiming
//      the target is fully intersecting (ratio 1), and
//   2. on a later timer tick, calls the callback again with a copy of the
//      last real entry recorded for that target (time refreshed), if any.
//
// NOTE(review): the wrapped callback only records the LAST entry of each
// delivered batch, so targets reported earlier in the same batch have no
// recorded entry and step 2 is skipped for them - confirm this is intended.
// NOTE(review): the entry passed in step 2 is built with object spread, so it
// is a plain object without the IntersectionObserverEntry prototype.
const SIMULATE_SCROLL = `
(function () {
function createIntersectionObserverEntry(target, isIntersecting, timestamp) {
const targetRect = target.getBoundingClientRect();
const record = {
target,
isIntersecting,
time: timestamp,
// If intersecting, intersectionRect matches boundingClientRect
// If not intersecting, intersectionRect is empty (0x0)
intersectionRect: isIntersecting
? targetRect
: new DOMRectReadOnly(0, 0, 0, 0),
// Current bounding client rect of the target
boundingClientRect: targetRect,
// Intersection ratio is either 0 (not intersecting) or 1 (fully intersecting)
intersectionRatio: isIntersecting ? 1 : 0,
// Root bounds (viewport in our case)
rootBounds: new DOMRectReadOnly(
0,
0,
window.innerWidth,
window.innerHeight
)
};
Object.setPrototypeOf(record, window.IntersectionObserverEntry.prototype);
return record;
}
function cloneIntersectionObserverEntry(entry) {
const record = {
target: entry.target,
isIntersecting: entry.isIntersecting,
time: entry.time,
intersectionRect: entry.intersectionRect,
boundingClientRect: entry.boundingClientRect,
intersectionRatio: entry.intersectionRatio,
rootBounds: entry.rootBounds
};
Object.setPrototypeOf(record, window.IntersectionObserverEntry.prototype);
return record;
}
const orig = window.IntersectionObserver;
const kCallback = Symbol('callback');
const kLastEntryMap = Symbol('lastEntryMap');
const liveObservers = new Map();
class MangledIntersectionObserver extends orig {
constructor(callback, options) {
super((entries, observer) => {
const lastEntryMap = observer[kLastEntryMap];
const lastEntry = entries[entries.length - 1];
lastEntryMap.set(lastEntry.target, lastEntry);
return callback(entries, observer);
}, options);
this[kCallback] = callback;
this[kLastEntryMap] = new WeakMap();
liveObservers.set(this, new Set());
}
disconnect() {
liveObservers.get(this)?.clear();
liveObservers.delete(this);
return super.disconnect();
}
observe(target) {
const observer = liveObservers.get(this);
observer?.add(target);
return super.observe(target);
}
unobserve(target) {
const observer = liveObservers.get(this);
observer?.delete(target);
return super.unobserve(target);
}
}
Object.defineProperty(MangledIntersectionObserver, 'name', { value: 'IntersectionObserver', writable: false });
window.IntersectionObserver = MangledIntersectionObserver;
function simulateScroll() {
for (const [observer, targets] of liveObservers.entries()) {
const t0 = performance.now();
for (const target of targets) {
const entry = createIntersectionObserverEntry(target, true, t0);
observer[kCallback]([entry], observer);
setTimeout(() => {
const t1 = performance.now();
const lastEntry = observer[kLastEntryMap].get(target);
if (!lastEntry) {
return;
}
const entry2 = { ...cloneIntersectionObserverEntry(lastEntry), time: t1 };
observer[kCallback]([entry2], observer);
});
}
}
}
window.simulateScroll = simulateScroll;
})();
`;
// Browser-side script (kept as a string; it is concatenated into
// SCRIPT_TO_INJECT_INTO_FRAME and evaluated inside pages).
//
// On DOMContentLoaded it starts observing childList mutations under
// document.documentElement and arms a 200 ms timer. Every observed mutation
// re-arms the timer; when the timer fires, a 'mutationIdle' CustomEvent is
// dispatched on document. Mutations seen before the first timer is armed are
// ignored because `timeout` is still unset.
const MUTATION_IDLE_WATCH = `
(function () {
let timeout;
const sendMsg = ()=> {
document.dispatchEvent(new CustomEvent('mutationIdle'));
};
const cb = () => {
if (timeout) {
clearTimeout(timeout);
timeout = setTimeout(sendMsg, 200);
}
};
const mutationObserver = new MutationObserver(cb);
document.addEventListener('DOMContentLoaded', () => {
mutationObserver.observe(document.documentElement, {
childList: true,
subtree: true,
});
timeout = setTimeout(sendMsg, 200);
}, { once: true })
})();
`;
// Browser-side bootstrap script, evaluated in every new document (and again
// in child frames when they are snapshotted). It bundles, in order:
//   - Readability.js, the IntersectionObserver scroll simulator and the
//     mutation-idle watcher defined above,
//   - the minimal stealth patch (serialized via Function#toString),
//   - page helpers published on window: briefImgs, giveSnapshot,
//     waitForSelector and getMaxDepthAndElemCountUsingTreeWalker.
//
// giveSnapshot(stopActiveSnapshot, overrideDomAnalysis) returns a plain object
// with title/description/href/html/text, the Readability parse result, image
// summaries, DOM depth/element counts, the last 'mutationIdle' timestamp and,
// when open shadow roots exist, an expanded copy of the DOM as shadowExpanded.
//
// Everything between the backticks is shipped to the browser verbatim, so the
// text is runtime code: do not use backticks or dollar-brace sequences in
// comments inside it.
const SCRIPT_TO_INJECT_INTO_FRAME = `
${READABILITY_JS}
${SIMULATE_SCROLL}
${MUTATION_IDLE_WATCH}
(${minimal_stealth_1.minimalStealth.toString()})();
(function(){
function briefImgs(elem) {
const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]'));
return imageTags.map((x)=> {
let linkPreferredSrc = x.src;
// Lazy-loaded images keep the real URL in data-src while src is either
// missing (empty string) or an inline data: placeholder.
if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
if (typeof x.dataset?.src === 'string' && !x.dataset.src.startsWith('data:')) {
linkPreferredSrc = x.dataset.src;
}
}
return {
src: new URL(linkPreferredSrc, document.baseURI).toString(),
loaded: x.complete,
width: x.width,
height: x.height,
naturalWidth: x.naturalWidth,
naturalHeight: x.naturalHeight,
alt: x.alt || x.title,
};
});
}
function getMaxDepthAndElemCountUsingTreeWalker(root=document.documentElement) {
let maxDepth = 0;
let currentDepth = 0;
let elementCount = 0;
const treeWalker = document.createTreeWalker(
root,
NodeFilter.SHOW_ELEMENT,
(node) => {
const nodeName = node.nodeName?.toLowerCase();
return (nodeName === 'svg') ? NodeFilter.FILTER_REJECT : NodeFilter.FILTER_ACCEPT;
},
false
);
while (true) {
maxDepth = Math.max(maxDepth, currentDepth);
elementCount++; // Increment the count for the current node
if (treeWalker.firstChild()) {
currentDepth++;
} else {
while (!treeWalker.nextSibling() && currentDepth > 0) {
treeWalker.parentNode();
currentDepth--;
}
if (currentDepth <= 0) {
break;
}
}
}
return {
maxDepth: maxDepth + 1,
elementCount: elementCount
};
}
function cloneAndExpandShadowRoots(rootElement = document.documentElement) {
// Create a shallow clone of the root element
const clone = rootElement.cloneNode(false);
// Function to process an element and its shadow root
function processShadowRoot(original, cloned) {
if (original.shadowRoot && original.shadowRoot.mode === 'open') {
const shadowContent = document.createDocumentFragment();
// Clone shadow root content normally
original.shadowRoot.childNodes.forEach(childNode => {
const clonedNode = childNode.cloneNode(true);
shadowContent.appendChild(clonedNode);
});
// Handle slots
const slots = shadowContent.querySelectorAll('slot');
slots.forEach(slot => {
const slotName = slot.getAttribute('name') || '';
const assignedElements = original.querySelectorAll(
slotName ? \`[slot="\${slotName}"]\` : ':not([slot])'
);
if (assignedElements.length > 0) {
const slotContent = document.createDocumentFragment();
assignedElements.forEach(el => {
const clonedEl = el.cloneNode(true);
slotContent.appendChild(clonedEl);
});
slot.parentNode.replaceChild(slotContent, slot);
} else if (!slotName) {
// Keep default slot content
// No need to do anything as it's already cloned
}
});
cloned.appendChild(shadowContent);
}
}
// Use a TreeWalker on the original root to clone the entire structure
const treeWalker = document.createTreeWalker(
rootElement,
NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT
);
const elementMap = new Map([[rootElement, clone]]);
let currentNode;
while (currentNode = treeWalker.nextNode()) {
const parentClone = elementMap.get(currentNode.parentNode);
const clonedNode = currentNode.cloneNode(false);
parentClone.appendChild(clonedNode);
if (currentNode.nodeType === Node.ELEMENT_NODE) {
elementMap.set(currentNode, clonedNode);
processShadowRoot(currentNode, clonedNode);
}
}
return clone;
}
function shadowDomPresent(rootElement = document.documentElement) {
const elems = rootElement.querySelectorAll('*');
for (const x of elems) {
if (x.shadowRoot && x.shadowRoot.mode === 'open') {
return true;
}
}
return false;
}
let lastMutationIdle = 0;
let initialAnalytics;
document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
function giveSnapshot(stopActiveSnapshot, overrideDomAnalysis) {
if (stopActiveSnapshot) {
window.haltSnapshot = true;
}
let parsed;
try {
parsed = new Readability(document.cloneNode(true)).parse();
} catch (err) {
void 0;
}
const domAnalysis = overrideDomAnalysis || getMaxDepthAndElemCountUsingTreeWalker(document.documentElement);
// The first analysis seen becomes the baseline for detecting JS-driven DOM growth.
initialAnalytics ??= domAnalysis;
const thisElemCount = domAnalysis.elementCount;
const initialElemCount = initialAnalytics.elementCount;
const r = {
title: document.title,
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
href: document.location.href,
html: document.documentElement?.outerHTML,
htmlSignificantlyModifiedByJs: Boolean(Math.abs(thisElemCount - initialElemCount) / (initialElemCount + Number.EPSILON) > 0.05),
text: document.body?.innerText,
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
parsed: parsed,
imgs: [],
maxElemDepth: domAnalysis.maxDepth,
elemCount: domAnalysis.elementCount,
lastMutationIdle,
};
if (document.baseURI !== r.href) {
r.rebase = document.baseURI;
}
r.imgs = briefImgs();
return r;
}
function waitForSelector(selectorText) {
return new Promise((resolve) => {
const existing = document.querySelector(selectorText);
if (existing) {
resolve(existing);
return;
}
const observer = new MutationObserver(() => {
const elem = document.querySelector(selectorText);
if (elem) {
resolve(elem);
observer.disconnect();
}
});
observer.observe(document.documentElement, {
childList: true,
subtree: true
});
});
}
window.getMaxDepthAndElemCountUsingTreeWalker = getMaxDepthAndElemCountUsingTreeWalker;
window.waitForSelector = waitForSelector;
window.giveSnapshot = giveSnapshot;
window.briefImgs = briefImgs;
})();
`;
// Puppeteer resource types treated as "content" traffic; a finished request
// of one of these types updates PageReqCtrlKit.lastContentResourceLoadedAt.
const documentResourceTypes = new Set([
    'document',
    'script',
    'xhr',
    'fetch',
    'prefetch',
    'eventsource',
    'websocket',
    'preflight',
]);
// Puppeteer resource types treated as "media" traffic; a finished request of
// one of these types updates PageReqCtrlKit.lastMediaResourceLoadedAt.
const mediaResourceTypes = new Set([
    'stylesheet',
    'image',
    'font',
    'media',
]);
/**
 * Per-page request bookkeeping: caps the number of in-flight requests and
 * records when resources last finished loading.
 */
class PageReqCtrlKit {
    /**
     * @param concurrency Maximum number of tracked requests that may proceed
     *   without waiting. Must be a number >= 1.
     * @throws AssertionFailureError when `concurrency` is NaN or below 1.
     */
    constructor(concurrency) {
        this.concurrency = concurrency;
        this.reqSet = new Set();
        this.blockers = [];
        this.lastResourceLoadedAt = 0;
        this.lastContentResourceLoadedAt = 0;
        this.lastMediaResourceLoadedAt = 0;
        const concurrencyIsUsable = !isNaN(concurrency) && !(concurrency < 1);
        if (!concurrencyIsUsable) {
            throw new civ_rpc_1.AssertionFailureError(`Invalid concurrency: ${concurrency}`);
        }
    }
    /**
     * Registers a new request. Resolves immediately while the tracked set is
     * within the limit; otherwise returns a promise that resolves when a later
     * onFinishRequest() call releases the oldest waiter.
     */
    onNewRequest(req) {
        this.reqSet.add(req);
        const overLimit = this.reqSet.size > this.concurrency;
        if (overLimit) {
            const gate = (0, defer_1.Defer)();
            this.blockers.push(gate);
            return gate.promise;
        }
        return Promise.resolve();
    }
    /**
     * Marks a request as finished, releases the oldest waiter (if any) and
     * refreshes the "last loaded" timestamps according to the resource type.
     */
    onFinishRequest(req) {
        this.reqSet.delete(req);
        const nextWaiter = this.blockers.shift();
        nextWaiter?.resolve();
        const finishedAt = Date.now();
        this.lastResourceLoadedAt = finishedAt;
        // Beware req being undefined
        // https://pptr.dev/api/puppeteer.pageevent#:~:text=For%20certain%20requests%2C%20might%20contain%20undefined.
        const resourceType = req?.resourceType();
        if (!resourceType) {
            return;
        }
        const isContent = documentResourceTypes.has(resourceType);
        if (isContent) {
            this.lastContentResourceLoadedAt = finishedAt;
        }
        const isMedia = mediaResourceTypes.has(resourceType);
        if (isMedia) {
            this.lastMediaResourceLoadedAt = finishedAt;
        }
    }
}
let PuppeteerControl = class PuppeteerControl extends async_service_1.AsyncService {
constructor(globalLogger, asyncLocalContext, curlControl, blackHoleDetector) {
super(...arguments);
this.globalLogger = globalLogger;
this.asyncLocalContext = asyncLocalContext;
this.curlControl = curlControl;
this.blackHoleDetector = blackHoleDetector;
this._sn = 0;
this.logger = this.globalLogger.child({ service: this.constructor.name });
this.__loadedPage = [];
this.finalizerMap = new WeakMap();
this.snMap = new WeakMap();
this.livePages = new Set();
this.pagePhase = new WeakMap();
this.lastPageCratedAt = 0;
this.ua = '';
this.effectiveUA = '';
this.concurrentRequestsPerPage = 32;
this.pageReqCtrl = new WeakMap();
this.lastReqSentAt = 0;
this.circuitBreakerHosts = new Set();
this.lifeCycleTrack = new WeakMap();
this.setMaxListeners(Infinity);
let crippledTimes = 0;
this.on('crippled', () => {
crippledTimes += 1;
this.__loadedPage.length = 0;
this.livePages.clear();
if (crippledTimes > 5) {
process.nextTick(() => {
this.emit('error', new Error('Browser crashed too many times, quitting...'));
// process.exit(1);
});
}
});
}
async init() {
await this.dependencyReady();
if (process.env.NODE_ENV?.includes('dry-run')) {
this.emit('ready');
return;
}
if (this.browser) {
if (this.browser.connected) {
await this.browser.close();
}
else {
this.browser.process()?.kill('SIGKILL');
}
}
this.browser = await puppeteer_1.default.launch({
timeout: 10_000,
headless: !Boolean(process.env.DEBUG_BROWSER),
executablePath: process.env.OVERRIDE_CHROME_EXECUTABLE_PATH,
args: [
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled'
]
}).catch((err) => {
this.logger.error(`Unknown firebase issue, just die fast.`, { err });
process.nextTick(() => {
this.emit('error', err);
// process.exit(1);
});
return Promise.reject(err);
});
this.browser.once('disconnected', () => {
this.logger.warn(`Browser disconnected`);
if (this.browser) {
this.emit('crippled');
}
process.nextTick(() => this.serviceReady());
});
this.ua = await this.browser.userAgent();
this.logger.info(`Browser launched: ${this.browser.process()?.pid}, ${this.ua}`);
this.effectiveUA = this.ua.replace(/Headless/i, '').replace('Mozilla/5.0 (X11; Linux x86_64)', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)');
this.curlControl.impersonateChrome(this.effectiveUA);
await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r));
this.emit('ready');
}
getRpsControlKit(page) {
let kit = this.pageReqCtrl.get(page);
if (!kit) {
kit = new PageReqCtrlKit(this.concurrentRequestsPerPage);
this.pageReqCtrl.set(page, kit);
}
return kit;
}
/**
 * Creates a fully prepared page inside its own browser context.
 *
 * Preparation covers: user agent, CSP bypass, a 1024x1024 viewport, the
 * `reportSnapshot` / `setViewport` functions exposed to the page, the injected
 * bootstrap script, request interception with throttling and abuse detection,
 * and the page-side snapshot reporter.
 *
 * @param bewareDeadLock When truthy, serviceReady() is NOT awaited first
 *   (init() passes 'beware_deadlock' because it runs before the service is ready).
 * @returns The prepared page, registered in snMap / livePages with phase 'idle'.
 * @throws ServiceNodeResourceDrainError when the context or page cannot be
 *   created; the browser process is SIGKILLed in that case.
 */
async newPage(bewareDeadLock = false) {
if (!bewareDeadLock) {
await this.serviceReady();
}
// Serial number used in logs and abuse events for this page.
const sn = this._sn++;
let page;
try {
const dedicatedContext = await this.browser.createBrowserContext();
page = await dedicatedContext.newPage();
}
catch (err) {
this.logger.warn(`Failed to create page ${sn}`, { err });
this.browser.process()?.kill('SIGKILL');
throw new errors_1.ServiceNodeResourceDrainError(`This specific worker node failed to open a new page, try again.`);
}
// All preparation calls are issued together and awaited as a batch below.
const preparations = [];
preparations.push(page.setUserAgent(this.effectiveUA));
// preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
// preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
preparations.push(page.setBypassCSP(true));
preparations.push(page.setViewport({ width: 1024, height: 1024 }));
// Page-side reportSnapshot() surfaces as a 'snapshot' event on the page
// object; snapshots of about:blank are dropped.
preparations.push(page.exposeFunction('reportSnapshot', (snapshot) => {
if (snapshot.href === 'about:blank') {
return;
}
page.emit('snapshot', snapshot);
}));
// Lets page scripts resize the viewport; failures are ignored.
preparations.push(page.exposeFunction('setViewport', (viewport) => {
page.setViewport(viewport).catch(() => undefined);
}));
preparations.push(page.evaluateOnNewDocument(SCRIPT_TO_INJECT_INTO_FRAME));
preparations.push(page.setRequestInterception(true));
await Promise.all(preparations);
await page.goto('about:blank', { waitUntil: 'domcontentloaded' });
// Per-page counters used by the abuse heuristics below.
const domainSet = new Set();
let reqCounter = 0;
let t0;
// Once set, every further request from this page is aborted.
let halt = false;
page.on('request', async (req) => {
reqCounter++;
if (halt) {
return req.abort('blockedbyclient', 1000);
}
const requestUrl = req.url();
// Only http(s), chrome-extension and about:blank requests are let through.
if (!requestUrl.startsWith('http:') && !requestUrl.startsWith('https:') && !requestUrl.startsWith('chrome-extension:') && requestUrl !== 'about:blank') {
return req.abort('blockedbyclient', 1000);
}
// t0 is the time of the first request that passed the scheme filter.
t0 ??= Date.now();
const parsedUrl = new URL(requestUrl);
// Track distinct targets: raw IPs as-is, otherwise the registrable domain
// (falling back to the hostname when tld-extract cannot parse the URL).
if ((0, net_1.isIP)(parsedUrl.hostname)) {
domainSet.add(parsedUrl.hostname);
}
else {
try {
const tldParsed = tldExtract(requestUrl);
domainSet.add(tldParsed.domain);
}
catch (_err) {
domainSet.add(parsedUrl.hostname);
}
}
if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {
page.emit('abuse', { url: requestUrl, page, sn, reason: `Abusive request: ${requestUrl}` });
return req.abort('blockedbyclient', 1000);
}
// NOTE(review): this loopback check is prefix-based, so it also matches
// hostnames such as "127.example.com" and does not cover "[::1]" or
// "0.0.0.0" - confirm whether other layers handle those.
if (parsedUrl.hostname === 'localhost' ||
parsedUrl.hostname.startsWith('127.')) {
page.emit('abuse', { url: requestUrl, page, sn, reason: `Suspicious action: Request to localhost: ${requestUrl}` });
return req.abort('blockedbyclient', 1000);
}
// Whole seconds elapsed since t0, rounded up.
// NOTE(review): dt is 0 when this runs in the same millisecond as t0, which
// makes rps Infinity for that request.
const dt = Math.ceil((Date.now() - t0) / 1000);
const rps = reqCounter / dt;
// console.log(`rps: ${rps}`);
const pagePhase = this.pagePhase.get(page);
// Pages in the 'background' phase get a much tighter budget and are halted
// silently (no 'abuse' event).
if (pagePhase === 'background') {
if (rps > 10 || reqCounter > 1000) {
halt = true;
return req.abort('blockedbyclient', 1000);
}
}
if (reqCounter > 1000) {
if (rps > 60 || reqCounter > 2000) {
page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many requests` });
halt = true;
return req.abort('blockedbyclient', 1000);
}
}
if (domainSet.size > 200) {
page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many domains` });
halt = true;
return req.abort('blockedbyclient', 1000);
}
// Wait for a concurrency slot before letting http(s) requests proceed.
if (requestUrl.startsWith('http')) {
const kit = this.getRpsControlKit(page);
await kit.onNewRequest(req);
}
// Another handler may have resolved the interception while we were waiting.
if (req.isInterceptResolutionHandled()) {
return;
}
;
const continueArgs = req.continueRequestOverrides
? [req.continueRequestOverrides(), 0]
: [];
return req.continue(continueArgs[0], continueArgs[1]);
});
// Any terminal request event releases a concurrency slot and updates the
// "last resource loaded" timestamps.
const reqFinishHandler = (req) => {
const kit = this.getRpsControlKit(page);
kit.onFinishRequest(req);
};
page.on('requestfinished', reqFinishHandler);
page.on('requestfailed', reqFinishHandler);
page.on('requestservedfromcache', reqFinishHandler);
// Page-side reporter: in the top frame, pushes a snapshot to reportSnapshot()
// on load-related and mutationIdle events, skipping reports that come less
// than 1200 ms after the previous one with under 5% element-count change.
// In every frame, simulateScroll() runs once on DOMContentLoaded.
await page.evaluateOnNewDocument(`
(function () {
if (window.self === window.top) {
let lastAnalytics;
let lastReportedAt = 0;
const handlePageLoad = () => {
const now = Date.now();
const dt = now - lastReportedAt;
const previousAnalytics = lastAnalytics;
const thisAnalytics = getMaxDepthAndElemCountUsingTreeWalker();
let dElem = 0;
if (window.haltSnapshot) {
return;
}
const thisElemCount = thisAnalytics.elementCount;
if (previousAnalytics) {
const previousElemCount = previousAnalytics.elementCount;
const delta = Math.abs(thisElemCount - previousElemCount);
dElem = delta /(previousElemCount + Number.EPSILON);
}
if (dt < 1200 && dElem < 0.05) {
return;
}
lastAnalytics = thisAnalytics;
lastReportedAt = now;
const r = giveSnapshot(false, lastAnalytics);
window.reportSnapshot(r);
};
document.addEventListener('readystatechange', ()=> {
if (document.readyState === 'interactive') {
handlePageLoad();
}
});
document.addEventListener('load', handlePageLoad);
window.addEventListener('load', handlePageLoad);
document.addEventListener('DOMContentLoaded', handlePageLoad);
document.addEventListener('mutationIdle', handlePageLoad);
}
document.addEventListener('DOMContentLoaded', ()=> window.simulateScroll(), { once: true });
})();
`);
this.snMap.set(page, sn);
this.logger.debug(`Page ${sn} created.`);
this.lastPageCratedAt = Date.now();
this.livePages.add(page);
this.pagePhase.set(page, 'idle');
return page;
}
async getNextPage() {
let thePage;
if (this.__loadedPage.length) {
thePage = this.__loadedPage.shift();
if (this.__loadedPage.length <= 1) {
process.nextTick(() => {
this.newPage()
.then((r) => this.__loadedPage.push(r))
.catch((err) => {
this.logger.warn(`Failed to load new page ahead of time`, { err });
});
});
}
}
if (!thePage) {
thePage = await this.newPage();
}
const timer = setTimeout(() => {
this.logger.warn(`Page is not allowed to live past 5 minutes, ditching page ${this.snMap.get(thePage)}...`);
this.ditchPage(thePage);
}, 300 * 1000);
this.finalizerMap.set(thePage, timer);
return thePage;
}
async ditchPage(page) {
if (this.finalizerMap.has(page)) {
clearTimeout(this.finalizerMap.get(page));
this.finalizerMap.delete(page);
}
if (page.isClosed()) {
return;
}
const sn = this.snMap.get(page);
this.logger.debug(`Closing page ${sn}`);
await Promise.race([
(async () => {
const ctx = page.browserContext();
try {
await page.close();
}
finally {
await ctx.close();
}
})(),
(0, timeout_1.delay)(5000)
]).catch((err) => {
this.logger.error(`Failed to destroy page ${sn}`, { err });
});
this.livePages.delete(page);
this.pagePhase.delete(page);
}
async *scrap(parsedUrl, options = {}) {
// parsedUrl.search = '';
const url = parsedUrl.toString();
let snapshot;
let screenshot;
let pageshot;
const pdfUrls = [];
let navigationResponse;
const page = await this.getNextPage();
this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx);
this.pagePhase.set(page, 'active');
page.on('response', (resp) => {
this.blackHoleDetector.itWorked();
const req = resp.request();
if (req.frame() === page.mainFrame() && req.isNavigationRequest()) {
navigationResponse = resp;
}
if (!resp.ok()) {
return;
}
const headers = resp.headers();
const url = resp.url();
const contentType = headers['content-type'];
if (contentType?.toLowerCase().includes('application/pdf')) {
pdfUrls.push(url);
}
});
page.on('request', async (req) => {
if (req.isInterceptResolutionHandled()) {
return;
}
;
const reqUrlParsed = new URL(req.url());
if (!reqUrlParsed.protocol.startsWith('http')) {
const overrides = req.continueRequestOverrides();
return req.continue(overrides, 0);
}
const typ = req.resourceType();
if (typ === 'media') {
// Non-cooperative answer to block all media requests.
return req.abort('blockedbyclient');
}
if (!options.proxyResources) {
const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
if (!isDocRequest) {
if (options.extraHeaders) {
const overrides = req.continueRequestOverrides();
const continueArgs = [{
...overrides,
headers: {
...req.headers(),
...overrides?.headers,
...options.extraHeaders,
}
}, 1];
return req.continue(continueArgs[0], continueArgs[1]);
}
const overrides = req.continueRequestOverrides();
return req.continue(overrides, 0);
}
}
const sideload = options.sideLoad;
const impersonate = sideload?.impersonate[reqUrlParsed.href];
if (impersonate) {
let body;
if (impersonate.body) {
body = await (0, promises_1.readFile)(await impersonate.body.filePath);
if (req.isInterceptResolutionHandled()) {
return;
}
}
return req.respond({
status: impersonate.status,
headers: impersonate.headers,
contentType: impersonate.contentType,
body: body ? Uint8Array.from(body) : undefined,
}, 999);
}
const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
const ctx = this.lifeCycleTrack.get(page);
if (proxy && ctx) {
return await this.asyncLocalContext.bridge(ctx, async () => {
try {
const curled = await this.curlControl.sideLoad(reqUrlParsed, {
...options,
method: req.method(),
body: req.postData(),
extraHeaders: {
...req.headers(),
...options.extraHeaders,
},
proxyUrl: proxy
});
if (req.isInterceptResolutionHandled()) {
return;
}
;
if (curled.chain.length === 1) {
if (!curled.file) {
return req.respond({
status: curled.status,
headers: lodash_1.default.omit(curled.headers, 'result'),
contentType: curled.contentType,
}, 3);
}
const body = await (0, promises_1.readFile)(await curled.file.filePath);
if (req.isInterceptResolutionHandled()) {
return;
}
;
return req.respond({
status: curled.status,
headers: lodash_1.default.omit(curled.headers, 'result'),
contentType: curled.contentType,
body: Uint8Array.from(body),
}, 3);
}
options.sideLoad ??= curled.sideLoadOpts;
lodash_1.default.merge(options.sideLoad, curled.sideLoadOpts);
const firstReq = curled.chain[0];
return req.respond({
status: firstReq.result.code,
headers: lodash_1.default.omit(firstReq, 'result'),
}, 3);
}
catch (err) {
this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
}
if (req.isInterceptResolutionHandled()) {
return;
}
;
const overrides = req.continueRequestOverrides();
const continueArgs = [{
...overrides,
headers: {
...req.headers(),
...overrides?.headers,
...options.extraHeaders,
}
}, 1];
return req.continue(continueArgs[0], continueArgs[1]);
});
}
if (req.isInterceptResolutionHandled()) {
return;
}
;
const overrides = req.continueRequestOverrides();
const continueArgs = [{
...overrides,
headers: {
...req.headers(),
...overrides?.headers,
...options.extraHeaders,
}
}, 1];
return req.continue(continueArgs[0], continueArgs[1]);
});
let pageScriptEvaluations = [];
let frameScriptEvaluations = [];
if (options.injectPageScripts?.length) {
page.on('framenavigated', (frame) => {
if (frame !== page.mainFrame()) {
return;
}
pageScriptEvaluations.push(Promise.allSettled(options.injectPageScripts.map((x) => frame.evaluate(x).catch((err) => {
this.logger.warn(`Error in evaluation of page scripts`, { err });
}))));
});
}
if (options.injectFrameScripts?.length) {
page.on('framenavigated', (frame) => {
frameScriptEvaluations.push(Promise.allSettled(options.injectFrameScripts.map((x) => frame.evaluate(x).catch((err) => {
this.logger.warn(`Error in evaluation of frame scripts`, { err });
}))));
});
}
const sn = this.snMap.get(page);
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
if (options.locale) {
// Add headers via request interception to walk around this bug
// https://github.com/puppeteer/puppeteer/issues/10235
// await page.setExtraHTTPHeaders({
// 'Accept-Language': options.locale
// });
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, "language", {
get: function () {
return options.locale;
}
});
Object.defineProperty(navigator, "languages", {
get: function () {
return [options.locale];
}
});
});
}
if (options.cookies) {
const mapped = options.cookies.map((x) => {
const draft = {
name: x.name,
value: encodeURIComponent(x.value),
secure: x.secure,
domain: x.domain,
path: x.path,
expires: x.expires ? Math.floor(x.expires.valueOf() / 1000) : undefined,
sameSite: x.sameSite,
};
if (!draft.expires && x.maxAge) {
draft.expires = Math.floor(Date.now() / 1000) + x.maxAge;
}
if (!draft.domain) {
draft.url = parsedUrl.toString();
}
return draft;
});
try {
await page.setCookie(...mapped);
}
catch (err) {
this.logger.warn(`Page ${sn}: Failed to set cookies`, { err });
throw new civ_rpc_1.ParamValidationError({
path: 'cookies',
message: `Failed to set cookies: ${err?.message}`
});
}
}
if (options.overrideUserAgent) {
await page.setUserAgent(options.overrideUserAgent);
}
if (options.viewport) {
await page.setViewport(options.viewport);
}
let nextSnapshotDeferred = (0, defer_1.Defer)();
const crippleListener = () => nextSnapshotDeferred.reject(new errors_1.ServiceCrashedError({ message: `Browser crashed, try again` }));
this.once('crippled', crippleListener);
nextSnapshotDeferred.promise.finally(() => {
this.off('crippled', crippleListener);
});
let successfullyDone;
const hdl = (s) => {
if (snapshot === s) {
return;
}
snapshot = s;
if (snapshot) {
const kit = this.pageReqCtrl.get(page);
snapshot.lastContentResourceLoaded = kit?.lastContentResourceLoadedAt;
snapshot.lastMediaResourceLoaded = kit?.lastMediaResourceLoadedAt;
}
if (s?.maxElemDepth && s.maxElemDepth > 256) {
return;
}
if (s?.elemCount && s.elemCount > 10_000) {
return;
}
nextSnapshotDeferred.resolve(s);
nextSnapshotDeferred = (0, defer_1.Defer)();
this.once('crippled', crippleListener);
nextSnapshotDeferred.promise.finally(() => {
this.off('crippled', crippleListener);
});
};
page.on('snapshot', hdl);
page.once('abuse', (event) => {
this.emit('abuse', { ...event, url: parsedUrl });
if (snapshot?.href && parsedUrl.href !== snapshot.href) {
this.emit('abuse', { ...event, url: snapshot.href });
}
nextSnapshotDeferred.reject(new errors_1.SecurityCompromiseError(`Abuse detected: ${event.reason}`));
});
const timeout = options.timeoutMs || 30_000;
const goToOptions = {
waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
timeout,
};
if (options.referer) {
goToOptions.referer = options.referer;
}
let waitForPromise;
let finalizationPromise;
const doFinalization = async () => {
if (waitForPromise) {
// SuccessfullyDone is meant for the finish of the page.
// It doesn't matter if you are expecting something and it didn't show up.
await waitForPromise.catch(() => void 0);
}
successfullyDone ??= true;
try {
const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)');
screenshot = (await this.takeScreenShot(page)) || screenshot;
pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots;
}
}
catch (err) {
this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
}
if (!snapshot?.html) {
return;
}
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
this.emit('crawled', {
...snapshot,
status: navigationResponse?.status(),
statusText: navigationResponse?.statusText(),
pdfs: lodash_1.default.uniq(pdfUrls), screenshot, pageshot,
}, { ...options, url: parsedUrl });
};
const delayPromise = (0, timeout_1.delay)(timeout);
const gotoPromise = page.goto(url, goToOptions)
.catch((err) => {
if (err instanceof puppeteer_1.TimeoutError) {
this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err });
return new civ_rpc_1.AssertionFailureError({
message: `Failed to goto ${url}: ${err}`,
cause: err,
});
}
if (err?.message?.startsWith('net::ERR_ABORTED')) {
if (pdfUrls.length) {
// Not throw for pdf mode.
return;
}
}
this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err });
return new civ_rpc_1.AssertionFailureError({
message: `Failed to goto ${url}: ${err}`,
cause: err,
});
}).then(async (stuff) => {
// This check is necessary because without snapshot, the condition of the page is unclear
// Calling evaluate directly may stall the process.
if (!snapshot) {
if (stuff instanceof Error) {
throw stuff;
}
}
await Promise.race([Promise.allSettled([...pageScriptEvaluations, ...frameScriptEvaluations]), delayPromise])
.catch(() => void 0);
return stuff;
});
if (options.waitForSelector) {
const t0 = Date.now();
waitForPromise = nextSnapshotDeferred.promise.then(() => {
const t1 = Date.now();
const elapsed = t1 - t0;
const remaining = timeout - elapsed;
const thisTimeout = remaining > 100 ? remaining : 100;
const p = (Array.isArray(options.waitForSelector) ?
Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
page.waitForSelector(options.waitForSelector, { timeout: thisTimeout }))
.then(() => {
successfullyDone = true;
})
.catch((err) => {
waitForPromise = undefined;
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err });
});
return p;
});
finalizationPromise = Promise.allSettled([waitForPromise, gotoPromise]).then(doFinalization);
}
else {
finalizationPromise = gotoPromise.then(doFinalization);
}
try {
let lastHTML = snapshot?.html;
while (true) {
const ckpt = [nextSnapshotDeferred.promise, waitForPromise ?? gotoPromise];
if (options.minIntervalMs) {
ckpt.push((0, timeout_1.delay)(options.minIntervalMs));
}
let error;
await Promise.race(ckpt).catch((err) => error = err);
if (successfullyDone && !error) {
if (!snapshot && !screenshot) {
throw new civ_rpc_1.AssertionFailureError(`Could not extract any meaningful content from the page`);
}
yield {
...snapshot,
status: navigationResponse?.status(),
statusText: navigationResponse?.statusText(),
pdfs: lodash_1.default.uniq(pdfUrls), screenshot, pageshot
};
break;
}
if (options.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
screenshot = (await this.takeScreenShot(page)) || screenshot;
pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
lastHTML = snapshot.html;
}
if (snapshot || screenshot) {
yield {
...snapshot,
status: navigationResponse?.status(),
statusText: navigationResponse?.statusText(),
pdfs: lodash_1.default.uniq(pdfUrls), screenshot, pageshot,
isIntermediate: true,
};
}
if (error) {
throw error;
}
if (successfullyDone) {
break;
}
}
await finalizationPromise;
yield {
...snapshot,
status: navigationResponse?.status(),
statusText: navigationResponse?.statusText(),
pdfs: lodash_1.default.uniq(pdfUrls), screenshot, pageshot
};
}
finally {
this.pagePhase.set(page, 'background');
Promise.allSettled([gotoPromise, waitForPromise, finalizationPromise]).finally(() => {
page.off('snapshot', hdl);
this.ditchPage(page);
});
nextSnapshotDeferred.resolve();
}
}
async takeScreenShot(page, opts) {
const r = await page.screenshot(opts).catch((err) => {
this.logger.warn(`Failed to take screenshot`, { err });
});
if (r) {
return Buffer.from(r);
}
return undefined;
}
async snapshotChildFrames(page) {
const childFrames = page.mainFrame().childFrames();
const r = await Promise.all(childFrames.map(async (x) => {
const thisUrl = x.url();
if (!thisUrl || thisUrl === 'about:blank') {
return undefined;
}
try {
await x.evaluate(SCRIPT_TO_INJECT_INTO_FRAME);
return await x.evaluate(`giveSnapshot()`);
}
catch (err) {
this.logger.warn(`Failed to snapshot child frame ${thisUrl}`, { err });
return undefined;
}
}));
return r.filter(Boolean);
}
};
// Export the class, then re-export it after applying the tsyringe
// `singleton()` decorator together with the constructor parameter types
// ("design:paramtypes") recorded through __metadata.
exports.PuppeteerControl = PuppeteerControl;
exports.PuppeteerControl = PuppeteerControl = __decorate([
(0, tsyringe_1.singleton)(),
__metadata("design:paramtypes", [logger_1.GlobalLogger,
async_context_1.AsyncLocalContext,
curl_1.CurlControl,
blackhole_detector_1.BlackHoleDetector])
], PuppeteerControl);
// Resolve the instance from the tsyringe container at module load and expose
// it as the default export.
const puppeteerControl = tsyringe_1.container.resolve(PuppeteerControl);
exports.default = puppeteerControl;
//# sourceMappingURL=puppeteer.js.map