Spaces:
Build error
Build error
| ; | |
/**
 * TypeScript emit helper: exposes property `sourceKey` of `source` on `target`
 * under `targetKey` (defaults to `sourceKey`).
 *
 * The source's own descriptor is reused only when it is an accessor on an ES
 * module, or a data property that is neither writable nor configurable; in
 * every other case an enumerable getter is installed so the binding stays live.
 * An already-installed helper on `this` takes precedence.
 */
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, sourceKey, targetKey) {
        var exportName = targetKey === undefined ? sourceKey : targetKey;
        var descriptor = Object.getOwnPropertyDescriptor(source, sourceKey);
        var reusable = false;
        if (descriptor) {
            reusable = "get" in descriptor
                ? Boolean(source.__esModule)
                : !(descriptor.writable || descriptor.configurable);
        }
        if (!reusable) {
            descriptor = { enumerable: true, get: function () { return source[sourceKey]; } };
        }
        Object.defineProperty(target, exportName, descriptor);
    }
    : function (target, source, sourceKey, targetKey) {
        // Legacy engines without Object.create: plain snapshot copy.
        target[targetKey === undefined ? sourceKey : targetKey] = source[sourceKey];
    });
/**
 * TypeScript emit helper: attaches `value` as the `default` export of the
 * synthetic namespace object `namespace`. Uses a non-writable, enumerable
 * property when `Object.create` exists, otherwise a plain assignment.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (namespace, value) {
        var descriptor = { enumerable: true, value: value };
        Object.defineProperty(namespace, "default", descriptor);
    }
    : function (namespace, value) {
        namespace["default"] = value;
    });
/**
 * TypeScript emit helper that applies legacy (experimental) decorators.
 *
 * - 2 arguments: class decorators; each receives the current constructor and
 *   may return a replacement.
 * - 3 arguments: property decorators; each is called with (target, key).
 * - 4 arguments: method/accessor decorators; each receives the current
 *   descriptor, and the final descriptor is defined on the target.
 *
 * Decorators run last-to-first. `Reflect.decorate` is used when available.
 */
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
    var argCount = arguments.length;
    var result;
    if (argCount < 3) {
        result = target;
    }
    else {
        if (desc === null) {
            // An explicit null asks us to look the descriptor up ourselves.
            desc = Object.getOwnPropertyDescriptor(target, key);
        }
        result = desc;
    }
    if (typeof Reflect === "object" && typeof Reflect.decorate === "function") {
        result = Reflect.decorate(decorators, target, key, desc);
    }
    else {
        for (var idx = decorators.length - 1; idx >= 0; idx--) {
            var decorator = decorators[idx];
            if (!decorator) {
                continue;
            }
            var produced;
            if (argCount < 3) {
                produced = decorator(result);
            }
            else if (argCount > 3) {
                produced = decorator(target, key, result);
            }
            else {
                produced = decorator(target, key);
            }
            // A falsy return value means "keep what we had".
            result = produced || result;
        }
    }
    if (argCount > 3 && result) {
        Object.defineProperty(target, key, result);
    }
    return result;
};
/**
 * TypeScript emit helper implementing `import * as ns from "mod"` for CommonJS.
 *
 * ES modules (`__esModule` set) are returned untouched. Anything else is
 * wrapped in a fresh namespace object that re-exports every own property
 * except "default", and whose `default` is the original module value.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Resolved lazily on first use, then cached for later calls.
    var listOwnKeys = null;
    function resolveOwnKeys(obj) {
        if (listOwnKeys === null) {
            listOwnKeys = Object.getOwnPropertyNames || function (o) {
                var names = [];
                for (var name in o) {
                    if (Object.prototype.hasOwnProperty.call(o, name)) {
                        names[names.length] = name;
                    }
                }
                return names;
            };
        }
        return listOwnKeys(obj);
    }
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            resolveOwnKeys(mod).forEach(function (name) {
                if (name !== "default") {
                    __createBinding(namespace, mod, name);
                }
            });
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
/**
 * TypeScript emit helper for `emitDecoratorMetadata`: delegates to
 * `Reflect.metadata(key, value)` when that API is present, and yields
 * `undefined` otherwise.
 */
var __metadata = (this && this.__metadata) || function (metadataKey, metadataValue) {
    var reflectAvailable = typeof Reflect === "object" && typeof Reflect.metadata === "function";
    return reflectAvailable ? Reflect.metadata(metadataKey, metadataValue) : undefined;
};
/**
 * TypeScript emit helper for default imports: ES modules pass through as-is,
 * any other value is wrapped as `{ default: value }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
| Object.defineProperty(exports, "__esModule", { value: true }); | |
| exports.PuppeteerControl = void 0; | |
| const lodash_1 = __importDefault(require("lodash")); | |
| const net_1 = require("net"); | |
| const promises_1 = require("fs/promises"); | |
| const fs_1 = __importDefault(require("fs")); | |
| const tsyringe_1 = require("tsyringe"); | |
| const puppeteer_1 = __importStar(require("puppeteer")); | |
| const defer_1 = require("civkit/defer"); | |
| const civ_rpc_1 = require("civkit/civ-rpc"); | |
| const async_service_1 = require("civkit/async-service"); | |
| const timeout_1 = require("civkit/timeout"); | |
| const errors_1 = require("../shared/lib/errors"); | |
| const curl_1 = require("./curl"); | |
| const blackhole_detector_1 = require("./blackhole-detector"); | |
| const async_context_1 = require("./async-context"); | |
| const logger_1 = require("./logger"); | |
| const minimal_stealth_1 = require("./minimal-stealth"); | |
| const tldExtract = require('tld-extract'); | |
| const READABILITY_JS = fs_1.default.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8'); | |
/**
 * Browser-side script (injected as source text) that replaces
 * `window.IntersectionObserver` with a subclass tracking every live observer
 * and its observed targets, and exposes `window.simulateScroll()`.
 *
 * `simulateScroll()` tells each observer that all of its targets are fully
 * intersecting, then, on the next timer tick, replays the last genuine entry
 * the browser delivered for each target so observers see the real state again.
 *
 * Fixes over the previous revision:
 *  - every entry of a delivered batch is remembered per target (previously
 *    only the final entry of the batch was, so other targets were never
 *    restored);
 *  - the replayed entry keeps the IntersectionObserverEntry prototype (object
 *    spread used to strip it);
 *  - observe() after disconnect() re-registers the observer for simulation.
 */
const SIMULATE_SCROLL = `
(function () {
    function createIntersectionObserverEntry(target, isIntersecting, timestamp) {
        const targetRect = target.getBoundingClientRect();
        const record = {
            target,
            isIntersecting,
            time: timestamp,
            // If intersecting, intersectionRect matches boundingClientRect
            // If not intersecting, intersectionRect is empty (0x0)
            intersectionRect: isIntersecting
                ? targetRect
                : new DOMRectReadOnly(0, 0, 0, 0),
            // Current bounding client rect of the target
            boundingClientRect: targetRect,
            // Intersection ratio is either 0 (not intersecting) or 1 (fully intersecting)
            intersectionRatio: isIntersecting ? 1 : 0,
            // Root bounds (viewport in our case)
            rootBounds: new DOMRectReadOnly(
                0,
                0,
                window.innerWidth,
                window.innerHeight
            )
        };
        Object.setPrototypeOf(record, window.IntersectionObserverEntry.prototype);
        return record;
    }
    function cloneIntersectionObserverEntry(entry, time = entry.time) {
        const record = {
            target: entry.target,
            isIntersecting: entry.isIntersecting,
            time,
            intersectionRect: entry.intersectionRect,
            boundingClientRect: entry.boundingClientRect,
            intersectionRatio: entry.intersectionRatio,
            rootBounds: entry.rootBounds
        };
        Object.setPrototypeOf(record, window.IntersectionObserverEntry.prototype);
        return record;
    }
    const orig = window.IntersectionObserver;
    const kCallback = Symbol('callback');
    const kLastEntryMap = Symbol('lastEntryMap');
    const liveObservers = new Map();
    class MangledIntersectionObserver extends orig {
        constructor(callback, options) {
            super((entries, observer) => {
                const lastEntryMap = observer[kLastEntryMap];
                // A batch may cover several targets; later entries win per target.
                for (const entry of entries) {
                    lastEntryMap.set(entry.target, entry);
                }
                return callback(entries, observer);
            }, options);
            this[kCallback] = callback;
            this[kLastEntryMap] = new WeakMap();
            liveObservers.set(this, new Set());
        }
        disconnect() {
            liveObservers.get(this)?.clear();
            liveObservers.delete(this);
            return super.disconnect();
        }
        observe(target) {
            let targets = liveObservers.get(this);
            if (!targets) {
                // Observer is being reused after disconnect().
                targets = new Set();
                liveObservers.set(this, targets);
            }
            targets.add(target);
            return super.observe(target);
        }
        unobserve(target) {
            const observer = liveObservers.get(this);
            observer?.delete(target);
            return super.unobserve(target);
        }
    }
    Object.defineProperty(MangledIntersectionObserver, 'name', { value: 'IntersectionObserver', writable: false });
    window.IntersectionObserver = MangledIntersectionObserver;
    function simulateScroll() {
        for (const [observer, targets] of liveObservers.entries()) {
            const t0 = performance.now();
            for (const target of targets) {
                const entry = createIntersectionObserverEntry(target, true, t0);
                observer[kCallback]([entry], observer);
                setTimeout(() => {
                    const lastEntry = observer[kLastEntryMap].get(target);
                    if (!lastEntry) {
                        return;
                    }
                    const entry2 = cloneIntersectionObserverEntry(lastEntry, performance.now());
                    observer[kCallback]([entry2], observer);
                });
            }
        }
    }
    window.simulateScroll = simulateScroll;
})();
`;
/**
 * Browser-side script (kept as source text) that announces DOM quiescence.
 *
 * On DOMContentLoaded it starts a MutationObserver on document.documentElement
 * (childList + subtree) and arms a 200 ms timer. Every observed mutation batch
 * re-arms that timer; when it expires, a 'mutationIdle' CustomEvent is
 * dispatched on `document`. The `if (timeout)` guard means mutations are only
 * debounced once the first timer has been armed.
 */
| const MUTATION_IDLE_WATCH = ` | |
| (function () { | |
| let timeout; | |
| const sendMsg = ()=> { | |
| document.dispatchEvent(new CustomEvent('mutationIdle')); | |
| }; | |
| const cb = () => { | |
| if (timeout) { | |
| clearTimeout(timeout); | |
| timeout = setTimeout(sendMsg, 200); | |
| } | |
| }; | |
| const mutationObserver = new MutationObserver(cb); | |
| document.addEventListener('DOMContentLoaded', () => { | |
| mutationObserver.observe(document.documentElement, { | |
| childList: true, | |
| subtree: true, | |
| }); | |
| timeout = setTimeout(sendMsg, 200); | |
| }, { once: true }) | |
| })(); | |
| `; | |
/**
 * Source text injected into every new document. It concatenates Readability,
 * the scroll simulator, the mutation-idle watcher and the stealth patch, then
 * installs these helpers on `window`:
 *
 *  - briefImgs(elem?)       -> summary of <img> elements (resolved src, size, alt)
 *  - getMaxDepthAndElemCountUsingTreeWalker(root?) -> { maxDepth, elementCount }
 *  - giveSnapshot(stopActiveSnapshot?, overrideDomAnalysis?) -> page snapshot
 *  - waitForSelector(selectorText) -> Promise resolving to the matching element
 *
 * Fixes over the previous revision:
 *  - briefImgs: an <img data-src> without a src used to be reported with the
 *    page URL as its src (empty string resolved against baseURI); data-src is
 *    now preferred whenever src is empty or a data: URI. An unparsable URL no
 *    longer aborts the whole snapshot; the raw value is kept instead.
 *  - cloneAndExpandShadowRoots: removed an assignment to the undeclared
 *    variable "shadowDomPresents" (implicit global / ReferenceError in strict
 *    mode).
 *  - giveSnapshot: removed a no-op expression statement; the element-count
 *    drift is computed once.
 *  - waitForSelector: resolves with the element it already found instead of
 *    querying a second time, and disconnects the observer first.
 */
const SCRIPT_TO_INJECT_INTO_FRAME = `
${READABILITY_JS}
${SIMULATE_SCROLL}
${MUTATION_IDLE_WATCH}
(${minimal_stealth_1.minimalStealth.toString()})();
(function(){
    function briefImgs(elem) {
        const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]'));
        return imageTags.map((x)=> {
            let linkPreferredSrc = x.src;
            if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
                if (typeof x.dataset?.src === 'string' && !x.dataset.src.startsWith('data:')) {
                    linkPreferredSrc = x.dataset.src;
                }
            }
            let resolvedSrc = linkPreferredSrc;
            try {
                resolvedSrc = new URL(linkPreferredSrc, document.baseURI).toString();
            } catch (err) {
                // Malformed URL: report the raw attribute value rather than failing the snapshot.
            }
            return {
                src: resolvedSrc,
                loaded: x.complete,
                width: x.width,
                height: x.height,
                naturalWidth: x.naturalWidth,
                naturalHeight: x.naturalHeight,
                alt: x.alt || x.title,
            };
        });
    }
    function getMaxDepthAndElemCountUsingTreeWalker(root=document.documentElement) {
        let maxDepth = 0;
        let currentDepth = 0;
        let elementCount = 0;
        const treeWalker = document.createTreeWalker(
            root,
            NodeFilter.SHOW_ELEMENT,
            (node) => {
                const nodeName = node.nodeName?.toLowerCase();
                return (nodeName === 'svg') ? NodeFilter.FILTER_REJECT : NodeFilter.FILTER_ACCEPT;
            },
            false
        );
        while (true) {
            maxDepth = Math.max(maxDepth, currentDepth);
            elementCount++; // Increment the count for the current node
            if (treeWalker.firstChild()) {
                currentDepth++;
            } else {
                while (!treeWalker.nextSibling() && currentDepth > 0) {
                    treeWalker.parentNode();
                    currentDepth--;
                }
                if (currentDepth <= 0) {
                    break;
                }
            }
        }
        return {
            maxDepth: maxDepth + 1,
            elementCount: elementCount
        };
    }
    function cloneAndExpandShadowRoots(rootElement = document.documentElement) {
        // Create a shallow clone of the root element
        const clone = rootElement.cloneNode(false);
        // Function to process an element and its shadow root
        function processShadowRoot(original, cloned) {
            if (original.shadowRoot && original.shadowRoot.mode === 'open') {
                const shadowContent = document.createDocumentFragment();
                // Clone shadow root content normally
                original.shadowRoot.childNodes.forEach(childNode => {
                    const clonedNode = childNode.cloneNode(true);
                    shadowContent.appendChild(clonedNode);
                });
                // Handle slots
                const slots = shadowContent.querySelectorAll('slot');
                slots.forEach(slot => {
                    const slotName = slot.getAttribute('name') || '';
                    const assignedElements = original.querySelectorAll(
                        slotName ? \`[slot="\${slotName}"]\` : ':not([slot])'
                    );
                    if (assignedElements.length > 0) {
                        const slotContent = document.createDocumentFragment();
                        assignedElements.forEach(el => {
                            const clonedEl = el.cloneNode(true);
                            slotContent.appendChild(clonedEl);
                        });
                        slot.parentNode.replaceChild(slotContent, slot);
                    } else if (!slotName) {
                        // Keep default slot content
                        // No need to do anything as it's already cloned
                    }
                });
                cloned.appendChild(shadowContent);
            }
        }
        // Use a TreeWalker on the original root to clone the entire structure
        const treeWalker = document.createTreeWalker(
            rootElement,
            NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT
        );
        const elementMap = new Map([[rootElement, clone]]);
        let currentNode;
        while (currentNode = treeWalker.nextNode()) {
            const parentClone = elementMap.get(currentNode.parentNode);
            const clonedNode = currentNode.cloneNode(false);
            parentClone.appendChild(clonedNode);
            if (currentNode.nodeType === Node.ELEMENT_NODE) {
                elementMap.set(currentNode, clonedNode);
                processShadowRoot(currentNode, clonedNode);
            }
        }
        return clone;
    }
    function shadowDomPresent(rootElement = document.documentElement) {
        const elems = rootElement.querySelectorAll('*');
        for (const x of elems) {
            if (x.shadowRoot && x.shadowRoot.mode === 'open') {
                return true;
            }
        }
        return false;
    }
    let lastMutationIdle = 0;
    let initialAnalytics;
    document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
    function giveSnapshot(stopActiveSnapshot, overrideDomAnalysis) {
        if (stopActiveSnapshot) {
            window.haltSnapshot = true;
        }
        let parsed;
        try {
            parsed = new Readability(document.cloneNode(true)).parse();
        } catch (err) {
            void 0;
        }
        const domAnalysis = overrideDomAnalysis || getMaxDepthAndElemCountUsingTreeWalker(document.documentElement);
        initialAnalytics ??= domAnalysis;
        const thisElemCount = domAnalysis.elementCount;
        const initialElemCount = initialAnalytics.elementCount;
        const elemCountDrift = Math.abs(thisElemCount - initialElemCount) / (initialElemCount + Number.EPSILON);
        const r = {
            title: document.title,
            description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
            href: document.location.href,
            html: document.documentElement?.outerHTML,
            htmlSignificantlyModifiedByJs: elemCountDrift > 0.05,
            text: document.body?.innerText,
            shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
            parsed: parsed,
            imgs: [],
            maxElemDepth: domAnalysis.maxDepth,
            elemCount: domAnalysis.elementCount,
            lastMutationIdle,
        };
        if (document.baseURI !== r.href) {
            r.rebase = document.baseURI;
        }
        r.imgs = briefImgs();
        return r;
    }
    function waitForSelector(selectorText) {
        return new Promise((resolve) => {
            const existing = document.querySelector(selectorText);
            if (existing) {
                resolve(existing);
                return;
            }
            const observer = new MutationObserver(() => {
                const elem = document.querySelector(selectorText);
                if (elem) {
                    observer.disconnect();
                    resolve(elem);
                }
            });
            observer.observe(document.documentElement, {
                childList: true,
                subtree: true
            });
        });
    }
    window.getMaxDepthAndElemCountUsingTreeWalker = getMaxDepthAndElemCountUsingTreeWalker;
    window.waitForSelector = waitForSelector;
    window.giveSnapshot = giveSnapshot;
    window.briefImgs = briefImgs;
})();
`;
// Resource types whose completion counts as "content" activity for a page.
const documentResourceTypes = new Set([
    'document',
    'script',
    'xhr',
    'fetch',
    'prefetch',
    'eventsource',
    'websocket',
    'preflight',
]);
// Resource types whose completion counts as "media" activity for a page.
const mediaResourceTypes = new Set([
    'stylesheet',
    'image',
    'font',
    'media',
]);
/**
 * Per-page request throttle and activity tracker.
 *
 * At most `concurrency` requests are admitted at once; further requests wait
 * on a FIFO queue of deferreds and are released one at a time as admitted
 * requests finish. The kit also records when the last resource of any kind,
 * the last "content" resource and the last "media" resource finished.
 */
class PageReqCtrlKit {
    /**
     * @param {number} concurrency Maximum number of simultaneously admitted requests (>= 1).
     * @throws {AssertionFailureError} When `concurrency` is NaN or below 1.
     */
    constructor(concurrency) {
        this.concurrency = concurrency;
        this.reqSet = new Set();
        this.blockers = [];
        this.lastResourceLoadedAt = 0;
        this.lastContentResourceLoadedAt = 0;
        this.lastMediaResourceLoadedAt = 0;
        // Global isNaN (coercing) is kept on purpose so `undefined` is still rejected.
        if (isNaN(concurrency) || concurrency < 1) {
            throw new civ_rpc_1.AssertionFailureError(`Invalid concurrency: ${concurrency}`);
        }
    }
    /**
     * Registers a request and returns a promise that resolves once the request
     * may proceed (immediately while under the concurrency limit).
     * @param {object} req The intercepted request.
     * @returns {Promise<void>}
     */
    onNewRequest(req) {
        this.reqSet.add(req);
        if (this.reqSet.size <= this.concurrency) {
            return Promise.resolve();
        }
        const deferred = (0, defer_1.Defer)();
        this.blockers.push(deferred);
        return deferred.promise;
    }
    /**
     * Records the completion of a request and frees its slot.
     *
     * A queued request is released only when `req` actually held a slot.
     * The same request can be reported more than once (e.g. "served from
     * cache" followed by "finished"), and requests that were never admitted
     * through onNewRequest() also reach this handler; releasing a waiter for
     * those would let more than `concurrency` requests run at once.
     * @param {object|undefined} req The finished request; may be undefined.
     */
    onFinishRequest(req) {
        const now = Date.now();
        this.lastResourceLoadedAt = now;
        const wasTracked = this.reqSet.delete(req);
        // Beware req being undefined
        // https://pptr.dev/api/puppeteer.pageevent#:~:text=For%20certain%20requests%2C%20might%20contain%20undefined.
        // An unidentified request cannot be matched against the set, so keep the
        // previous best-effort behaviour and release one waiter to avoid a stall.
        if (wasTracked || req == null) {
            this.blockers.shift()?.resolve();
        }
        const typ = req?.resourceType();
        if (!typ) {
            return;
        }
        if (documentResourceTypes.has(typ)) {
            this.lastContentResourceLoadedAt = now;
        }
        if (mediaResourceTypes.has(typ)) {
            this.lastMediaResourceLoadedAt = now;
        }
    }
}
| let PuppeteerControl = class PuppeteerControl extends async_service_1.AsyncService { | |
| constructor(globalLogger, asyncLocalContext, curlControl, blackHoleDetector) { | |
| super(...arguments); | |
| this.globalLogger = globalLogger; | |
| this.asyncLocalContext = asyncLocalContext; | |
| this.curlControl = curlControl; | |
| this.blackHoleDetector = blackHoleDetector; | |
| this._sn = 0; | |
| this.logger = this.globalLogger.child({ service: this.constructor.name }); | |
| this.__loadedPage = []; | |
| this.finalizerMap = new WeakMap(); | |
| this.snMap = new WeakMap(); | |
| this.livePages = new Set(); | |
| this.pagePhase = new WeakMap(); | |
| this.lastPageCratedAt = 0; | |
| this.ua = ''; | |
| this.effectiveUA = ''; | |
| this.concurrentRequestsPerPage = 32; | |
| this.pageReqCtrl = new WeakMap(); | |
| this.lastReqSentAt = 0; | |
| this.circuitBreakerHosts = new Set(); | |
| this.lifeCycleTrack = new WeakMap(); | |
| this.setMaxListeners(Infinity); | |
| let crippledTimes = 0; | |
| this.on('crippled', () => { | |
| crippledTimes += 1; | |
| this.__loadedPage.length = 0; | |
| this.livePages.clear(); | |
| if (crippledTimes > 5) { | |
| process.nextTick(() => { | |
| this.emit('error', new Error('Browser crashed too many times, quitting...')); | |
| // process.exit(1); | |
| }); | |
| } | |
| }); | |
| } | |
| async init() { | |
| await this.dependencyReady(); | |
| if (process.env.NODE_ENV?.includes('dry-run')) { | |
| this.emit('ready'); | |
| return; | |
| } | |
| if (this.browser) { | |
| if (this.browser.connected) { | |
| await this.browser.close(); | |
| } | |
| else { | |
| this.browser.process()?.kill('SIGKILL'); | |
| } | |
| } | |
| this.browser = await puppeteer_1.default.launch({ | |
| timeout: 10_000, | |
| headless: !Boolean(process.env.DEBUG_BROWSER), | |
| executablePath: process.env.OVERRIDE_CHROME_EXECUTABLE_PATH, | |
| args: [ | |
| '--disable-dev-shm-usage', | |
| '--disable-blink-features=AutomationControlled' | |
| ] | |
| }).catch((err) => { | |
| this.logger.error(`Unknown firebase issue, just die fast.`, { err }); | |
| process.nextTick(() => { | |
| this.emit('error', err); | |
| // process.exit(1); | |
| }); | |
| return Promise.reject(err); | |
| }); | |
| this.browser.once('disconnected', () => { | |
| this.logger.warn(`Browser disconnected`); | |
| if (this.browser) { | |
| this.emit('crippled'); | |
| } | |
| process.nextTick(() => this.serviceReady()); | |
| }); | |
| this.ua = await this.browser.userAgent(); | |
| this.logger.info(`Browser launched: ${this.browser.process()?.pid}, ${this.ua}`); | |
| this.effectiveUA = this.ua.replace(/Headless/i, '').replace('Mozilla/5.0 (X11; Linux x86_64)', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'); | |
| this.curlControl.impersonateChrome(this.effectiveUA); | |
| await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r)); | |
| this.emit('ready'); | |
| } | |
| getRpsControlKit(page) { | |
| let kit = this.pageReqCtrl.get(page); | |
| if (!kit) { | |
| kit = new PageReqCtrlKit(this.concurrentRequestsPerPage); | |
| this.pageReqCtrl.set(page, kit); | |
| } | |
| return kit; | |
| } | |
| async newPage(bewareDeadLock = false) { | |
| if (!bewareDeadLock) { | |
| await this.serviceReady(); | |
| } | |
| const sn = this._sn++; | |
| let page; | |
| try { | |
| const dedicatedContext = await this.browser.createBrowserContext(); | |
| page = await dedicatedContext.newPage(); | |
| } | |
| catch (err) { | |
| this.logger.warn(`Failed to create page ${sn}`, { err }); | |
| this.browser.process()?.kill('SIGKILL'); | |
| throw new errors_1.ServiceNodeResourceDrainError(`This specific worker node failed to open a new page, try again.`); | |
| } | |
| const preparations = []; | |
| preparations.push(page.setUserAgent(this.effectiveUA)); | |
| // preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`)); | |
| // preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`)); | |
| preparations.push(page.setBypassCSP(true)); | |
| preparations.push(page.setViewport({ width: 1024, height: 1024 })); | |
| preparations.push(page.exposeFunction('reportSnapshot', (snapshot) => { | |
| if (snapshot.href === 'about:blank') { | |
| return; | |
| } | |
| page.emit('snapshot', snapshot); | |
| })); | |
| preparations.push(page.exposeFunction('setViewport', (viewport) => { | |
| page.setViewport(viewport).catch(() => undefined); | |
| })); | |
| preparations.push(page.evaluateOnNewDocument(SCRIPT_TO_INJECT_INTO_FRAME)); | |
| preparations.push(page.setRequestInterception(true)); | |
| await Promise.all(preparations); | |
| await page.goto('about:blank', { waitUntil: 'domcontentloaded' }); | |
| const domainSet = new Set(); | |
| let reqCounter = 0; | |
| let t0; | |
| let halt = false; | |
| page.on('request', async (req) => { | |
| reqCounter++; | |
| if (halt) { | |
| return req.abort('blockedbyclient', 1000); | |
| } | |
| const requestUrl = req.url(); | |
| if (!requestUrl.startsWith('http:') && !requestUrl.startsWith('https:') && !requestUrl.startsWith('chrome-extension:') && requestUrl !== 'about:blank') { | |
| return req.abort('blockedbyclient', 1000); | |
| } | |
| t0 ??= Date.now(); | |
| const parsedUrl = new URL(requestUrl); | |
| if ((0, net_1.isIP)(parsedUrl.hostname)) { | |
| domainSet.add(parsedUrl.hostname); | |
| } | |
| else { | |
| try { | |
| const tldParsed = tldExtract(requestUrl); | |
| domainSet.add(tldParsed.domain); | |
| } | |
| catch (_err) { | |
| domainSet.add(parsedUrl.hostname); | |
| } | |
| } | |
| if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) { | |
| page.emit('abuse', { url: requestUrl, page, sn, reason: `Abusive request: ${requestUrl}` }); | |
| return req.abort('blockedbyclient', 1000); | |
| } | |
| if (parsedUrl.hostname === 'localhost' || | |
| parsedUrl.hostname.startsWith('127.')) { | |
| page.emit('abuse', { url: requestUrl, page, sn, reason: `Suspicious action: Request to localhost: ${requestUrl}` }); | |
| return req.abort('blockedbyclient', 1000); | |
| } | |
| const dt = Math.ceil((Date.now() - t0) / 1000); | |
| const rps = reqCounter / dt; | |
| // console.log(`rps: ${rps}`); | |
| const pagePhase = this.pagePhase.get(page); | |
| if (pagePhase === 'background') { | |
| if (rps > 10 || reqCounter > 1000) { | |
| halt = true; | |
| return req.abort('blockedbyclient', 1000); | |
| } | |
| } | |
| if (reqCounter > 1000) { | |
| if (rps > 60 || reqCounter > 2000) { | |
| page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many requests` }); | |
| halt = true; | |
| return req.abort('blockedbyclient', 1000); | |
| } | |
| } | |
| if (domainSet.size > 200) { | |
| page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many domains` }); | |
| halt = true; | |
| return req.abort('blockedbyclient', 1000); | |
| } | |
| if (requestUrl.startsWith('http')) { | |
| const kit = this.getRpsControlKit(page); | |
| await kit.onNewRequest(req); | |
| } | |
| if (req.isInterceptResolutionHandled()) { | |
| return; | |
| } | |
| ; | |
| const continueArgs = req.continueRequestOverrides | |
| ? [req.continueRequestOverrides(), 0] | |
| : []; | |
| return req.continue(continueArgs[0], continueArgs[1]); | |
| }); | |
| const reqFinishHandler = (req) => { | |
| const kit = this.getRpsControlKit(page); | |
| kit.onFinishRequest(req); | |
| }; | |
| page.on('requestfinished', reqFinishHandler); | |
| page.on('requestfailed', reqFinishHandler); | |
| page.on('requestservedfromcache', reqFinishHandler); | |
| await page.evaluateOnNewDocument(` | |
| (function () { | |
| if (window.self === window.top) { | |
| let lastAnalytics; | |
| let lastReportedAt = 0; | |
| const handlePageLoad = () => { | |
| const now = Date.now(); | |
| const dt = now - lastReportedAt; | |
| const previousAnalytics = lastAnalytics; | |
| const thisAnalytics = getMaxDepthAndElemCountUsingTreeWalker(); | |
| let dElem = 0; | |
| if (window.haltSnapshot) { | |
| return; | |
| } | |
| const thisElemCount = thisAnalytics.elementCount; | |
| if (previousAnalytics) { | |
| const previousElemCount = previousAnalytics.elementCount; | |
| const delta = Math.abs(thisElemCount - previousElemCount); | |
| dElem = delta /(previousElemCount + Number.EPSILON); | |
| } | |
| if (dt < 1200 && dElem < 0.05) { | |
| return; | |
| } | |
| lastAnalytics = thisAnalytics; | |
| lastReportedAt = now; | |
| const r = giveSnapshot(false, lastAnalytics); | |
| window.reportSnapshot(r); | |
| }; | |
| document.addEventListener('readystatechange', ()=> { | |
| if (document.readyState === 'interactive') { | |
| handlePageLoad(); | |
| } | |
| }); | |
| document.addEventListener('load', handlePageLoad); | |
| window.addEventListener('load', handlePageLoad); | |
| document.addEventListener('DOMContentLoaded', handlePageLoad); | |
| document.addEventListener('mutationIdle', handlePageLoad); | |
| } | |
| document.addEventListener('DOMContentLoaded', ()=> window.simulateScroll(), { once: true }); | |
| })(); | |
| `); | |
| this.snMap.set(page, sn); | |
| this.logger.debug(`Page ${sn} created.`); | |
| this.lastPageCratedAt = Date.now(); | |
| this.livePages.add(page); | |
| this.pagePhase.set(page, 'idle'); | |
| return page; | |
| } | |
| async getNextPage() { | |
| let thePage; | |
| if (this.__loadedPage.length) { | |
| thePage = this.__loadedPage.shift(); | |
| if (this.__loadedPage.length <= 1) { | |
| process.nextTick(() => { | |
| this.newPage() | |
| .then((r) => this.__loadedPage.push(r)) | |
| .catch((err) => { | |
| this.logger.warn(`Failed to load new page ahead of time`, { err }); | |
| }); | |
| }); | |
| } | |
| } | |
| if (!thePage) { | |
| thePage = await this.newPage(); | |
| } | |
| const timer = setTimeout(() => { | |
| this.logger.warn(`Page is not allowed to live past 5 minutes, ditching page ${this.snMap.get(thePage)}...`); | |
| this.ditchPage(thePage); | |
| }, 300 * 1000); | |
| this.finalizerMap.set(thePage, timer); | |
| return thePage; | |
| } | |
| async ditchPage(page) { | |
| if (this.finalizerMap.has(page)) { | |
| clearTimeout(this.finalizerMap.get(page)); | |
| this.finalizerMap.delete(page); | |
| } | |
| if (page.isClosed()) { | |
| return; | |
| } | |
| const sn = this.snMap.get(page); | |
| this.logger.debug(`Closing page ${sn}`); | |
| await Promise.race([ | |
| (async () => { | |
| const ctx = page.browserContext(); | |
| try { | |
| await page.close(); | |
| } | |
| finally { | |
| await ctx.close(); | |
| } | |
| })(), | |
| (0, timeout_1.delay)(5000) | |
| ]).catch((err) => { | |
| this.logger.error(`Failed to destroy page ${sn}`, { err }); | |
| }); | |
| this.livePages.delete(page); | |
| this.pagePhase.delete(page); | |
| } | |
| async *scrap(parsedUrl, options = {}) { | |
| // parsedUrl.search = ''; | |
| const url = parsedUrl.toString(); | |
| let snapshot; | |
| let screenshot; | |
| let pageshot; | |
| const pdfUrls = []; | |
| let navigationResponse; | |
| const page = await this.getNextPage(); | |
| this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx); | |
| this.pagePhase.set(page, 'active'); | |
| page.on('response', (resp) => { | |
| this.blackHoleDetector.itWorked(); | |
| const req = resp.request(); | |
| if (req.frame() === page.mainFrame() && req.isNavigationRequest()) { | |
| navigationResponse = resp; | |
| } | |
| if (!resp.ok()) { | |
| return; | |
| } | |
| const headers = resp.headers(); | |
| const url = resp.url(); | |
| const contentType = headers['content-type']; | |
| if (contentType?.toLowerCase().includes('application/pdf')) { | |
| pdfUrls.push(url); | |
| } | |
| }); | |
| page.on('request', async (req) => { | |
| if (req.isInterceptResolutionHandled()) { | |
| return; | |
| } | |
| ; | |
| const reqUrlParsed = new URL(req.url()); | |
| if (!reqUrlParsed.protocol.startsWith('http')) { | |
| const overrides = req.continueRequestOverrides(); | |
| return req.continue(overrides, 0); | |
| } | |
| const typ = req.resourceType(); | |
| if (typ === 'media') { | |
| // Non-cooperative answer to block all media requests. | |
| return req.abort('blockedbyclient'); | |
| } | |
| if (!options.proxyResources) { | |
| const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ); | |
| if (!isDocRequest) { | |
| if (options.extraHeaders) { | |
| const overrides = req.continueRequestOverrides(); | |
| const continueArgs = [{ | |
| ...overrides, | |
| headers: { | |
| ...req.headers(), | |
| ...overrides?.headers, | |
| ...options.extraHeaders, | |
| } | |
| }, 1]; | |
| return req.continue(continueArgs[0], continueArgs[1]); | |
| } | |
| const overrides = req.continueRequestOverrides(); | |
| return req.continue(overrides, 0); | |
| } | |
| } | |
| const sideload = options.sideLoad; | |
| const impersonate = sideload?.impersonate[reqUrlParsed.href]; | |
| if (impersonate) { | |
| let body; | |
| if (impersonate.body) { | |
| body = await (0, promises_1.readFile)(await impersonate.body.filePath); | |
| if (req.isInterceptResolutionHandled()) { | |
| return; | |
| } | |
| } | |
| return req.respond({ | |
| status: impersonate.status, | |
| headers: impersonate.headers, | |
| contentType: impersonate.contentType, | |
| body: body ? Uint8Array.from(body) : undefined, | |
| }, 999); | |
| } | |
| const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin]; | |
| const ctx = this.lifeCycleTrack.get(page); | |
| if (proxy && ctx) { | |
| return await this.asyncLocalContext.bridge(ctx, async () => { | |
| try { | |
| const curled = await this.curlControl.sideLoad(reqUrlParsed, { | |
| ...options, | |
| method: req.method(), | |
| body: req.postData(), | |
| extraHeaders: { | |
| ...req.headers(), | |
| ...options.extraHeaders, | |
| }, | |
| proxyUrl: proxy | |
| }); | |
| if (req.isInterceptResolutionHandled()) { | |
| return; | |
| } | |
| ; | |
| if (curled.chain.length === 1) { | |
| if (!curled.file) { | |
| return req.respond({ | |
| status: curled.status, | |
| headers: lodash_1.default.omit(curled.headers, 'result'), | |
| contentType: curled.contentType, | |
| }, 3); | |
| } | |
| const body = await (0, promises_1.readFile)(await curled.file.filePath); | |
| if (req.isInterceptResolutionHandled()) { | |
| return; | |
| } | |
| ; | |
| return req.respond({ | |
| status: curled.status, | |
| headers: lodash_1.default.omit(curled.headers, 'result'), | |
| contentType: curled.contentType, | |
| body: Uint8Array.from(body), | |
| }, 3); | |
| } | |
| options.sideLoad ??= curled.sideLoadOpts; | |
| lodash_1.default.merge(options.sideLoad, curled.sideLoadOpts); | |
| const firstReq = curled.chain[0]; | |
| return req.respond({ | |
| status: firstReq.result.code, | |
| headers: lodash_1.default.omit(firstReq, 'result'), | |
| }, 3); | |
| } | |
| catch (err) { | |
| this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy }); | |
| } | |
| if (req.isInterceptResolutionHandled()) { | |
| return; | |
| } | |
| ; | |
| const overrides = req.continueRequestOverrides(); | |
| const continueArgs = [{ | |
| ...overrides, | |
| headers: { | |
| ...req.headers(), | |
| ...overrides?.headers, | |
| ...options.extraHeaders, | |
| } | |
| }, 1]; | |
| return req.continue(continueArgs[0], continueArgs[1]); | |
| }); | |
| } | |
| if (req.isInterceptResolutionHandled()) { | |
| return; | |
| } | |
| ; | |
| const overrides = req.continueRequestOverrides(); | |
| const continueArgs = [{ | |
| ...overrides, | |
| headers: { | |
| ...req.headers(), | |
| ...overrides?.headers, | |
| ...options.extraHeaders, | |
| } | |
| }, 1]; | |
| return req.continue(continueArgs[0], continueArgs[1]); | |
| }); | |
| let pageScriptEvaluations = []; | |
| let frameScriptEvaluations = []; | |
| if (options.injectPageScripts?.length) { | |
| page.on('framenavigated', (frame) => { | |
| if (frame !== page.mainFrame()) { | |
| return; | |
| } | |
| pageScriptEvaluations.push(Promise.allSettled(options.injectPageScripts.map((x) => frame.evaluate(x).catch((err) => { | |
| this.logger.warn(`Error in evaluation of page scripts`, { err }); | |
| })))); | |
| }); | |
| } | |
| if (options.injectFrameScripts?.length) { | |
| page.on('framenavigated', (frame) => { | |
| frameScriptEvaluations.push(Promise.allSettled(options.injectFrameScripts.map((x) => frame.evaluate(x).catch((err) => { | |
| this.logger.warn(`Error in evaluation of frame scripts`, { err }); | |
| })))); | |
| }); | |
| } | |
| const sn = this.snMap.get(page); | |
| this.logger.info(`Page ${sn}: Scraping ${url}`, { url }); | |
| if (options.locale) { | |
| // Add headers via request interception to work around this bug | |
| // https://github.com/puppeteer/puppeteer/issues/10235 | |
| // await page.setExtraHTTPHeaders({ | |
| // 'Accept-Language': options.locale | |
| // }); | |
| await page.evaluateOnNewDocument(() => { | |
| Object.defineProperty(navigator, "language", { | |
| get: function () { | |
| return options.locale; | |
| } | |
| }); | |
| Object.defineProperty(navigator, "languages", { | |
| get: function () { | |
| return [options.locale]; | |
| } | |
| }); | |
| }); | |
| } | |
| if (options.cookies) { | |
| const mapped = options.cookies.map((x) => { | |
| const draft = { | |
| name: x.name, | |
| value: encodeURIComponent(x.value), | |
| secure: x.secure, | |
| domain: x.domain, | |
| path: x.path, | |
| expires: x.expires ? Math.floor(x.expires.valueOf() / 1000) : undefined, | |
| sameSite: x.sameSite, | |
| }; | |
| if (!draft.expires && x.maxAge) { | |
| draft.expires = Math.floor(Date.now() / 1000) + x.maxAge; | |
| } | |
| if (!draft.domain) { | |
| draft.url = parsedUrl.toString(); | |
| } | |
| return draft; | |
| }); | |
| try { | |
| await page.setCookie(...mapped); | |
| } | |
| catch (err) { | |
| this.logger.warn(`Page ${sn}: Failed to set cookies`, { err }); | |
| throw new civ_rpc_1.ParamValidationError({ | |
| path: 'cookies', | |
| message: `Failed to set cookies: ${err?.message}` | |
| }); | |
| } | |
| } | |
| if (options.overrideUserAgent) { | |
| await page.setUserAgent(options.overrideUserAgent); | |
| } | |
| if (options.viewport) { | |
| await page.setViewport(options.viewport); | |
| } | |
| let nextSnapshotDeferred = (0, defer_1.Defer)(); | |
| const crippleListener = () => nextSnapshotDeferred.reject(new errors_1.ServiceCrashedError({ message: `Browser crashed, try again` })); | |
| this.once('crippled', crippleListener); | |
| nextSnapshotDeferred.promise.finally(() => { | |
| this.off('crippled', crippleListener); | |
| }); | |
| let successfullyDone; | |
| const hdl = (s) => { | |
| if (snapshot === s) { | |
| return; | |
| } | |
| snapshot = s; | |
| if (snapshot) { | |
| const kit = this.pageReqCtrl.get(page); | |
| snapshot.lastContentResourceLoaded = kit?.lastContentResourceLoadedAt; | |
| snapshot.lastMediaResourceLoaded = kit?.lastMediaResourceLoadedAt; | |
| } | |
| if (s?.maxElemDepth && s.maxElemDepth > 256) { | |
| return; | |
| } | |
| if (s?.elemCount && s.elemCount > 10_000) { | |
| return; | |
| } | |
| nextSnapshotDeferred.resolve(s); | |
| nextSnapshotDeferred = (0, defer_1.Defer)(); | |
| this.once('crippled', crippleListener); | |
| nextSnapshotDeferred.promise.finally(() => { | |
| this.off('crippled', crippleListener); | |
| }); | |
| }; | |
| page.on('snapshot', hdl); | |
| page.once('abuse', (event) => { | |
| this.emit('abuse', { ...event, url: parsedUrl }); | |
| if (snapshot?.href && parsedUrl.href !== snapshot.href) { | |
| this.emit('abuse', { ...event, url: snapshot.href }); | |
| } | |
| nextSnapshotDeferred.reject(new errors_1.SecurityCompromiseError(`Abuse detected: ${event.reason}`)); | |
| }); | |
| const timeout = options.timeoutMs || 30_000; | |
| const goToOptions = { | |
| waitUntil: ['load', 'domcontentloaded', 'networkidle0'], | |
| timeout, | |
| }; | |
| if (options.referer) { | |
| goToOptions.referer = options.referer; | |
| } | |
| let waitForPromise; | |
| let finalizationPromise; | |
| const doFinalization = async () => { | |
| if (waitForPromise) { | |
| // successfullyDone signals that the page itself has finished loading. | |
| // A waited-for selector that never shows up is not treated as a failure here. | |
| await waitForPromise.catch(() => void 0); | |
| } | |
| successfullyDone ??= true; | |
| try { | |
| const pSubFrameSnapshots = this.snapshotChildFrames(page); | |
| snapshot = await page.evaluate('giveSnapshot(true)'); | |
| screenshot = (await this.takeScreenShot(page)) || screenshot; | |
| pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot; | |
| if (snapshot) { | |
| snapshot.childFrames = await pSubFrameSnapshots; | |
| } | |
| } | |
| catch (err) { | |
| this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err }); | |
| } | |
| if (!snapshot?.html) { | |
| return; | |
| } | |
| this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href }); | |
| this.emit('crawled', { | |
| ...snapshot, | |
| status: navigationResponse?.status(), | |
| statusText: navigationResponse?.statusText(), | |
| pdfs: lodash_1.default.uniq(pdfUrls), screenshot, pageshot, | |
| }, { ...options, url: parsedUrl }); | |
| }; | |
| const delayPromise = (0, timeout_1.delay)(timeout); | |
| const gotoPromise = page.goto(url, goToOptions) | |
| .catch((err) => { | |
| if (err instanceof puppeteer_1.TimeoutError) { | |
| this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err }); | |
| return new civ_rpc_1.AssertionFailureError({ | |
| message: `Failed to goto ${url}: ${err}`, | |
| cause: err, | |
| }); | |
| } | |
| if (err?.message?.startsWith('net::ERR_ABORTED')) { | |
| if (pdfUrls.length) { | |
| // Do not treat the abort as a failure when PDF URLs were captured. | |
| return; | |
| } | |
| } | |
| this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err }); | |
| return new civ_rpc_1.AssertionFailureError({ | |
| message: `Failed to goto ${url}: ${err}`, | |
| cause: err, | |
| }); | |
| }).then(async (stuff) => { | |
| // This check is necessary because without snapshot, the condition of the page is unclear | |
| // Calling evaluate directly may stall the process. | |
| if (!snapshot) { | |
| if (stuff instanceof Error) { | |
| throw stuff; | |
| } | |
| } | |
| await Promise.race([Promise.allSettled([...pageScriptEvaluations, ...frameScriptEvaluations]), delayPromise]) | |
| .catch(() => void 0); | |
| return stuff; | |
| }); | |
| if (options.waitForSelector) { | |
| const t0 = Date.now(); | |
| waitForPromise = nextSnapshotDeferred.promise.then(() => { | |
| const t1 = Date.now(); | |
| const elapsed = t1 - t0; | |
| const remaining = timeout - elapsed; | |
| const thisTimeout = remaining > 100 ? remaining : 100; | |
| const p = (Array.isArray(options.waitForSelector) ? | |
| Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) : | |
| page.waitForSelector(options.waitForSelector, { timeout: thisTimeout })) | |
| .then(() => { | |
| successfullyDone = true; | |
| }) | |
| .catch((err) => { | |
| waitForPromise = undefined; | |
| this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err }); | |
| }); | |
| return p; | |
| }); | |
| finalizationPromise = Promise.allSettled([waitForPromise, gotoPromise]).then(doFinalization); | |
| } | |
| else { | |
| finalizationPromise = gotoPromise.then(doFinalization); | |
| } | |
| try { | |
| let lastHTML = snapshot?.html; | |
| while (true) { | |
| const ckpt = [nextSnapshotDeferred.promise, waitForPromise ?? gotoPromise]; | |
| if (options.minIntervalMs) { | |
| ckpt.push((0, timeout_1.delay)(options.minIntervalMs)); | |
| } | |
| let error; | |
| await Promise.race(ckpt).catch((err) => error = err); | |
| if (successfullyDone && !error) { | |
| if (!snapshot && !screenshot) { | |
| throw new civ_rpc_1.AssertionFailureError(`Could not extract any meaningful content from the page`); | |
| } | |
| yield { | |
| ...snapshot, | |
| status: navigationResponse?.status(), | |
| statusText: navigationResponse?.statusText(), | |
| pdfs: lodash_1.default.uniq(pdfUrls), screenshot, pageshot | |
| }; | |
| break; | |
| } | |
| if (options.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) { | |
| screenshot = (await this.takeScreenShot(page)) || screenshot; | |
| pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot; | |
| lastHTML = snapshot.html; | |
| } | |
| if (snapshot || screenshot) { | |
| yield { | |
| ...snapshot, | |
| status: navigationResponse?.status(), | |
| statusText: navigationResponse?.statusText(), | |
| pdfs: lodash_1.default.uniq(pdfUrls), screenshot, pageshot, | |
| isIntermediate: true, | |
| }; | |
| } | |
| if (error) { | |
| throw error; | |
| } | |
| if (successfullyDone) { | |
| break; | |
| } | |
| } | |
| await finalizationPromise; | |
| yield { | |
| ...snapshot, | |
| status: navigationResponse?.status(), | |
| statusText: navigationResponse?.statusText(), | |
| pdfs: lodash_1.default.uniq(pdfUrls), screenshot, pageshot | |
| }; | |
| } | |
| finally { | |
| this.pagePhase.set(page, 'background'); | |
| Promise.allSettled([gotoPromise, waitForPromise, finalizationPromise]).finally(() => { | |
| page.off('snapshot', hdl); | |
| this.ditchPage(page); | |
| }); | |
| nextSnapshotDeferred.resolve(); | |
| } | |
| } | |
| async takeScreenShot(page, opts) { | |
| const r = await page.screenshot(opts).catch((err) => { | |
| this.logger.warn(`Failed to take screenshot`, { err }); | |
| }); | |
| if (r) { | |
| return Buffer.from(r); | |
| } | |
| return undefined; | |
| } | |
| async snapshotChildFrames(page) { | |
| const childFrames = page.mainFrame().childFrames(); | |
| const r = await Promise.all(childFrames.map(async (x) => { | |
| const thisUrl = x.url(); | |
| if (!thisUrl || thisUrl === 'about:blank') { | |
| return undefined; | |
| } | |
| try { | |
| await x.evaluate(SCRIPT_TO_INJECT_INTO_FRAME); | |
| return await x.evaluate(`giveSnapshot()`); | |
| } | |
| catch (err) { | |
| this.logger.warn(`Failed to snapshot child frame ${thisUrl}`, { err }); | |
| return undefined; | |
| } | |
| })); | |
| return r.filter(Boolean); | |
| } | |
| }; | |
| // Publish the class binding under the named export `PuppeteerControl`. | |
| exports.PuppeteerControl = PuppeteerControl; | |
| // Pass the class through `__decorate` with the tsyringe `singleton()` decorator and | |
| // "design:paramtypes" metadata, then re-point both the local binding and the named | |
| // export at the result. The four listed types are presumably the constructor's | |
| // injected dependencies, in parameter order -- confirm against the constructor. | |
| exports.PuppeteerControl = PuppeteerControl = __decorate([ | |
| (0, tsyringe_1.singleton)(), | |
| __metadata("design:paramtypes", [logger_1.GlobalLogger, | |
| async_context_1.AsyncLocalContext, | |
| curl_1.CurlControl, | |
| blackhole_detector_1.BlackHoleDetector]) | |
| ], PuppeteerControl); | |
| // Resolve an instance from the tsyringe container when this module is loaded and | |
| // expose that instance as the module's default export. | |
| const puppeteerControl = tsyringe_1.container.resolve(PuppeteerControl); | |
| exports.default = puppeteerControl; | |
| //# sourceMappingURL=puppeteer.js.map |