"use strict"; var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) { var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d; if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc); else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r; return c > 3 && r && Object.defineProperty(target, key, r), r; }; var __metadata = (this && this.__metadata) || function (k, v) { if (typeof Reflect === "object" && typeof Reflect.metadata === "function") return Reflect.metadata(k, v); }; var _a; Object.defineProperty(exports, "__esModule", { value: true }); exports.RobotsTxtService = exports.md5Hasher = void 0; const tsyringe_1 = require("tsyringe"); const url_1 = require("url"); const civ_rpc_1 = require("civkit/civ-rpc"); const async_service_1 = require("civkit/async-service"); const hash_1 = require("civkit/hash"); const lang_1 = require("civkit/lang"); const logger_1 = require("./logger"); const firebase_storage_bucket_1 = require("../shared/services/firebase-storage-bucket"); const threaded_1 = require("../services/threaded"); exports.md5Hasher = new hash_1.HashManager('md5', 'hex'); let RobotsTxtService = class RobotsTxtService extends async_service_1.AsyncService { constructor(globalLogger, firebaseStorageBucketControl) { super(...arguments); this.globalLogger = globalLogger; this.firebaseStorageBucketControl = firebaseStorageBucketControl; this.logger = this.globalLogger.child({ service: this.constructor.name }); } async init() { await this.dependencyReady(); this.emit('ready'); } async getCachedRobotTxt(origin) { const digest = exports.md5Hasher.hash(origin.toLowerCase()); const cacheLoc = `robots-txt/${digest}`; let buff; buff = await this.firebaseStorageBucketControl.downloadFile(cacheLoc).catch(() => undefined); if (buff) { return buff.toString(); } const r = await fetch(new url_1.URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) }); if (!r.ok) { throw new civ_rpc_1.DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}: ${r.status} ${r.statusText}`); } buff = Buffer.from(await r.arrayBuffer()); this.firebaseStorageBucketControl.saveFile(cacheLoc, buff, { contentType: 'text/plain' }).catch((err) => { this.logger.warn(`Failed to save robots.txt to cache: ${err}`, { err: (0, lang_1.marshalErrorLike)(err) }); }); return buff.toString(); } async assertAccessAllowed(url, inputMyUa = '*') { let robotTxt = ''; try { robotTxt = await this.getCachedRobotTxt(url.origin); } catch (err) { if (err instanceof civ_rpc_1.DownstreamServiceFailureError) { // Remote server is reachable but cannot provide a robot.txt; this is treated as public access return true; } throw new civ_rpc_1.AssertionFailureError(`Failed to load robots.txt from ${url.origin}: ${err}`); } const myUa = inputMyUa.toLowerCase(); const lines = robotTxt.split(/\r?\n/g); let currentUa = myUa || '*'; let uaLine = 'User-Agent: *'; const pathNormalized = `${url.pathname}?`; for (const line of lines) { const trimmed = line.trim(); if (trimmed.startsWith('#') || !trimmed) { continue; } const [k, ...rest] = trimmed.split(':'); const key = k.trim().toLowerCase(); const value = rest.join(':').trim(); if (key === 'user-agent') { currentUa = value.toLowerCase(); if (value === '*') { currentUa = myUa; } uaLine = line; continue; } if (currentUa !== myUa) { continue; } if (key === 'disallow') { if (!value) { return true; } if (value.includes('*')) { const [head, tail] = value.split('*'); if (url.pathname.startsWith(head) && url.pathname.endsWith(tail)) { throw new civ_rpc_1.ResourcePolicyDenyError(`Access to ${url.href} is disallowed by site robots.txt: For ${uaLine}, ${line}`); } } else if (pathNormalized.startsWith(value)) { throw new civ_rpc_1.ResourcePolicyDenyError(`Access to ${url.href} is disallowed by site robots.txt: For ${uaLine}, ${line}`); } continue; } if (key === 'allow') { if (!value) { return true; } if (pathNormalized.startsWith(value)) { return true; } continue; } } return true; } }; exports.RobotsTxtService = RobotsTxtService; __decorate([ (0, threaded_1.Threaded)(), __metadata("design:type", Function), __metadata("design:paramtypes", [typeof (_a = typeof url_1.URL !== "undefined" && url_1.URL) === "function" ? _a : Object, Object]), __metadata("design:returntype", Promise) ], RobotsTxtService.prototype, "assertAccessAllowed", null); exports.RobotsTxtService = RobotsTxtService = __decorate([ (0, tsyringe_1.singleton)(), __metadata("design:paramtypes", [logger_1.GlobalLogger, firebase_storage_bucket_1.FirebaseStorageBucketControl]) ], RobotsTxtService); //# sourceMappingURL=robots-text.js.map