// Source: web_reader/build/services/robots-text.js — pre-built output committed
// for HF deployment (commit f316cce, author: Mohammad Shahid).
"use strict";
// TypeScript emit helper: applies `decorators` to a class (when called with
// fewer than 3 meaningful arguments) or to `target[key]` (method/property).
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
    // c < 3 -> class decorator; otherwise `r` starts as the property
    // descriptor (looked up when `desc` is passed as null).
    var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;
    // Prefer the reflect-metadata implementation when it is available.
    if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc);
    // Fallback: apply decorators right-to-left; each may replace `r`.
    else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
    // For method/property decorators, install the (possibly replaced) descriptor.
    return c > 3 && r && Object.defineProperty(target, key, r), r;
};
// TypeScript emit helper: records design-time type metadata for a decorated
// declaration. Delegates to `Reflect.metadata` (reflect-metadata polyfill)
// when present; otherwise it is a no-op yielding `undefined`.
var __metadata = (this && this.__metadata) || function (k, v) {
    var hasReflectMetadata = typeof Reflect === "object" && typeof Reflect.metadata === "function";
    if (!hasReflectMetadata) {
        return undefined;
    }
    return Reflect.metadata(k, v);
};
// `_a` caches a design-time parameter type for the __metadata wiring at the
// bottom of the file.
var _a;
Object.defineProperty(exports, "__esModule", { value: true });
exports.RobotsTxtService = exports.md5Hasher = void 0;
const tsyringe_1 = require("tsyringe");
const url_1 = require("url");
const civ_rpc_1 = require("civkit/civ-rpc");
const async_service_1 = require("civkit/async-service");
const hash_1 = require("civkit/hash");
const lang_1 = require("civkit/lang");
const logger_1 = require("./logger");
const firebase_storage_bucket_1 = require("../shared/services/firebase-storage-bucket");
const threaded_1 = require("../services/threaded");
// Shared MD5 hasher (hex digests); used to derive cache keys from origins.
exports.md5Hasher = new hash_1.HashManager('md5', 'hex');
/**
 * Matches a single robots.txt rule value (the text after `Allow:` /
 * `Disallow:`) against a URL pathname.
 *
 * Supports the `*` wildcard (any run of characters) and the `$` end-of-path
 * anchor per RFC 9309. Unanchored rules are prefix matches. Matching uses a
 * linear greedy leftmost scan rather than a dynamically built RegExp, so
 * untrusted remote patterns cannot trigger catastrophic backtracking.
 *
 * Fixes over the previous inline logic: multi-`*` patterns were collapsed to
 * the first two split segments; `startsWith(head) && endsWith(tail)` could
 * match on overlapping text (e.g. rule `/a*a` vs path `/a`); wildcard rules
 * were incorrectly end-anchored; `Allow` rules ignored `*` entirely.
 *
 * @param {string} rule - raw rule value, e.g. `/private/*.html$`
 * @param {string} pathname - the URL pathname to test
 * @returns {boolean} true when the rule matches the pathname
 */
function matchRobotsRule(rule, pathname) {
    let pattern = rule;
    let anchored = false;
    if (pattern.endsWith('$')) {
        anchored = true;
        pattern = pattern.slice(0, -1);
    }
    // For unanchored rules a trailing '?' is appended to the subject so that
    // rules written with a query part (e.g. `Disallow: /search?`) still match
    // the bare pathname — URL#pathname never contains the query string.
    const subject = anchored ? pathname : `${pathname}?`;
    const segments = pattern.split('*');
    const first = segments[0];
    if (!subject.startsWith(first)) {
        return false;
    }
    if (segments.length === 1) {
        // No wildcard: plain prefix match, or exact match when `$`-anchored.
        return anchored ? subject === first : true;
    }
    let pos = first.length;
    let end = subject.length;
    let lastIdx = segments.length;
    const last = segments[segments.length - 1];
    if (anchored && last) {
        // The final literal segment must sit flush against the end of the path.
        if (!subject.endsWith(last) || subject.length - last.length < pos) {
            return false;
        }
        end = subject.length - last.length;
        lastIdx = segments.length - 1;
    }
    for (let i = 1; i < lastIdx; i++) {
        const seg = segments[i];
        if (!seg) {
            continue; // consecutive or trailing '*' — matches anything
        }
        // Greedy leftmost placement of each literal segment is complete for
        // single-kind wildcards: taking the earliest match always leaves the
        // maximal remainder for subsequent segments.
        const idx = subject.indexOf(seg, pos);
        if (idx === -1 || idx + seg.length > end) {
            return false;
        }
        pos = idx + seg.length;
    }
    return true;
}
/**
 * Fetches, caches, and evaluates robots.txt files to decide whether a URL may
 * be crawled by a given user-agent.
 */
let RobotsTxtService = class RobotsTxtService extends async_service_1.AsyncService {
    /**
     * @param globalLogger - injected root logger; a child logger tagged with
     *        this service's name is derived from it
     * @param firebaseStorageBucketControl - storage backend used as a
     *        write-through cache for fetched robots.txt bodies
     */
    constructor(globalLogger, firebaseStorageBucketControl) {
        super(...arguments);
        this.globalLogger = globalLogger;
        this.firebaseStorageBucketControl = firebaseStorageBucketControl;
        this.logger = this.globalLogger.child({ service: this.constructor.name });
    }
    // Standard AsyncService lifecycle: wait for dependencies, then signal ready.
    async init() {
        await this.dependencyReady();
        this.emit('ready');
    }
    /**
     * Returns the robots.txt body for `origin`, using the storage bucket as a
     * write-through cache keyed by the MD5 of the lower-cased origin.
     *
     * @param {string} origin - e.g. `https://example.com`
     * @returns {Promise<string>} the robots.txt text
     * @throws {DownstreamServiceFailureError} when no cached copy exists and
     *         the remote server answers with a non-2xx status
     */
    async getCachedRobotTxt(origin) {
        const digest = exports.md5Hasher.hash(origin.toLowerCase());
        const cacheLoc = `robots-txt/${digest}`;
        // Any download error is treated as a cache miss and falls through to a
        // live fetch (5s timeout).
        const cached = await this.firebaseStorageBucketControl.downloadFile(cacheLoc).catch(() => undefined);
        if (cached) {
            return cached.toString();
        }
        const r = await fetch(new url_1.URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) });
        if (!r.ok) {
            throw new civ_rpc_1.DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}: ${r.status} ${r.statusText}`);
        }
        const buff = Buffer.from(await r.arrayBuffer());
        // Best-effort cache write; failures are logged, never propagated.
        this.firebaseStorageBucketControl.saveFile(cacheLoc, buff, {
            contentType: 'text/plain'
        }).catch((err) => {
            this.logger.warn(`Failed to save robots.txt to cache: ${err}`, { err: (0, lang_1.marshalErrorLike)(err) });
        });
        return buff.toString();
    }
    /**
     * Asserts that `url` may be crawled by user-agent `inputMyUa` according to
     * the site's robots.txt. Rules are evaluated sequentially within matching
     * user-agent groups; both `Allow` and `Disallow` honor `*` and `$`.
     *
     * @param {URL} url - the URL to check
     * @param {string} [inputMyUa='*'] - user-agent token to match (exact,
     *        case-insensitive; NOTE(review): RFC 9309 describes substring
     *        product-token matching — confirm exact match is intended here)
     * @returns {Promise<boolean>} true when access is allowed
     * @throws {ResourcePolicyDenyError} when a matching Disallow rule applies
     * @throws {AssertionFailureError} when robots.txt cannot be loaded at all
     */
    async assertAccessAllowed(url, inputMyUa = '*') {
        let robotTxt = '';
        try {
            robotTxt = await this.getCachedRobotTxt(url.origin);
        }
        catch (err) {
            if (err instanceof civ_rpc_1.DownstreamServiceFailureError) {
                // Remote server is reachable but cannot provide a robot.txt; this is treated as public access
                return true;
            }
            throw new civ_rpc_1.AssertionFailureError(`Failed to load robots.txt from ${url.origin}: ${err}`);
        }
        const myUa = inputMyUa.toLowerCase();
        const lines = robotTxt.split(/\r?\n/g);
        let currentUa = myUa || '*';
        let uaLine = 'User-Agent: *';
        for (const line of lines) {
            const trimmed = line.trim();
            if (trimmed.startsWith('#') || !trimmed) {
                continue;
            }
            // Split on the first ':' only; the value may itself contain ':'.
            const [k, ...rest] = trimmed.split(':');
            const key = k.trim().toLowerCase();
            const value = rest.join(':').trim();
            if (key === 'user-agent') {
                currentUa = value.toLowerCase();
                if (value === '*') {
                    // Wildcard group applies to our UA as well.
                    currentUa = myUa;
                }
                uaLine = line;
                continue;
            }
            if (currentUa !== myUa) {
                continue; // rule belongs to another user-agent group
            }
            if (key === 'disallow') {
                if (!value) {
                    // Empty Disallow: everything is allowed for this group.
                    return true;
                }
                if (matchRobotsRule(value, url.pathname)) {
                    throw new civ_rpc_1.ResourcePolicyDenyError(`Access to ${url.href} is disallowed by site robots.txt: For ${uaLine}, ${line}`);
                }
                continue;
            }
            if (key === 'allow') {
                if (!value) {
                    return true;
                }
                if (matchRobotsRule(value, url.pathname)) {
                    return true;
                }
                continue;
            }
        }
        // No rule denied access — default to allowed.
        return true;
    }
};
exports.RobotsTxtService = RobotsTxtService;
// Wire the @Threaded() decorator onto assertAccessAllowed, recording its
// design-time signature (URL, string) -> Promise for reflect-metadata.
__decorate([
    (0, threaded_1.Threaded)(),
    __metadata("design:type", Function),
    __metadata("design:paramtypes", [typeof (_a = typeof url_1.URL !== "undefined" && url_1.URL) === "function" ? _a : Object, Object]),
    __metadata("design:returntype", Promise)
], RobotsTxtService.prototype, "assertAccessAllowed", null);
// Register the class as a tsyringe singleton; constructor parameter types are
// recorded so the DI container can inject the logger and the storage bucket.
exports.RobotsTxtService = RobotsTxtService = __decorate([
    (0, tsyringe_1.singleton)(),
    __metadata("design:paramtypes", [logger_1.GlobalLogger,
        firebase_storage_bucket_1.FirebaseStorageBucketControl])
], RobotsTxtService);
//# sourceMappingURL=robots-text.js.map