nomagick committed on
Commit
0da71ca
·
unverified ·
1 Parent(s): 4e5abd3

fix: robots-txt not loaded error conditions

Browse files
src/dto/crawler-options.ts CHANGED
@@ -134,6 +134,16 @@ class Viewport extends AutoCastable {
134
  in: 'header',
135
  schema: { type: 'string' }
136
  },
 
 
 
 
 
 
 
 
 
 
137
  'X-Set-Cookie': {
138
  description: `Sets cookie(s) to the headless browser for your request. \n\n` +
139
  `Syntax is the same with standard Set-Cookie`,
 
134
  in: 'header',
135
  schema: { type: 'string' }
136
  },
137
+ 'X-Robots-Txt': {
138
+ description: `Load and conform to the respective robot.txt on the target origin.\n\nOptionally specify a bot UA to check against.\n\n`,
139
+ in: 'header',
140
+ schema: { type: 'string' }
141
+ },
142
+ 'DNT': {
143
+ description: `When set to 1, prevent the result of this request to be cached in the system.\n\n`,
144
+ in: 'header',
145
+ schema: { type: 'string' }
146
+ },
147
  'X-Set-Cookie': {
148
  description: `Sets cookie(s) to the headless browser for your request. \n\n` +
149
  `Syntax is the same with standard Set-Cookie`,
src/services/robots-text.ts CHANGED
@@ -1,6 +1,6 @@
1
  import { singleton } from 'tsyringe';
2
  import { URL } from 'url';
3
- import { DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc';
4
  import { AsyncService } from 'civkit/async-service';
5
  import { HashManager } from 'civkit/hash';
6
  import { marshalErrorLike } from 'civkit/lang';
@@ -40,7 +40,7 @@ export class RobotsTxtService extends AsyncService {
40
 
41
  const r = await fetch(new URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) });
42
  if (!r.ok) {
43
- throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}`);
44
  }
45
  buff = Buffer.from(await r.arrayBuffer());
46
 
@@ -60,9 +60,10 @@ export class RobotsTxtService extends AsyncService {
60
  robotTxt = await this.getCachedRobotTxt(url.origin);
61
  } catch (err) {
62
  if (err instanceof DownstreamServiceFailureError) {
 
63
  return true;
64
  }
65
- throw err;
66
  }
67
  const myUa = inputMyUa.toLowerCase();
68
  const lines = robotTxt.split(/\r?\n/g);
 
1
  import { singleton } from 'tsyringe';
2
  import { URL } from 'url';
3
+ import { AssertionFailureError, DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc';
4
  import { AsyncService } from 'civkit/async-service';
5
  import { HashManager } from 'civkit/hash';
6
  import { marshalErrorLike } from 'civkit/lang';
 
40
 
41
  const r = await fetch(new URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) });
42
  if (!r.ok) {
43
+ throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}: ${r.status} ${r.statusText}`);
44
  }
45
  buff = Buffer.from(await r.arrayBuffer());
46
 
 
60
  robotTxt = await this.getCachedRobotTxt(url.origin);
61
  } catch (err) {
62
  if (err instanceof DownstreamServiceFailureError) {
63
+ // Remote server is reachable but cannot provide a robots.txt; this is treated as public access
64
  return true;
65
  }
66
+ throw new AssertionFailureError(`Failed to load robots.txt from ${url.origin}: ${err}`);
67
  }
68
  const myUa = inputMyUa.toLowerCase();
69
  const lines = robotTxt.split(/\r?\n/g);