Spaces:
Build error
Build error
fix: robots-txt not loaded error conditions
Browse files
- src/dto/crawler-options.ts +10 -0
- src/services/robots-text.ts +4 -3
src/dto/crawler-options.ts
CHANGED
|
@@ -134,6 +134,16 @@ class Viewport extends AutoCastable {
|
|
| 134 |
in: 'header',
|
| 135 |
schema: { type: 'string' }
|
| 136 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
'X-Set-Cookie': {
|
| 138 |
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
|
| 139 |
`Syntax is the same with standard Set-Cookie`,
|
|
|
|
| 134 |
in: 'header',
|
| 135 |
schema: { type: 'string' }
|
| 136 |
},
|
| 137 |
+
'X-Robots-Txt': {
|
| 138 |
+
description: `Load and conform to the respective robot.txt on the target origin.\n\nOptionally specify a bot UA to check against.\n\n`,
|
| 139 |
+
in: 'header',
|
| 140 |
+
schema: { type: 'string' }
|
| 141 |
+
},
|
| 142 |
+
'DNT': {
|
| 143 |
+
description: `When set to 1, prevent the result of this request to be cached in the system.\n\n`,
|
| 144 |
+
in: 'header',
|
| 145 |
+
schema: { type: 'string' }
|
| 146 |
+
},
|
| 147 |
'X-Set-Cookie': {
|
| 148 |
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
|
| 149 |
`Syntax is the same with standard Set-Cookie`,
|
src/services/robots-text.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import { singleton } from 'tsyringe';
|
| 2 |
import { URL } from 'url';
|
| 3 |
-
import { DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc';
|
| 4 |
import { AsyncService } from 'civkit/async-service';
|
| 5 |
import { HashManager } from 'civkit/hash';
|
| 6 |
import { marshalErrorLike } from 'civkit/lang';
|
|
@@ -40,7 +40,7 @@ export class RobotsTxtService extends AsyncService {
|
|
| 40 |
|
| 41 |
const r = await fetch(new URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) });
|
| 42 |
if (!r.ok) {
|
| 43 |
-
throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}`);
|
| 44 |
}
|
| 45 |
buff = Buffer.from(await r.arrayBuffer());
|
| 46 |
|
|
@@ -60,9 +60,10 @@ export class RobotsTxtService extends AsyncService {
|
|
| 60 |
robotTxt = await this.getCachedRobotTxt(url.origin);
|
| 61 |
} catch (err) {
|
| 62 |
if (err instanceof DownstreamServiceFailureError) {
|
|
|
|
| 63 |
return true;
|
| 64 |
}
|
| 65 |
-
throw err;
|
| 66 |
}
|
| 67 |
const myUa = inputMyUa.toLowerCase();
|
| 68 |
const lines = robotTxt.split(/\r?\n/g);
|
|
|
|
| 1 |
import { singleton } from 'tsyringe';
|
| 2 |
import { URL } from 'url';
|
| 3 |
+
import { AssertionFailureError, DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc';
|
| 4 |
import { AsyncService } from 'civkit/async-service';
|
| 5 |
import { HashManager } from 'civkit/hash';
|
| 6 |
import { marshalErrorLike } from 'civkit/lang';
|
|
|
|
| 40 |
|
| 41 |
const r = await fetch(new URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) });
|
| 42 |
if (!r.ok) {
|
| 43 |
+
throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}: ${r.status} ${r.statusText}`);
|
| 44 |
}
|
| 45 |
buff = Buffer.from(await r.arrayBuffer());
|
| 46 |
|
|
|
|
| 60 |
robotTxt = await this.getCachedRobotTxt(url.origin);
|
| 61 |
} catch (err) {
|
| 62 |
if (err instanceof DownstreamServiceFailureError) {
|
| 63 |
+
// Remote server is reachable but cannot provide a robot.txt; this is treated as public access
|
| 64 |
return true;
|
| 65 |
}
|
| 66 |
+
throw new AssertionFailureError(`Failed to load robots.txt from ${url.origin}: ${err}`);
|
| 67 |
}
|
| 68 |
const myUa = inputMyUa.toLowerCase();
|
| 69 |
const lines = robotTxt.split(/\r?\n/g);
|