Spaces:
Build error
Build error
feat: md options pass through to turndown
Browse files
- package-lock.json +4 -4
- package.json +1 -1
- src/api/crawler.ts +3 -0
- src/dto/crawler-options.ts +43 -0
- src/dto/turndown-tweakable-options.ts +61 -0
- src/services/snapshot-formatter.ts +2 -0
package-lock.json
CHANGED
|
@@ -17,7 +17,7 @@
|
|
| 17 |
"axios": "^1.3.3",
|
| 18 |
"bcrypt": "^5.1.0",
|
| 19 |
"busboy": "^1.6.0",
|
| 20 |
-
"civkit": "^0.8.4-
|
| 21 |
"core-js": "^3.37.1",
|
| 22 |
"cors": "^2.8.5",
|
| 23 |
"dayjs": "^1.11.9",
|
|
@@ -4095,9 +4095,9 @@
|
|
| 4095 |
}
|
| 4096 |
},
|
| 4097 |
"node_modules/civkit": {
|
| 4098 |
-
"version": "0.8.4-
|
| 4099 |
-
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-
|
| 4100 |
-
"integrity": "sha512-
|
| 4101 |
"license": "AGPL",
|
| 4102 |
"dependencies": {
|
| 4103 |
"lodash": "^4.17.21",
|
|
|
|
| 17 |
"axios": "^1.3.3",
|
| 18 |
"bcrypt": "^5.1.0",
|
| 19 |
"busboy": "^1.6.0",
|
| 20 |
+
"civkit": "^0.8.4-5f839a7",
|
| 21 |
"core-js": "^3.37.1",
|
| 22 |
"cors": "^2.8.5",
|
| 23 |
"dayjs": "^1.11.9",
|
|
|
|
| 4095 |
}
|
| 4096 |
},
|
| 4097 |
"node_modules/civkit": {
|
| 4098 |
+
"version": "0.8.4-5f839a7",
|
| 4099 |
+
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-5f839a7.tgz",
|
| 4100 |
+
"integrity": "sha512-wF9Sm0dKBNGTXtueYtmwqreciilEw2+H3uAZgJNK/B+MoeQecvQ1alrqPqIP/Xf64H1ik6mD0Z47cez8jkayGA==",
|
| 4101 |
"license": "AGPL",
|
| 4102 |
"dependencies": {
|
| 4103 |
"lodash": "^4.17.21",
|
package.json
CHANGED
|
@@ -25,7 +25,7 @@
|
|
| 25 |
"axios": "^1.3.3",
|
| 26 |
"bcrypt": "^5.1.0",
|
| 27 |
"busboy": "^1.6.0",
|
| 28 |
-
"civkit": "^0.8.4-
|
| 29 |
"core-js": "^3.37.1",
|
| 30 |
"cors": "^2.8.5",
|
| 31 |
"dayjs": "^1.11.9",
|
|
|
|
| 25 |
"axios": "^1.3.3",
|
| 26 |
"bcrypt": "^5.1.0",
|
| 27 |
"busboy": "^1.6.0",
|
| 28 |
+
"civkit": "^0.8.4-5f839a7",
|
| 29 |
"core-js": "^3.37.1",
|
| 30 |
"cors": "^2.8.5",
|
| 31 |
"dayjs": "^1.11.9",
|
src/api/crawler.ts
CHANGED
|
@@ -940,6 +940,9 @@ export class CrawlerHost extends RPCHost {
|
|
| 940 |
this.threadLocal.set('retainImages', opts.retainImages);
|
| 941 |
this.threadLocal.set('noGfm', opts.noGfm);
|
| 942 |
this.threadLocal.set('DNT', Boolean(opts.doNotTrack));
|
|
|
|
|
|
|
|
|
|
| 943 |
|
| 944 |
const crawlOpts: ExtraScrappingOptions = {
|
| 945 |
proxyUrl: opts.proxyUrl,
|
|
|
|
| 940 |
this.threadLocal.set('retainImages', opts.retainImages);
|
| 941 |
this.threadLocal.set('noGfm', opts.noGfm);
|
| 942 |
this.threadLocal.set('DNT', Boolean(opts.doNotTrack));
|
| 943 |
+
if (opts.markdown) {
|
| 944 |
+
this.threadLocal.set('turndownOpts', opts.markdown);
|
| 945 |
+
}
|
| 946 |
|
| 947 |
const crawlOpts: ExtraScrappingOptions = {
|
| 948 |
proxyUrl: opts.proxyUrl,
|
src/dto/crawler-options.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
|
| 2 |
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
| 3 |
import { Context } from '../services/registry';
|
|
|
|
| 4 |
|
| 5 |
export enum CONTENT_FORMAT {
|
| 6 |
CONTENT = 'content',
|
|
@@ -209,6 +210,41 @@ class Viewport extends AutoCastable {
|
|
| 209 |
in: 'header',
|
| 210 |
schema: { type: 'string' }
|
| 211 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
}
|
| 213 |
}
|
| 214 |
}
|
|
@@ -353,6 +389,9 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 353 |
@Prop()
|
| 354 |
doNotTrack?: number | null;
|
| 355 |
|
|
|
|
|
|
|
|
|
|
| 356 |
static override from(input: any) {
|
| 357 |
const instance = super.from(input) as CrawlerOptions;
|
| 358 |
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
|
|
@@ -510,6 +549,10 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 510 |
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
| 511 |
}
|
| 512 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
return instance;
|
| 514 |
}
|
| 515 |
|
|
|
|
| 1 |
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
|
| 2 |
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
| 3 |
import { Context } from '../services/registry';
|
| 4 |
+
import { TurnDownTweakableOptions } from './turndown-tweakable-options';
|
| 5 |
|
| 6 |
export enum CONTENT_FORMAT {
|
| 7 |
CONTENT = 'content',
|
|
|
|
| 210 |
in: 'header',
|
| 211 |
schema: { type: 'string' }
|
| 212 |
},
|
| 213 |
+
'X-Md-Heading-Style': {
|
| 214 |
+
description: 'Heading style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: setext, atx',
|
| 215 |
+
in: 'header',
|
| 216 |
+
schema: { type: 'string' }
|
| 217 |
+
},
|
| 218 |
+
'X-Md-Hr': {
|
| 219 |
+
description: 'Hr text of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).',
|
| 220 |
+
in: 'header',
|
| 221 |
+
schema: { type: 'string' }
|
| 222 |
+
},
|
| 223 |
+
'X-Md-Bullet-List-Marker': {
|
| 224 |
+
description: 'Bullet list marker of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: -, +, *',
|
| 225 |
+
in: 'header',
|
| 226 |
+
schema: { type: 'string' }
|
| 227 |
+
},
|
| 228 |
+
'X-Md-Em-Delimiter': {
|
| 229 |
+
description: 'Em delimiter of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: _, *',
|
| 230 |
+
in: 'header',
|
| 231 |
+
schema: { type: 'string' }
|
| 232 |
+
},
|
| 233 |
+
'X-Md-Strong-Delimiter': {
|
| 234 |
+
description: 'Strong delimiter of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: **, __',
|
| 235 |
+
in: 'header',
|
| 236 |
+
schema: { type: 'string' }
|
| 237 |
+
},
|
| 238 |
+
'X-Md-Link-Style': {
|
| 239 |
+
description: 'Link style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: inlined, referenced',
|
| 240 |
+
in: 'header',
|
| 241 |
+
schema: { type: 'string' }
|
| 242 |
+
},
|
| 243 |
+
'X-Md-Link-Reference-Style': {
|
| 244 |
+
description: 'Link reference style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: full, collapsed, shortcut',
|
| 245 |
+
in: 'header',
|
| 246 |
+
schema: { type: 'string' }
|
| 247 |
+
},
|
| 248 |
}
|
| 249 |
}
|
| 250 |
}
|
|
|
|
| 389 |
@Prop()
|
| 390 |
doNotTrack?: number | null;
|
| 391 |
|
| 392 |
+
@Prop()
|
| 393 |
+
markdown?: TurnDownTweakableOptions;
|
| 394 |
+
|
| 395 |
static override from(input: any) {
|
| 396 |
const instance = super.from(input) as CrawlerOptions;
|
| 397 |
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
|
|
|
|
| 549 |
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
| 550 |
}
|
| 551 |
|
| 552 |
+
if (ctx) {
|
| 553 |
+
instance.markdown ??= TurnDownTweakableOptions.fromCtx(ctx);
|
| 554 |
+
}
|
| 555 |
+
|
| 556 |
return instance;
|
| 557 |
}
|
| 558 |
|
src/dto/turndown-tweakable-options.ts
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { AutoCastable, Prop } from 'civkit/civ-rpc';
|
| 2 |
+
import {Context} from '../services/registry';
|
| 3 |
+
import _ from 'lodash';
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
export class TurnDownTweakableOptions extends AutoCastable {
|
| 7 |
+
@Prop({
|
| 8 |
+
desc: 'Turndown options > headingStyle',
|
| 9 |
+
type: new Set(['setext', 'atx']),
|
| 10 |
+
})
|
| 11 |
+
headingStyle?: 'setext' | 'atx';
|
| 12 |
+
|
| 13 |
+
@Prop({
|
| 14 |
+
desc: 'Turndown options > hr',
|
| 15 |
+
validate: (v: string) => v.length > 0 && v.length <= 128
|
| 16 |
+
})
|
| 17 |
+
hr?: string;
|
| 18 |
+
|
| 19 |
+
@Prop({
|
| 20 |
+
desc: 'Turndown options > bulletListMarker',
|
| 21 |
+
type: new Set(['-', '+', '*']),
|
| 22 |
+
})
|
| 23 |
+
bulletListMarker?: '-' | '+' | '*';
|
| 24 |
+
|
| 25 |
+
@Prop({
|
| 26 |
+
desc: 'Turndown options > emDelimiter',
|
| 27 |
+
type: new Set(['_', '*']),
|
| 28 |
+
})
|
| 29 |
+
emDelimiter?: '_' | '*';
|
| 30 |
+
|
| 31 |
+
@Prop({
|
| 32 |
+
desc: 'Turndown options > strongDelimiter',
|
| 33 |
+
type: new Set(['__', '**']),
|
| 34 |
+
})
|
| 35 |
+
strongDelimiter?: '__' | '**';
|
| 36 |
+
|
| 37 |
+
@Prop({
|
| 38 |
+
desc: 'Turndown options > linkStyle',
|
| 39 |
+
type: new Set(['inlined', 'referenced']),
|
| 40 |
+
})
|
| 41 |
+
linkStyle?: 'inlined' | 'referenced';
|
| 42 |
+
|
| 43 |
+
@Prop({
|
| 44 |
+
desc: 'Turndown options > linkReferenceStyle',
|
| 45 |
+
type: new Set(['full', 'collapsed', 'shortcut']),
|
| 46 |
+
})
|
| 47 |
+
linkReferenceStyle?: 'full' | 'collapsed' | 'shortcut';
|
| 48 |
+
|
| 49 |
+
static fromCtx(ctx: Context, prefix= 'x-md-') {
|
| 50 |
+
const draft: Record<string, string> = {};
|
| 51 |
+
for (const [k, v] of Object.entries(ctx.headers)) {
|
| 52 |
+
if (k.startsWith(prefix)) {
|
| 53 |
+
const prop = k.slice(prefix.length);
|
| 54 |
+
const sk = _.camelCase(prop);
|
| 55 |
+
draft[sk] = v as string;
|
| 56 |
+
}
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
return this.from(draft);
|
| 60 |
+
}
|
| 61 |
+
}
|
src/services/snapshot-formatter.ts
CHANGED
|
@@ -580,7 +580,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 580 |
customRules?: { [k: string]: Rule; };
|
| 581 |
customKeep?: Filter;
|
| 582 |
}) {
|
|
|
|
| 583 |
const turnDownService = new TurndownService({
|
|
|
|
| 584 |
codeBlockStyle: 'fenced',
|
| 585 |
preformattedCode: true,
|
| 586 |
} as any);
|
|
|
|
| 580 |
customRules?: { [k: string]: Rule; };
|
| 581 |
customKeep?: Filter;
|
| 582 |
}) {
|
| 583 |
+
const turndownOpts = this.threadLocal.get('turndownOpts');
|
| 584 |
const turnDownService = new TurndownService({
|
| 585 |
+
...turndownOpts,
|
| 586 |
codeBlockStyle: 'fenced',
|
| 587 |
preformattedCode: true,
|
| 588 |
} as any);
|