nomagick committed on
Commit
6b9e14d
·
unverified ·
1 Parent(s): 2720b69

feat: md options pass through to turndown

Browse files
package-lock.json CHANGED
@@ -17,7 +17,7 @@
17
  "axios": "^1.3.3",
18
  "bcrypt": "^5.1.0",
19
  "busboy": "^1.6.0",
20
- "civkit": "^0.8.4-9a05d29",
21
  "core-js": "^3.37.1",
22
  "cors": "^2.8.5",
23
  "dayjs": "^1.11.9",
@@ -4095,9 +4095,9 @@
4095
  }
4096
  },
4097
  "node_modules/civkit": {
4098
- "version": "0.8.4-9a05d29",
4099
- "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-9a05d29.tgz",
4100
- "integrity": "sha512-NqK2lDSrtVGVLrASGuD6khlS1mDV2Ey/HNufB+Q0loxAf9NGFtkLgoB6WdGuSmA3EQnZFQ5nX3EQLbX5IiLTjQ==",
4101
  "license": "AGPL",
4102
  "dependencies": {
4103
  "lodash": "^4.17.21",
 
17
  "axios": "^1.3.3",
18
  "bcrypt": "^5.1.0",
19
  "busboy": "^1.6.0",
20
+ "civkit": "^0.8.4-5f839a7",
21
  "core-js": "^3.37.1",
22
  "cors": "^2.8.5",
23
  "dayjs": "^1.11.9",
 
4095
  }
4096
  },
4097
  "node_modules/civkit": {
4098
+ "version": "0.8.4-5f839a7",
4099
+ "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-5f839a7.tgz",
4100
+ "integrity": "sha512-wF9Sm0dKBNGTXtueYtmwqreciilEw2+H3uAZgJNK/B+MoeQecvQ1alrqPqIP/Xf64H1ik6mD0Z47cez8jkayGA==",
4101
  "license": "AGPL",
4102
  "dependencies": {
4103
  "lodash": "^4.17.21",
package.json CHANGED
@@ -25,7 +25,7 @@
25
  "axios": "^1.3.3",
26
  "bcrypt": "^5.1.0",
27
  "busboy": "^1.6.0",
28
- "civkit": "^0.8.4-9a05d29",
29
  "core-js": "^3.37.1",
30
  "cors": "^2.8.5",
31
  "dayjs": "^1.11.9",
 
25
  "axios": "^1.3.3",
26
  "bcrypt": "^5.1.0",
27
  "busboy": "^1.6.0",
28
+ "civkit": "^0.8.4-5f839a7",
29
  "core-js": "^3.37.1",
30
  "cors": "^2.8.5",
31
  "dayjs": "^1.11.9",
src/api/crawler.ts CHANGED
@@ -940,6 +940,9 @@ export class CrawlerHost extends RPCHost {
940
  this.threadLocal.set('retainImages', opts.retainImages);
941
  this.threadLocal.set('noGfm', opts.noGfm);
942
  this.threadLocal.set('DNT', Boolean(opts.doNotTrack));
 
 
 
943
 
944
  const crawlOpts: ExtraScrappingOptions = {
945
  proxyUrl: opts.proxyUrl,
 
940
  this.threadLocal.set('retainImages', opts.retainImages);
941
  this.threadLocal.set('noGfm', opts.noGfm);
942
  this.threadLocal.set('DNT', Boolean(opts.doNotTrack));
943
+ if (opts.markdown) {
944
+ this.threadLocal.set('turndownOpts', opts.markdown);
945
+ }
946
 
947
  const crawlOpts: ExtraScrappingOptions = {
948
  proxyUrl: opts.proxyUrl,
src/dto/crawler-options.ts CHANGED
@@ -1,6 +1,7 @@
1
  import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
2
  import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
3
  import { Context } from '../services/registry';
 
4
 
5
  export enum CONTENT_FORMAT {
6
  CONTENT = 'content',
@@ -209,6 +210,41 @@ class Viewport extends AutoCastable {
209
  in: 'header',
210
  schema: { type: 'string' }
211
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  }
213
  }
214
  }
@@ -353,6 +389,9 @@ export class CrawlerOptions extends AutoCastable {
353
  @Prop()
354
  doNotTrack?: number | null;
355
 
 
 
 
356
  static override from(input: any) {
357
  const instance = super.from(input) as CrawlerOptions;
358
  const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
@@ -510,6 +549,10 @@ export class CrawlerOptions extends AutoCastable {
510
  instance.cacheTolerance = instance.cacheTolerance * 1000;
511
  }
512
 
 
 
 
 
513
  return instance;
514
  }
515
 
 
1
  import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
2
  import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
3
  import { Context } from '../services/registry';
4
+ import { TurnDownTweakableOptions } from './turndown-tweakable-options';
5
 
6
  export enum CONTENT_FORMAT {
7
  CONTENT = 'content',
 
210
  in: 'header',
211
  schema: { type: 'string' }
212
  },
213
+ 'X-Md-Heading-Style': {
214
+ description: 'Heading style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: setext, atx',
215
+ in: 'header',
216
+ schema: { type: 'string' }
217
+ },
218
+ 'X-Md-Hr': {
219
+ description: 'Hr text of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).',
220
+ in: 'header',
221
+ schema: { type: 'string' }
222
+ },
223
+ 'X-Md-Bullet-List-Marker': {
224
+ description: 'Bullet list marker of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: -, +, *',
225
+ in: 'header',
226
+ schema: { type: 'string' }
227
+ },
228
+ 'X-Md-Em-Delimiter': {
229
+ description: 'Em delimiter of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: _, *',
230
+ in: 'header',
231
+ schema: { type: 'string' }
232
+ },
233
+ 'X-Md-Strong-Delimiter': {
234
+ description: 'Strong delimiter of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: **, __',
235
+ in: 'header',
236
+ schema: { type: 'string' }
237
+ },
238
+ 'X-Md-Link-Style': {
239
+ description: 'Link style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: inlined, referenced',
240
+ in: 'header',
241
+ schema: { type: 'string' }
242
+ },
243
+ 'X-Md-Link-Reference-Style': {
244
+ description: 'Link reference style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: full, collapsed, shortcut',
245
+ in: 'header',
246
+ schema: { type: 'string' }
247
+ },
248
  }
249
  }
250
  }
 
389
  @Prop()
390
  doNotTrack?: number | null;
391
 
392
+ @Prop()
393
+ markdown?: TurnDownTweakableOptions;
394
+
395
  static override from(input: any) {
396
  const instance = super.from(input) as CrawlerOptions;
397
  const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
 
549
  instance.cacheTolerance = instance.cacheTolerance * 1000;
550
  }
551
 
552
+ if (ctx) {
553
+ instance.markdown ??= TurnDownTweakableOptions.fromCtx(ctx);
554
+ }
555
+
556
  return instance;
557
  }
558
 
src/dto/turndown-tweakable-options.ts ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { AutoCastable, Prop } from 'civkit/civ-rpc';
2
+ import {Context} from '../services/registry';
3
+ import _ from 'lodash';
4
+
5
+
6
+ export class TurnDownTweakableOptions extends AutoCastable {
7
+ @Prop({
8
+ desc: 'Turndown options > headingStyle',
9
+ type: new Set(['setext', 'atx']),
10
+ })
11
+ headingStyle?: 'setext' | 'atx';
12
+
13
+ @Prop({
14
+ desc: 'Turndown options > hr',
15
+ validate: (v: string) => v.length > 0 && v.length <= 128
16
+ })
17
+ hr?: string;
18
+
19
+ @Prop({
20
+ desc: 'Turndown options > bulletListMarker',
21
+ type: new Set(['-', '+', '*']),
22
+ })
23
+ bulletListMarker?: '-' | '+' | '*';
24
+
25
+ @Prop({
26
+ desc: 'Turndown options > emDelimiter',
27
+ type: new Set(['_', '*']),
28
+ })
29
+ emDelimiter?: '_' | '*';
30
+
31
+ @Prop({
32
+ desc: 'Turndown options > strongDelimiter',
33
+ type: new Set(['__', '**']),
34
+ })
35
+ strongDelimiter?: '__' | '**';
36
+
37
+ @Prop({
38
+ desc: 'Turndown options > linkStyle',
39
+ type: new Set(['inlined', 'referenced']),
40
+ })
41
+ linkStyle?: 'inlined' | 'referenced';
42
+
43
+ @Prop({
44
+ desc: 'Turndown options > linkReferenceStyle',
45
+ type: new Set(['full', 'collapsed', 'shortcut']),
46
+ })
47
+ linkReferenceStyle?: 'full' | 'collapsed' | 'shortcut';
48
+
49
+ static fromCtx(ctx: Context, prefix= 'x-md-') {
50
+ const draft: Record<string, string> = {};
51
+ for (const [k, v] of Object.entries(ctx.headers)) {
52
+ if (k.startsWith(prefix)) {
53
+ const prop = k.slice(prefix.length);
54
+ const sk = _.camelCase(prop);
55
+ draft[sk] = v as string;
56
+ }
57
+ }
58
+
59
+ return this.from(draft);
60
+ }
61
+ }
src/services/snapshot-formatter.ts CHANGED
@@ -580,7 +580,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
580
  customRules?: { [k: string]: Rule; };
581
  customKeep?: Filter;
582
  }) {
 
583
  const turnDownService = new TurndownService({
 
584
  codeBlockStyle: 'fenced',
585
  preformattedCode: true,
586
  } as any);
 
580
  customRules?: { [k: string]: Rule; };
581
  customKeep?: Filter;
582
  }) {
583
+ const turndownOpts = this.threadLocal.get('turndownOpts');
584
  const turnDownService = new TurndownService({
585
+ ...turndownOpts,
586
  codeBlockStyle: 'fenced',
587
  preformattedCode: true,
588
  } as any);