nomagick committed on
Commit
62fb6cf
·
unverified ·
1 Parent(s): 1084b16

feat: keepImgDataUrl

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -29,6 +29,7 @@ const md5Hasher = new HashManager('md5', 'hex');
29
  export interface ExtraScrappingOptions extends ScrappingOptions {
30
  targetSelector?: string | string[];
31
  removeSelector?: string | string[];
 
32
  }
33
 
34
  export interface FormattedPage {
@@ -135,6 +136,7 @@ export class CrawlerHost extends RPCHost {
135
  getTurndown(options?: {
136
  noRules?: boolean | string,
137
  url?: string | URL;
 
138
  }) {
139
  const turnDownService = new TurndownService({
140
  codeBlockStyle: 'fenced',
@@ -154,6 +156,26 @@ export class CrawlerHost extends RPCHost {
154
  replacement: (innerText) => `${innerText}\n===============\n`
155
  });
156
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  turnDownService.addRule('improved-paragraph', {
158
  filter: 'p',
159
  replacement: (innerText) => {
@@ -317,6 +339,7 @@ export class CrawlerHost extends RPCHost {
317
  }
318
  } as FormattedPage;
319
  }
 
320
 
321
  let contentText = '';
322
  const imageSummary = {} as { [k: string]: string; };
@@ -328,14 +351,14 @@ export class CrawlerHost extends RPCHost {
328
  }
329
 
330
  let toBeTurnedToMd = snapshot.html;
331
- let turnDownService = this.getTurndown({ url: nominalUrl });
332
  if (mode !== 'markdown' && snapshot.parsed?.content) {
333
- const par1 = turnDownService.turndown(toBeTurnedToMd);
334
  const par2 = turnDownService.turndown(snapshot.parsed.content);
335
 
336
  // If Readability did its job
337
  if (par2.length >= 0.3 * par1.length) {
338
- turnDownService = this.getTurndown({ noRules: true, url: snapshot.href });
339
  toBeTurnedToMd = snapshot.parsed.content;
340
  }
341
  }
@@ -388,11 +411,25 @@ export class CrawlerHost extends RPCHost {
388
  if (mapped) {
389
  imageSummary[src] = mapped || alt;
390
 
 
 
 
 
 
 
 
391
  return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
392
  }
393
 
394
  imageSummary[src] = alt || '';
395
 
 
 
 
 
 
 
 
396
  return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`;
397
  }
398
  });
@@ -402,7 +439,7 @@ export class CrawlerHost extends RPCHost {
402
  contentText = turnDownService.turndown(toBeTurnedToMd).trim();
403
  } catch (err) {
404
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
405
- const vanillaTurnDownService = this.getTurndown({ url: snapshot.href });
406
  try {
407
  contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
408
  } catch (err2) {
@@ -419,7 +456,7 @@ export class CrawlerHost extends RPCHost {
419
  contentText = turnDownService.turndown(snapshot.html);
420
  } catch (err) {
421
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
422
- const vanillaTurnDownService = this.getTurndown({ url: snapshot.href });
423
  try {
424
  contentText = vanillaTurnDownService.turndown(snapshot.html);
425
  } catch (err2) {
@@ -922,6 +959,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
922
  this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt);
923
  this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
924
  this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
 
925
  this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
926
  this.threadLocal.set('userAgent', opts.userAgent);
927
  if (opts.timeout) {
 
29
  export interface ExtraScrappingOptions extends ScrappingOptions {
30
  targetSelector?: string | string[];
31
  removeSelector?: string | string[];
32
+ keepImgDataUrl?: boolean;
33
  }
34
 
35
  export interface FormattedPage {
 
136
  getTurndown(options?: {
137
  noRules?: boolean | string,
138
  url?: string | URL;
139
+ imgDataUrlToObjectUrl?: boolean;
140
  }) {
141
  const turnDownService = new TurndownService({
142
  codeBlockStyle: 'fenced',
 
156
  replacement: (innerText) => `${innerText}\n===============\n`
157
  });
158
  }
159
+
160
+ if (options?.imgDataUrlToObjectUrl) {
161
+ turnDownService.addRule('data-url-to-pseudo-object-url', {
162
+ filter: (node) => Boolean(node.tagName === 'IMG' && node.getAttribute('src')?.startsWith('data:')),
163
+ replacement: (_content, node: any) => {
164
+ const src = (node.getAttribute('src') || '').trim();
165
+ const alt = cleanAttribute(node.getAttribute('alt')) || '';
166
+
167
+ if (options.url) {
168
+ const refUrl = new URL(options.url);
169
+ const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(src)}`);
170
+
171
+ return `![${alt}](${mappedUrl})`;
172
+ }
173
+
174
+ return `![${alt}](blob:${md5Hasher.hash(src)})`;
175
+ }
176
+ });
177
+ }
178
+
179
  turnDownService.addRule('improved-paragraph', {
180
  filter: 'p',
181
  replacement: (innerText) => {
 
339
  }
340
  } as FormattedPage;
341
  }
342
+ const imgDataUrlToObjectUrl = !Boolean(this.threadLocal.get('keepImgDataUrl'));
343
 
344
  let contentText = '';
345
  const imageSummary = {} as { [k: string]: string; };
 
351
  }
352
 
353
  let toBeTurnedToMd = snapshot.html;
354
+ let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
355
  if (mode !== 'markdown' && snapshot.parsed?.content) {
356
+ const par1 = turnDownService.turndown(snapshot.html);
357
  const par2 = turnDownService.turndown(snapshot.parsed.content);
358
 
359
  // If Readability did its job
360
  if (par2.length >= 0.3 * par1.length) {
361
+ turnDownService = this.getTurndown({ noRules: true, url: snapshot.href, imgDataUrlToObjectUrl });
362
  toBeTurnedToMd = snapshot.parsed.content;
363
  }
364
  }
 
411
  if (mapped) {
412
  imageSummary[src] = mapped || alt;
413
 
414
+ if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
415
+ const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
416
+ mappedUrl.protocol = 'blob:';
417
+
418
+ return `![Image ${imgIdx}: ${mapped || alt}](${mappedUrl})`;
419
+ }
420
+
421
  return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
422
  }
423
 
424
  imageSummary[src] = alt || '';
425
 
426
+ if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
427
+ const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
428
+ mappedUrl.protocol = 'blob:';
429
+
430
+ return alt ? `![Image ${imgIdx}: ${alt}](${mappedUrl})` : `![Image ${imgIdx}](${mappedUrl})`;
431
+ }
432
+
433
  return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`;
434
  }
435
  });
 
439
  contentText = turnDownService.turndown(toBeTurnedToMd).trim();
440
  } catch (err) {
441
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
442
+ const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
443
  try {
444
  contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
445
  } catch (err2) {
 
456
  contentText = turnDownService.turndown(snapshot.html);
457
  } catch (err) {
458
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
459
+ const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
460
  try {
461
  contentText = vanillaTurnDownService.turndown(snapshot.html);
462
  } catch (err2) {
 
959
  this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt);
960
  this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
961
  this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
962
+ this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl);
963
  this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
964
  this.threadLocal.set('userAgent', opts.userAgent);
965
  if (opts.timeout) {
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -60,6 +60,13 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
60
  in: 'header',
61
  schema: { type: 'string' }
62
  },
 
 
 
 
 
 
 
63
  'X-Proxy-Url': {
64
  description: `Specifies your custom proxy if you prefer to use one.\n\n` +
65
  `Supported protocols: \n` +
@@ -146,6 +153,11 @@ export class CrawlerOptions extends AutoCastable {
146
  @Prop({ arrayOf: String })
147
  removeSelector?: string | string[];
148
 
 
 
 
 
 
149
  @Prop({
150
  arrayOf: String,
151
  })
@@ -212,6 +224,11 @@ export class CrawlerOptions extends AutoCastable {
212
  const overrideUserAgent = ctx?.req.get('x-user-agent');
213
  instance.userAgent ??= overrideUserAgent;
214
 
 
 
 
 
 
215
  const cookies: CookieParam[] = [];
216
  const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
217
  if (Array.isArray(setCookieHeaders)) {
 
60
  in: 'header',
61
  schema: { type: 'string' }
62
  },
63
+ 'X-Keep-Img-Data-Url': {
64
+ description: `Keep data-url as it instead of transforming them to object-url. (Only applicable when targeting markdown format)\n\n` +
65
+ 'Example `X-Keep-Img-Data-Url: true`'
66
+ ,
67
+ in: 'header',
68
+ schema: { type: 'string' }
69
+ },
70
  'X-Proxy-Url': {
71
  description: `Specifies your custom proxy if you prefer to use one.\n\n` +
72
  `Supported protocols: \n` +
 
153
  @Prop({ arrayOf: String })
154
  removeSelector?: string | string[];
155
 
156
+ @Prop({
157
+ default: false,
158
+ })
159
+ keepImgDataUrl!: boolean;
160
+
161
  @Prop({
162
  arrayOf: String,
163
  })
 
224
  const overrideUserAgent = ctx?.req.get('x-user-agent');
225
  instance.userAgent ??= overrideUserAgent;
226
 
227
+ const keepImgDataUrl = ctx?.req.get('x-keep-img-data-url');
228
+ if (keepImgDataUrl !== undefined) {
229
+ instance.keepImgDataUrl = Boolean(keepImgDataUrl);
230
+ }
231
+
232
  const cookies: CookieParam[] = [];
233
  const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
234
  if (Array.isArray(setCookieHeaders)) {
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 38177e1e3814970613ce6d8fe3e3cf0030d92066
 
1
+ Subproject commit e7216f6ed7aaee80068ffabce78a37ce66b9c50e