nomagick committed on
Commit
2b29679
·
unverified ·
1 Parent(s): 4400bef

fix: img turndown rules

Browse files
backend/functions/src/services/snapshot-formatter.ts CHANGED
@@ -1,7 +1,7 @@
1
  import { randomUUID } from 'crypto';
2
  import { container, singleton } from 'tsyringe';
3
  import { AsyncService, HashManager, marshalErrorLike } from 'civkit';
4
- import TurndownService from 'turndown';
5
  import { Logger } from '../shared/services/logger';
6
  import { PageSnapshot } from './puppeteer';
7
  import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
@@ -181,9 +181,84 @@ export class SnapshotFormatter extends AsyncService {
181
  break;
182
  }
183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
185
  let toBeTurnedToMd = jsDomElementOfHTML;
186
- let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
187
  if (!mode.includes('markdown') && snapshot.parsed?.content) {
188
  const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
189
  const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
@@ -191,7 +266,7 @@ export class SnapshotFormatter extends AsyncService {
191
 
192
  // If Readability did its job
193
  if (par2.length >= 0.3 * par1.length) {
194
- turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
195
  if (snapshot.parsed.content) {
196
  toBeTurnedToMd = jsDomElementOfParsed;
197
  }
@@ -201,8 +276,7 @@ export class SnapshotFormatter extends AsyncService {
201
  for (const plugin of this.turnDownPlugins) {
202
  turnDownService = turnDownService.use(plugin);
203
  }
204
- const urlToAltMap: { [k: string]: string | undefined; } = {};
205
- const imageRetention = this.threadLocal.get('retainImages') as CrawlerOptions['retainImages'];
206
  // _p is the special suffix for withGeneratedAlt
207
  if (snapshot.imgs?.length && imageRetention?.endsWith('_p')) {
208
  const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
@@ -217,83 +291,13 @@ export class SnapshotFormatter extends AsyncService {
217
 
218
  await Promise.all(tasks);
219
  }
220
- let imgIdx = 0;
221
- turnDownService.addRule('img-retention', {
222
- filter: 'img',
223
- replacement: (_content, node: any) => {
224
- if (imageRetention === 'none') {
225
- return '';
226
- }
227
- const alt = cleanAttribute(node.getAttribute('alt'));
228
-
229
- if (imageRetention === 'alt') {
230
- return alt ? `(Image ${++imgIdx}: ${alt})` : '';
231
- }
232
- let linkPreferredSrc = (node.getAttribute('src') || '').trim();
233
- const maybeSrcSet: string = (node.getAttribute('srcset') || '').trim();
234
- if (!linkPreferredSrc && maybeSrcSet) {
235
- linkPreferredSrc = maybeSrcSet.split(',').map((x) => x.trim()).filter(Boolean)[0];
236
- }
237
- if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
238
- const dataSrc = (node.getAttribute('data-src') || '').trim();
239
- if (dataSrc && !dataSrc.startsWith('data:')) {
240
- linkPreferredSrc = dataSrc;
241
- }
242
- }
243
-
244
- let src;
245
- try {
246
- src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString();
247
- } catch (_err) {
248
- void 0;
249
- }
250
- if (!src) {
251
- return '';
252
- }
253
- const mapped = urlToAltMap[src];
254
- const imgSerial = ++imgIdx;
255
- const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
256
- idxArr.push(imgSerial);
257
- imageIdxTrack.set(src, idxArr);
258
-
259
- if (mapped) {
260
- imageSummary[src] = mapped || alt;
261
-
262
- if (imageRetention === 'alt_p') {
263
- return `(Image ${imgIdx}: ${mapped || alt})`;
264
- }
265
-
266
- if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
267
- const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
268
- mappedUrl.protocol = 'blob:';
269
-
270
- return `![Image ${imgIdx}: ${mapped || alt}](${mappedUrl})`;
271
- }
272
-
273
- return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
274
- } else if (imageRetention === 'alt_p') {
275
- return alt ? `(Image ${imgIdx}: ${alt})` : '';
276
- }
277
-
278
- imageSummary[src] = alt || '';
279
-
280
- if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
281
- const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
282
- mappedUrl.protocol = 'blob:';
283
-
284
- return alt ? `![Image ${imgIdx}: ${alt}](${mappedUrl})` : `![Image ${imgIdx}](${mappedUrl})`;
285
- }
286
-
287
- return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`;
288
- }
289
- });
290
 
291
  if (toBeTurnedToMd) {
292
  try {
293
  contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
294
  } catch (err) {
295
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
296
- const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
297
  try {
298
  contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
299
  } catch (err2) {
@@ -311,7 +315,7 @@ export class SnapshotFormatter extends AsyncService {
311
  contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML).trim();
312
  } catch (err) {
313
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
314
- const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
315
  try {
316
  contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML).trim();
317
  } catch (err2) {
@@ -460,6 +464,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
460
  url?: string | URL;
461
  imgDataUrlToObjectUrl?: boolean;
462
  removeImages?: boolean | 'src';
 
463
  }) {
464
  const turnDownService = new TurndownService({
465
  codeBlockStyle: 'fenced',
@@ -499,6 +504,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
499
  });
500
  }
501
 
 
 
 
 
 
 
502
  turnDownService.addRule('improved-paragraph', {
503
  filter: 'p',
504
  replacement: (innerText) => {
@@ -561,12 +572,6 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
561
  return delimiter + extraSpace + content + (delimiter === '```' && !content.endsWith(extraSpace) ? extraSpace : '') + delimiter;
562
  }
563
  });
564
- turnDownService.addRule('picture', {
565
- filter: 'picture',
566
- replacement: (content, _node) => {
567
- return content;
568
- }
569
- });
570
 
571
  return turnDownService;
572
  }
 
1
  import { randomUUID } from 'crypto';
2
  import { container, singleton } from 'tsyringe';
3
  import { AsyncService, HashManager, marshalErrorLike } from 'civkit';
4
+ import TurndownService, { Rule } from 'turndown';
5
  import { Logger } from '../shared/services/logger';
6
  import { PageSnapshot } from './puppeteer';
7
  import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
 
181
  break;
182
  }
183
 
184
+ const urlToAltMap: { [k: string]: string | undefined; } = {};
185
+ const imageRetention = this.threadLocal.get('retainImages') as CrawlerOptions['retainImages'];
186
+ let imgIdx = 0;
187
+ const customRules = {
188
+ 'img-retention': {
189
+ filter: 'img',
190
+ replacement: (_content: string, node: HTMLElement) => {
191
+ if (imageRetention === 'none') {
192
+ return '';
193
+ }
194
+ const alt = cleanAttribute(node.getAttribute('alt'));
195
+
196
+ if (imageRetention === 'alt') {
197
+ return alt ? `(Image ${++imgIdx}: ${alt})` : '';
198
+ }
199
+ let linkPreferredSrc = (node.getAttribute('src') || '').trim();
200
+ const maybeSrcSet: string = (node.getAttribute('srcset') || '').trim();
201
+ if (!linkPreferredSrc && maybeSrcSet) {
202
+ linkPreferredSrc = maybeSrcSet.split(',').map((x) => x.trim()).filter(Boolean)[0];
203
+ }
204
+ if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
205
+ const dataSrc = (node.getAttribute('data-src') || '').trim();
206
+ if (dataSrc && !dataSrc.startsWith('data:')) {
207
+ linkPreferredSrc = dataSrc;
208
+ }
209
+ }
210
+
211
+ let src;
212
+ try {
213
+ src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString();
214
+ } catch (_err) {
215
+ void 0;
216
+ }
217
+ if (!src) {
218
+ return '';
219
+ }
220
+ const mapped = urlToAltMap[src];
221
+ const imgSerial = ++imgIdx;
222
+ const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
223
+ idxArr.push(imgSerial);
224
+ imageIdxTrack.set(src, idxArr);
225
+
226
+ if (mapped) {
227
+ imageSummary[src] = mapped || alt;
228
+
229
+ if (imageRetention === 'alt_p') {
230
+ return `(Image ${imgIdx}: ${mapped || alt})`;
231
+ }
232
+
233
+ if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
234
+ const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
235
+ mappedUrl.protocol = 'blob:';
236
+
237
+ return `![Image ${imgIdx}: ${mapped || alt}](${mappedUrl})`;
238
+ }
239
+
240
+ return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
241
+ } else if (imageRetention === 'alt_p') {
242
+ return alt ? `(Image ${imgIdx}: ${alt})` : '';
243
+ }
244
+
245
+ imageSummary[src] = alt || '';
246
+
247
+ if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
248
+ const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
249
+ mappedUrl.protocol = 'blob:';
250
+
251
+ return alt ? `![Image ${imgIdx}: ${alt}](${mappedUrl})` : `![Image ${imgIdx}](${mappedUrl})`;
252
+ }
253
+
254
+ return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`;
255
+ }
256
+ } as Rule
257
+ };
258
+
259
  const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
260
  let toBeTurnedToMd = jsDomElementOfHTML;
261
+ let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl, customRules });
262
  if (!mode.includes('markdown') && snapshot.parsed?.content) {
263
  const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
264
  const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
 
266
 
267
  // If Readability did its job
268
  if (par2.length >= 0.3 * par1.length) {
269
+ turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl, customRules });
270
  if (snapshot.parsed.content) {
271
  toBeTurnedToMd = jsDomElementOfParsed;
272
  }
 
276
  for (const plugin of this.turnDownPlugins) {
277
  turnDownService = turnDownService.use(plugin);
278
  }
279
+
 
280
  // _p is the special suffix for withGeneratedAlt
281
  if (snapshot.imgs?.length && imageRetention?.endsWith('_p')) {
282
  const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
 
291
 
292
  await Promise.all(tasks);
293
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
  if (toBeTurnedToMd) {
296
  try {
297
  contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
298
  } catch (err) {
299
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
300
+ const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl, customRules });
301
  try {
302
  contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
303
  } catch (err2) {
 
315
  contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML).trim();
316
  } catch (err) {
317
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
318
+ const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl, customRules });
319
  try {
320
  contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML).trim();
321
  } catch (err2) {
 
464
  url?: string | URL;
465
  imgDataUrlToObjectUrl?: boolean;
466
  removeImages?: boolean | 'src';
467
+ customRules?: { [k: string]: Rule; };
468
  }) {
469
  const turnDownService = new TurndownService({
470
  codeBlockStyle: 'fenced',
 
504
  });
505
  }
506
 
507
+ if (options?.customRules) {
508
+ for (const [k, v] of Object.entries(options.customRules)) {
509
+ turnDownService.addRule(k, v);
510
+ }
511
+ }
512
+
513
  turnDownService.addRule('improved-paragraph', {
514
  filter: 'p',
515
  replacement: (innerText) => {
 
572
  return delimiter + extraSpace + content + (delimiter === '```' && !content.endsWith(extraSpace) ? extraSpace : '') + delimiter;
573
  }
574
  });
 
 
 
 
 
 
575
 
576
  return turnDownService;
577
  }
backend/functions/src/utils/misc.ts CHANGED
@@ -1,3 +1,3 @@
1
- export function cleanAttribute(attribute: string) {
2
  return attribute ? attribute.replace(/(\n+\s*)+/g, '\n') : '';
3
  }
 
1
+ export function cleanAttribute(attribute: string | null) {
2
  return attribute ? attribute.replace(/(\n+\s*)+/g, '\n') : '';
3
  }
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit ce986269f9c5cf8dbd34448b007d1b6bd948bb17
 
1
+ Subproject commit 296fe56d235c08978eda384d8fcddbacdd6f7863