nomagick committed on
Commit
59dcc2d
·
unverified ·
1 Parent(s): ccb4b8a

feat: image retention config

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -687,6 +687,7 @@ export class CrawlerHost extends RPCHost {
687
  if (opts.timeout) {
688
  this.threadLocal.set('timeout', opts.timeout * 1000);
689
  }
 
690
 
691
  const crawlOpts: ExtraScrappingOptions = {
692
  proxyUrl: opts.proxyUrl,
 
687
  if (opts.timeout) {
688
  this.threadLocal.set('timeout', opts.timeout * 1000);
689
  }
690
+ this.threadLocal.set('retainImages', opts.retainImages);
691
 
692
  const crawlOpts: ExtraScrappingOptions = {
693
  proxyUrl: opts.proxyUrl,
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -13,6 +13,9 @@ export enum CONTENT_FORMAT {
13
 
14
  const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
15
 
 
 
 
16
  @Also({
17
  openapi: {
18
  operation: {
@@ -113,6 +116,17 @@ const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
113
  in: 'header',
114
  schema: { type: 'string' }
115
  },
 
 
 
 
 
 
 
 
 
 
 
116
  'X-With-Iframe': {
117
  description: `Enable filling iframe contents into main. (violates standards)`,
118
  in: 'header',
@@ -171,6 +185,9 @@ export class CrawlerOptions extends AutoCastable {
171
  })
172
  withGeneratedAlt!: boolean;
173
 
 
 
 
174
  @Prop({
175
  default: false,
176
  })
@@ -282,6 +299,13 @@ export class CrawlerOptions extends AutoCastable {
282
  if (withImagesSummary !== undefined) {
283
  instance.withImagesSummary = Boolean(withImagesSummary);
284
  }
 
 
 
 
 
 
 
285
  const noCache = ctx?.req.get('x-no-cache');
286
  if (noCache !== undefined) {
287
  instance.noCache = Boolean(noCache);
 
13
 
14
  const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
15
 
16
+ export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
17
+ const IMAGE_RETENTION_MODE_VALUES = new Set<string>(IMAGE_RETENTION_MODES);
18
+
19
  @Also({
20
  openapi: {
21
  operation: {
 
116
  in: 'header',
117
  schema: { type: 'string' }
118
  },
119
+ 'X-Retain-Images': {
120
+ description: `Image retention modes.\n\n` +
121
+ `Supported modes: \n` +
122
+ `- all: all images\n` +
123
+ `- none: no images\n` +
124
+ `- alt: only alt text\n` +
125
+ `- all_p: all images and with generated alt text\n` +
126
+ `- alt_p: only alt text and with generated alt\n\n`,
127
+ in: 'header',
128
+ schema: { type: 'string' }
129
+ },
130
  'X-With-Iframe': {
131
  description: `Enable filling iframe contents into main. (violates standards)`,
132
  in: 'header',
 
185
  })
186
  withGeneratedAlt!: boolean;
187
 
188
+ @Prop({ default: 'all', type: IMAGE_RETENTION_MODE_VALUES })
189
+ retainImages?: typeof IMAGE_RETENTION_MODES[number];
190
+
191
  @Prop({
192
  default: false,
193
  })
 
299
  if (withImagesSummary !== undefined) {
300
  instance.withImagesSummary = Boolean(withImagesSummary);
301
  }
302
+ const retainImages = ctx?.req.get('x-retain-images');
303
+ if (retainImages && IMAGE_RETENTION_MODE_VALUES.has(retainImages)) {
304
+ instance.retainImages = retainImages as any;
305
+ }
306
+ if (instance.withGeneratedAlt) {
307
+ instance.retainImages = 'all_p';
308
+ }
309
  const noCache = ctx?.req.get('x-no-cache');
310
  if (noCache !== undefined) {
311
  instance.noCache = Boolean(noCache);
backend/functions/src/services/snapshot-formatter.ts CHANGED
@@ -13,6 +13,7 @@ import { PDFExtractor } from './pdf-extract';
13
  import { cleanAttribute } from '../utils/misc';
14
  import _ from 'lodash';
15
  import { STATUS_CODES } from 'http';
 
16
 
17
 
18
  export interface FormattedPage {
@@ -201,7 +202,9 @@ export class SnapshotFormatter extends AsyncService {
201
  turnDownService = turnDownService.use(plugin);
202
  }
203
  const urlToAltMap: { [k: string]: string | undefined; } = {};
204
- if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
 
 
205
  const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
206
  const r = await this.altTextService.getAltText(x).catch((err: any) => {
207
  this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
@@ -215,9 +218,17 @@ export class SnapshotFormatter extends AsyncService {
215
  await Promise.all(tasks);
216
  }
217
  let imgIdx = 0;
218
- turnDownService.addRule('img-generated-alt', {
219
  filter: 'img',
220
  replacement: (_content, node: any) => {
 
 
 
 
 
 
 
 
221
  let linkPreferredSrc = (node.getAttribute('src') || '').trim();
222
  if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
223
  const dataSrc = (node.getAttribute('data-src') || '').trim();
@@ -232,7 +243,6 @@ export class SnapshotFormatter extends AsyncService {
232
  } catch (_err) {
233
  void 0;
234
  }
235
- const alt = cleanAttribute(node.getAttribute('alt'));
236
  if (!src) {
237
  return '';
238
  }
@@ -245,6 +255,10 @@ export class SnapshotFormatter extends AsyncService {
245
  if (mapped) {
246
  imageSummary[src] = mapped || alt;
247
 
 
 
 
 
248
  if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
249
  const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
250
  mappedUrl.protocol = 'blob:';
@@ -253,6 +267,8 @@ export class SnapshotFormatter extends AsyncService {
253
  }
254
 
255
  return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
 
 
256
  }
257
 
258
  imageSummary[src] = alt || '';
@@ -439,6 +455,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
439
  noRules?: boolean | string,
440
  url?: string | URL;
441
  imgDataUrlToObjectUrl?: boolean;
 
442
  }) {
443
  const turnDownService = new TurndownService({
444
  codeBlockStyle: 'fenced',
 
13
  import { cleanAttribute } from '../utils/misc';
14
  import _ from 'lodash';
15
  import { STATUS_CODES } from 'http';
16
+ import type { CrawlerOptions } from '../dto/scrapping-options';
17
 
18
 
19
  export interface FormattedPage {
 
202
  turnDownService = turnDownService.use(plugin);
203
  }
204
  const urlToAltMap: { [k: string]: string | undefined; } = {};
205
+ const imageRetention = this.threadLocal.get('retainImages') as CrawlerOptions['retainImages'];
206
+ // _p is the special suffix for withGeneratedAlt
207
+ if (snapshot.imgs?.length && imageRetention?.endsWith('_p')) {
208
  const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
209
  const r = await this.altTextService.getAltText(x).catch((err: any) => {
210
  this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
 
218
  await Promise.all(tasks);
219
  }
220
  let imgIdx = 0;
221
+ turnDownService.addRule('img-retention', {
222
  filter: 'img',
223
  replacement: (_content, node: any) => {
224
+ if (imageRetention === 'none') {
225
+ return '';
226
+ }
227
+ const alt = cleanAttribute(node.getAttribute('alt'));
228
+
229
+ if (imageRetention === 'alt') {
230
+ return alt ? `(Image ${++imgIdx}: ${alt})` : '';
231
+ }
232
  let linkPreferredSrc = (node.getAttribute('src') || '').trim();
233
  if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
234
  const dataSrc = (node.getAttribute('data-src') || '').trim();
 
243
  } catch (_err) {
244
  void 0;
245
  }
 
246
  if (!src) {
247
  return '';
248
  }
 
255
  if (mapped) {
256
  imageSummary[src] = mapped || alt;
257
 
258
+ if (imageRetention === 'alt_p') {
259
+ return `(Image ${imgIdx}: ${mapped || alt})`;
260
+ }
261
+
262
  if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
263
  const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
264
  mappedUrl.protocol = 'blob:';
 
267
  }
268
 
269
  return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
270
+ } else if (imageRetention === 'alt_p') {
271
+ return alt ? `(Image ${imgIdx}: ${alt})` : '';
272
  }
273
 
274
  imageSummary[src] = alt || '';
 
455
  noRules?: boolean | string,
456
  url?: string | URL;
457
  imgDataUrlToObjectUrl?: boolean;
458
+ removeImages?: boolean | 'src';
459
  }) {
460
  const turnDownService = new TurndownService({
461
  codeBlockStyle: 'fenced',