Spaces:
Build error
Build error
feat: image retention config
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -687,6 +687,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 687 |
if (opts.timeout) {
|
| 688 |
this.threadLocal.set('timeout', opts.timeout * 1000);
|
| 689 |
}
|
|
|
|
| 690 |
|
| 691 |
const crawlOpts: ExtraScrappingOptions = {
|
| 692 |
proxyUrl: opts.proxyUrl,
|
|
|
|
| 687 |
if (opts.timeout) {
|
| 688 |
this.threadLocal.set('timeout', opts.timeout * 1000);
|
| 689 |
}
|
| 690 |
+
this.threadLocal.set('retainImages', opts.retainImages);
|
| 691 |
|
| 692 |
const crawlOpts: ExtraScrappingOptions = {
|
| 693 |
proxyUrl: opts.proxyUrl,
|
backend/functions/src/dto/scrapping-options.ts
CHANGED
|
@@ -13,6 +13,9 @@ export enum CONTENT_FORMAT {
|
|
| 13 |
|
| 14 |
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
| 15 |
|
|
|
|
|
|
|
|
|
|
| 16 |
@Also({
|
| 17 |
openapi: {
|
| 18 |
operation: {
|
|
@@ -113,6 +116,17 @@ const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
|
| 113 |
in: 'header',
|
| 114 |
schema: { type: 'string' }
|
| 115 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
'X-With-Iframe': {
|
| 117 |
description: `Enable filling iframe contents into main. (violates standards)`,
|
| 118 |
in: 'header',
|
|
@@ -171,6 +185,9 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 171 |
})
|
| 172 |
withGeneratedAlt!: boolean;
|
| 173 |
|
|
|
|
|
|
|
|
|
|
| 174 |
@Prop({
|
| 175 |
default: false,
|
| 176 |
})
|
|
@@ -282,6 +299,13 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 282 |
if (withImagesSummary !== undefined) {
|
| 283 |
instance.withImagesSummary = Boolean(withImagesSummary);
|
| 284 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
const noCache = ctx?.req.get('x-no-cache');
|
| 286 |
if (noCache !== undefined) {
|
| 287 |
instance.noCache = Boolean(noCache);
|
|
|
|
| 13 |
|
| 14 |
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
| 15 |
|
| 16 |
+
export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
|
| 17 |
+
const IMAGE_RETENTION_MODE_VALUES = new Set<string>(IMAGE_RETENTION_MODES);
|
| 18 |
+
|
| 19 |
@Also({
|
| 20 |
openapi: {
|
| 21 |
operation: {
|
|
|
|
| 116 |
in: 'header',
|
| 117 |
schema: { type: 'string' }
|
| 118 |
},
|
| 119 |
+
'X-Retain-Images': {
|
| 120 |
+
description: `Image retention modes.\n\n` +
|
| 121 |
+
`Supported modes: \n` +
|
| 122 |
+
`- all: all images\n` +
|
| 123 |
+
`- none: no images\n` +
|
| 124 |
+
`- alt: only alt text\n` +
|
| 125 |
+
`- all_p: all images and with generated alt text\n` +
|
| 126 |
+
`- alt_p: only alt text and with generated alt\n\n`,
|
| 127 |
+
in: 'header',
|
| 128 |
+
schema: { type: 'string' }
|
| 129 |
+
},
|
| 130 |
'X-With-Iframe': {
|
| 131 |
description: `Enable filling iframe contents into main. (violates standards)`,
|
| 132 |
in: 'header',
|
|
|
|
| 185 |
})
|
| 186 |
withGeneratedAlt!: boolean;
|
| 187 |
|
| 188 |
+
@Prop({ default: 'all', type: IMAGE_RETENTION_MODE_VALUES })
|
| 189 |
+
retainImages?: typeof IMAGE_RETENTION_MODES[number];
|
| 190 |
+
|
| 191 |
@Prop({
|
| 192 |
default: false,
|
| 193 |
})
|
|
|
|
| 299 |
if (withImagesSummary !== undefined) {
|
| 300 |
instance.withImagesSummary = Boolean(withImagesSummary);
|
| 301 |
}
|
| 302 |
+
const retainImages = ctx?.req.get('x-retain-images');
|
| 303 |
+
if (retainImages && IMAGE_RETENTION_MODE_VALUES.has(retainImages)) {
|
| 304 |
+
instance.retainImages = retainImages as any;
|
| 305 |
+
}
|
| 306 |
+
if (instance.withGeneratedAlt) {
|
| 307 |
+
instance.retainImages = 'all_p';
|
| 308 |
+
}
|
| 309 |
const noCache = ctx?.req.get('x-no-cache');
|
| 310 |
if (noCache !== undefined) {
|
| 311 |
instance.noCache = Boolean(noCache);
|
backend/functions/src/services/snapshot-formatter.ts
CHANGED
|
@@ -13,6 +13,7 @@ import { PDFExtractor } from './pdf-extract';
|
|
| 13 |
import { cleanAttribute } from '../utils/misc';
|
| 14 |
import _ from 'lodash';
|
| 15 |
import { STATUS_CODES } from 'http';
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
export interface FormattedPage {
|
|
@@ -201,7 +202,9 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 201 |
turnDownService = turnDownService.use(plugin);
|
| 202 |
}
|
| 203 |
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
| 204 |
-
if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
|
|
|
|
|
|
|
| 205 |
const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
|
| 206 |
const r = await this.altTextService.getAltText(x).catch((err: any) => {
|
| 207 |
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
|
@@ -215,9 +218,17 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 215 |
await Promise.all(tasks);
|
| 216 |
}
|
| 217 |
let imgIdx = 0;
|
| 218 |
-
turnDownService.addRule('img-generated-alt', {
|
| 219 |
filter: 'img',
|
| 220 |
replacement: (_content, node: any) => {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
let linkPreferredSrc = (node.getAttribute('src') || '').trim();
|
| 222 |
if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
|
| 223 |
const dataSrc = (node.getAttribute('data-src') || '').trim();
|
|
@@ -232,7 +243,6 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 232 |
} catch (_err) {
|
| 233 |
void 0;
|
| 234 |
}
|
| 235 |
-
const alt = cleanAttribute(node.getAttribute('alt'));
|
| 236 |
if (!src) {
|
| 237 |
return '';
|
| 238 |
}
|
|
@@ -245,6 +255,10 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 245 |
if (mapped) {
|
| 246 |
imageSummary[src] = mapped || alt;
|
| 247 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
|
| 249 |
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
|
| 250 |
mappedUrl.protocol = 'blob:';
|
|
@@ -253,6 +267,8 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 253 |
}
|
| 254 |
|
| 255 |
return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
|
|
|
|
|
|
|
| 256 |
}
|
| 257 |
|
| 258 |
imageSummary[src] = alt || '';
|
|
@@ -439,6 +455,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 439 |
noRules?: boolean | string,
|
| 440 |
url?: string | URL;
|
| 441 |
imgDataUrlToObjectUrl?: boolean;
|
|
|
|
| 442 |
}) {
|
| 443 |
const turnDownService = new TurndownService({
|
| 444 |
codeBlockStyle: 'fenced',
|
|
|
|
| 13 |
import { cleanAttribute } from '../utils/misc';
|
| 14 |
import _ from 'lodash';
|
| 15 |
import { STATUS_CODES } from 'http';
|
| 16 |
+
import type { CrawlerOptions } from '../dto/scrapping-options';
|
| 17 |
|
| 18 |
|
| 19 |
export interface FormattedPage {
|
|
|
|
| 202 |
turnDownService = turnDownService.use(plugin);
|
| 203 |
}
|
| 204 |
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
| 205 |
+
const imageRetention = this.threadLocal.get('retainImages') as CrawlerOptions['retainImages'];
|
| 206 |
+
// _p is the special suffix for withGeneratedAlt
|
| 207 |
+
if (snapshot.imgs?.length && imageRetention?.endsWith('_p')) {
|
| 208 |
const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
|
| 209 |
const r = await this.altTextService.getAltText(x).catch((err: any) => {
|
| 210 |
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
|
|
|
| 218 |
await Promise.all(tasks);
|
| 219 |
}
|
| 220 |
let imgIdx = 0;
|
| 221 |
+
turnDownService.addRule('img-retention', {
|
| 222 |
filter: 'img',
|
| 223 |
replacement: (_content, node: any) => {
|
| 224 |
+
if (imageRetention === 'none') {
|
| 225 |
+
return '';
|
| 226 |
+
}
|
| 227 |
+
const alt = cleanAttribute(node.getAttribute('alt'));
|
| 228 |
+
|
| 229 |
+
if (imageRetention === 'alt') {
|
| 230 |
+
return alt ? `(Image ${++imgIdx}: ${alt})` : '';
|
| 231 |
+
}
|
| 232 |
let linkPreferredSrc = (node.getAttribute('src') || '').trim();
|
| 233 |
if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
|
| 234 |
const dataSrc = (node.getAttribute('data-src') || '').trim();
|
|
|
|
| 243 |
} catch (_err) {
|
| 244 |
void 0;
|
| 245 |
}
|
|
|
|
| 246 |
if (!src) {
|
| 247 |
return '';
|
| 248 |
}
|
|
|
|
| 255 |
if (mapped) {
|
| 256 |
imageSummary[src] = mapped || alt;
|
| 257 |
|
| 258 |
+
if (imageRetention === 'alt_p') {
|
| 259 |
+
return `(Image ${imgIdx}: ${mapped || alt})`;
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
|
| 263 |
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
|
| 264 |
mappedUrl.protocol = 'blob:';
|
|
|
|
| 267 |
}
|
| 268 |
|
| 269 |
return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
|
| 270 |
+
} else if (imageRetention === 'alt_p') {
|
| 271 |
+
return alt ? `(Image ${imgIdx}: ${alt})` : '';
|
| 272 |
}
|
| 273 |
|
| 274 |
imageSummary[src] = alt || '';
|
|
|
|
| 455 |
noRules?: boolean | string,
|
| 456 |
url?: string | URL;
|
| 457 |
imgDataUrlToObjectUrl?: boolean;
|
| 458 |
+
removeImages?: boolean | 'src';
|
| 459 |
}) {
|
| 460 |
const turnDownService = new TurndownService({
|
| 461 |
codeBlockStyle: 'fenced',
|