Spaces:
Build error
Build error
feat: keepImgDataUrl
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -29,6 +29,7 @@ const md5Hasher = new HashManager('md5', 'hex');
|
|
| 29 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 30 |
targetSelector?: string | string[];
|
| 31 |
removeSelector?: string | string[];
|
|
|
|
| 32 |
}
|
| 33 |
|
| 34 |
export interface FormattedPage {
|
|
@@ -135,6 +136,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 135 |
getTurndown(options?: {
|
| 136 |
noRules?: boolean | string,
|
| 137 |
url?: string | URL;
|
|
|
|
| 138 |
}) {
|
| 139 |
const turnDownService = new TurndownService({
|
| 140 |
codeBlockStyle: 'fenced',
|
|
@@ -154,6 +156,26 @@ export class CrawlerHost extends RPCHost {
|
|
| 154 |
replacement: (innerText) => `${innerText}\n===============\n`
|
| 155 |
});
|
| 156 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
turnDownService.addRule('improved-paragraph', {
|
| 158 |
filter: 'p',
|
| 159 |
replacement: (innerText) => {
|
|
@@ -317,6 +339,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 317 |
}
|
| 318 |
} as FormattedPage;
|
| 319 |
}
|
|
|
|
| 320 |
|
| 321 |
let contentText = '';
|
| 322 |
const imageSummary = {} as { [k: string]: string; };
|
|
@@ -328,14 +351,14 @@ export class CrawlerHost extends RPCHost {
|
|
| 328 |
}
|
| 329 |
|
| 330 |
let toBeTurnedToMd = snapshot.html;
|
| 331 |
-
let turnDownService = this.getTurndown({ url: nominalUrl });
|
| 332 |
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
| 333 |
-
const par1 = turnDownService.turndown(
|
| 334 |
const par2 = turnDownService.turndown(snapshot.parsed.content);
|
| 335 |
|
| 336 |
// If Readability did its job
|
| 337 |
if (par2.length >= 0.3 * par1.length) {
|
| 338 |
-
turnDownService = this.getTurndown({ noRules: true, url: snapshot.href });
|
| 339 |
toBeTurnedToMd = snapshot.parsed.content;
|
| 340 |
}
|
| 341 |
}
|
|
@@ -388,11 +411,25 @@ export class CrawlerHost extends RPCHost {
|
|
| 388 |
if (mapped) {
|
| 389 |
imageSummary[src] = mapped || alt;
|
| 390 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 391 |
return ``;
|
| 392 |
}
|
| 393 |
|
| 394 |
imageSummary[src] = alt || '';
|
| 395 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
return alt ? `` : ``;
|
| 397 |
}
|
| 398 |
});
|
|
@@ -402,7 +439,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 402 |
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
| 403 |
} catch (err) {
|
| 404 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 405 |
-
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href });
|
| 406 |
try {
|
| 407 |
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
|
| 408 |
} catch (err2) {
|
|
@@ -419,7 +456,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 419 |
contentText = turnDownService.turndown(snapshot.html);
|
| 420 |
} catch (err) {
|
| 421 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 422 |
-
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href });
|
| 423 |
try {
|
| 424 |
contentText = vanillaTurnDownService.turndown(snapshot.html);
|
| 425 |
} catch (err2) {
|
|
@@ -922,6 +959,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 922 |
this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt);
|
| 923 |
this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
|
| 924 |
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
|
|
|
|
| 925 |
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
|
| 926 |
this.threadLocal.set('userAgent', opts.userAgent);
|
| 927 |
if (opts.timeout) {
|
|
|
|
| 29 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 30 |
targetSelector?: string | string[];
|
| 31 |
removeSelector?: string | string[];
|
| 32 |
+
keepImgDataUrl?: boolean;
|
| 33 |
}
|
| 34 |
|
| 35 |
export interface FormattedPage {
|
|
|
|
| 136 |
getTurndown(options?: {
|
| 137 |
noRules?: boolean | string,
|
| 138 |
url?: string | URL;
|
| 139 |
+
imgDataUrlToObjectUrl?: boolean;
|
| 140 |
}) {
|
| 141 |
const turnDownService = new TurndownService({
|
| 142 |
codeBlockStyle: 'fenced',
|
|
|
|
| 156 |
replacement: (innerText) => `${innerText}\n===============\n`
|
| 157 |
});
|
| 158 |
}
|
| 159 |
+
|
| 160 |
+
if (options?.imgDataUrlToObjectUrl) {
|
| 161 |
+
turnDownService.addRule('data-url-to-pseudo-object-url', {
|
| 162 |
+
filter: (node) => Boolean(node.tagName === 'IMG' && node.getAttribute('src')?.startsWith('data:')),
|
| 163 |
+
replacement: (_content, node: any) => {
|
| 164 |
+
const src = (node.getAttribute('src') || '').trim();
|
| 165 |
+
const alt = cleanAttribute(node.getAttribute('alt')) || '';
|
| 166 |
+
|
| 167 |
+
if (options.url) {
|
| 168 |
+
const refUrl = new URL(options.url);
|
| 169 |
+
const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(src)}`);
|
| 170 |
+
|
| 171 |
+
return ``;
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
return `})`;
|
| 175 |
+
}
|
| 176 |
+
});
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
turnDownService.addRule('improved-paragraph', {
|
| 180 |
filter: 'p',
|
| 181 |
replacement: (innerText) => {
|
|
|
|
| 339 |
}
|
| 340 |
} as FormattedPage;
|
| 341 |
}
|
| 342 |
+
const imgDataUrlToObjectUrl = !Boolean(this.threadLocal.get('keepImgDataUrl'));
|
| 343 |
|
| 344 |
let contentText = '';
|
| 345 |
const imageSummary = {} as { [k: string]: string; };
|
|
|
|
| 351 |
}
|
| 352 |
|
| 353 |
let toBeTurnedToMd = snapshot.html;
|
| 354 |
+
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
|
| 355 |
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
| 356 |
+
const par1 = turnDownService.turndown(snapshot.html);
|
| 357 |
const par2 = turnDownService.turndown(snapshot.parsed.content);
|
| 358 |
|
| 359 |
// If Readability did its job
|
| 360 |
if (par2.length >= 0.3 * par1.length) {
|
| 361 |
+
turnDownService = this.getTurndown({ noRules: true, url: snapshot.href, imgDataUrlToObjectUrl });
|
| 362 |
toBeTurnedToMd = snapshot.parsed.content;
|
| 363 |
}
|
| 364 |
}
|
|
|
|
| 411 |
if (mapped) {
|
| 412 |
imageSummary[src] = mapped || alt;
|
| 413 |
|
| 414 |
+
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
|
| 415 |
+
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
|
| 416 |
+
mappedUrl.protocol = 'blob:';
|
| 417 |
+
|
| 418 |
+
return ``;
|
| 419 |
+
}
|
| 420 |
+
|
| 421 |
return ``;
|
| 422 |
}
|
| 423 |
|
| 424 |
imageSummary[src] = alt || '';
|
| 425 |
|
| 426 |
+
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
|
| 427 |
+
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
|
| 428 |
+
mappedUrl.protocol = 'blob:';
|
| 429 |
+
|
| 430 |
+
return alt ? `` : ``;
|
| 431 |
+
}
|
| 432 |
+
|
| 433 |
return alt ? `` : ``;
|
| 434 |
}
|
| 435 |
});
|
|
|
|
| 439 |
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
| 440 |
} catch (err) {
|
| 441 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 442 |
+
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
| 443 |
try {
|
| 444 |
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
|
| 445 |
} catch (err2) {
|
|
|
|
| 456 |
contentText = turnDownService.turndown(snapshot.html);
|
| 457 |
} catch (err) {
|
| 458 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 459 |
+
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
| 460 |
try {
|
| 461 |
contentText = vanillaTurnDownService.turndown(snapshot.html);
|
| 462 |
} catch (err2) {
|
|
|
|
| 959 |
this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt);
|
| 960 |
this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
|
| 961 |
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
|
| 962 |
+
this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl);
|
| 963 |
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
|
| 964 |
this.threadLocal.set('userAgent', opts.userAgent);
|
| 965 |
if (opts.timeout) {
|
backend/functions/src/dto/scrapping-options.ts
CHANGED
|
@@ -60,6 +60,13 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
|
| 60 |
in: 'header',
|
| 61 |
schema: { type: 'string' }
|
| 62 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
'X-Proxy-Url': {
|
| 64 |
description: `Specifies your custom proxy if you prefer to use one.\n\n` +
|
| 65 |
`Supported protocols: \n` +
|
|
@@ -146,6 +153,11 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 146 |
@Prop({ arrayOf: String })
|
| 147 |
removeSelector?: string | string[];
|
| 148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
@Prop({
|
| 150 |
arrayOf: String,
|
| 151 |
})
|
|
@@ -212,6 +224,11 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 212 |
const overrideUserAgent = ctx?.req.get('x-user-agent');
|
| 213 |
instance.userAgent ??= overrideUserAgent;
|
| 214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
const cookies: CookieParam[] = [];
|
| 216 |
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
|
| 217 |
if (Array.isArray(setCookieHeaders)) {
|
|
|
|
| 60 |
in: 'header',
|
| 61 |
schema: { type: 'string' }
|
| 62 |
},
|
| 63 |
+
'X-Keep-Img-Data-Url': {
|
| 64 |
+
description: `Keep data-url as it instead of transforming them to object-url. (Only applicable when targeting markdown format)\n\n` +
|
| 65 |
+
'Example `X-Keep-Img-Data-Url: true`'
|
| 66 |
+
,
|
| 67 |
+
in: 'header',
|
| 68 |
+
schema: { type: 'string' }
|
| 69 |
+
},
|
| 70 |
'X-Proxy-Url': {
|
| 71 |
description: `Specifies your custom proxy if you prefer to use one.\n\n` +
|
| 72 |
`Supported protocols: \n` +
|
|
|
|
| 153 |
@Prop({ arrayOf: String })
|
| 154 |
removeSelector?: string | string[];
|
| 155 |
|
| 156 |
+
@Prop({
|
| 157 |
+
default: false,
|
| 158 |
+
})
|
| 159 |
+
keepImgDataUrl!: boolean;
|
| 160 |
+
|
| 161 |
@Prop({
|
| 162 |
arrayOf: String,
|
| 163 |
})
|
|
|
|
| 224 |
const overrideUserAgent = ctx?.req.get('x-user-agent');
|
| 225 |
instance.userAgent ??= overrideUserAgent;
|
| 226 |
|
| 227 |
+
const keepImgDataUrl = ctx?.req.get('x-keep-img-data-url');
|
| 228 |
+
if (keepImgDataUrl !== undefined) {
|
| 229 |
+
instance.keepImgDataUrl = Boolean(keepImgDataUrl);
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
const cookies: CookieParam[] = [];
|
| 233 |
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
|
| 234 |
if (Array.isArray(setCookieHeaders)) {
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit e7216f6ed7aaee80068ffabce78a37ce66b9c50e
|