nomagick committed on
Commit
3bb7315
·
unverified ·
1 Parent(s): 2a30fce

fix: generated alt

Browse files
package-lock.json CHANGED
@@ -17,7 +17,7 @@
17
  "axios": "^1.3.3",
18
  "bcrypt": "^5.1.0",
19
  "busboy": "^1.6.0",
20
- "civkit": "^0.9.0-f7b0ca7",
21
  "core-js": "^3.37.1",
22
  "cors": "^2.8.5",
23
  "dayjs": "^1.11.9",
@@ -26,6 +26,7 @@
26
  "firebase-functions": "^6.1.1",
27
  "htmlparser2": "^9.0.0",
28
  "jose": "^5.1.0",
 
29
  "langdetect": "^0.2.1",
30
  "linkedom": "^0.18.4",
31
  "lru-cache": "^11.0.2",
@@ -41,6 +42,7 @@
41
  "set-cookie-parser": "^2.6.0",
42
  "simple-zstd": "^1.4.2",
43
  "stripe": "^11.11.0",
 
44
  "tiktoken": "^1.0.16",
45
  "tld-extract": "^2.1.0",
46
  "turndown": "^7.1.3",
@@ -62,7 +64,6 @@
62
  "eslint-config-google": "^0.14.0",
63
  "eslint-plugin-import": "^2.25.4",
64
  "firebase-functions-test": "^3.0.0",
65
- "koa": "^2.16.0",
66
  "pino-pretty": "^13.0.0",
67
  "replicate": "^0.16.1",
68
  "typescript": "^5.5.4"
@@ -4002,9 +4003,9 @@
4002
  }
4003
  },
4004
  "node_modules/civkit": {
4005
- "version": "0.9.0-f7b0ca7",
4006
- "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.9.0-f7b0ca7.tgz",
4007
- "integrity": "sha512-WjF0zRY83Ewvx4fGs1O0PQD2Oyc/RlKCVGiO/LHdwEFwfldTqDE3XWdWv+brZ2GvsIsVVKVa+bEGP0SwJfrRXA==",
4008
  "license": "AGPL",
4009
  "dependencies": {
4010
  "lodash": "^4.17.21",
@@ -4022,7 +4023,7 @@
4022
  "iconv-lite": "^0.6.3",
4023
  "js-yaml": "^4.1.0",
4024
  "jschardet": "^3.0.0",
4025
- "koa": "^2.14.2",
4026
  "koa-bodyparser": "^4.4.0",
4027
  "koa-compose": "^4.1.0",
4028
  "libmagic-ffi": "^0.1.4",
@@ -11931,6 +11932,12 @@
11931
  "url": "https://github.com/sponsors/ljharb"
11932
  }
11933
  },
 
 
 
 
 
 
11934
  "node_modules/tar": {
11935
  "version": "6.2.1",
11936
  "resolved": "https://registry.npmjs.org/tar/-/tar-6.2.1.tgz",
 
17
  "axios": "^1.3.3",
18
  "bcrypt": "^5.1.0",
19
  "busboy": "^1.6.0",
20
+ "civkit": "^0.9.0-848ef4e",
21
  "core-js": "^3.37.1",
22
  "cors": "^2.8.5",
23
  "dayjs": "^1.11.9",
 
26
  "firebase-functions": "^6.1.1",
27
  "htmlparser2": "^9.0.0",
28
  "jose": "^5.1.0",
29
+ "koa": "^2.16.0",
30
  "langdetect": "^0.2.1",
31
  "linkedom": "^0.18.4",
32
  "lru-cache": "^11.0.2",
 
42
  "set-cookie-parser": "^2.6.0",
43
  "simple-zstd": "^1.4.2",
44
  "stripe": "^11.11.0",
45
+ "svg2png-wasm": "^1.4.1",
46
  "tiktoken": "^1.0.16",
47
  "tld-extract": "^2.1.0",
48
  "turndown": "^7.1.3",
 
64
  "eslint-config-google": "^0.14.0",
65
  "eslint-plugin-import": "^2.25.4",
66
  "firebase-functions-test": "^3.0.0",
 
67
  "pino-pretty": "^13.0.0",
68
  "replicate": "^0.16.1",
69
  "typescript": "^5.5.4"
 
4003
  }
4004
  },
4005
  "node_modules/civkit": {
4006
+ "version": "0.9.0-848ef4e",
4007
+ "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.9.0-848ef4e.tgz",
4008
+ "integrity": "sha512-yxk5AKaiZSN4ntlwybVHYgUer402CSw06KzN7wvfaYra9evZkZ7MiFHGULqMnY7657k3CH0WV4n6jGfRj1Vpvw==",
4009
  "license": "AGPL",
4010
  "dependencies": {
4011
  "lodash": "^4.17.21",
 
4023
  "iconv-lite": "^0.6.3",
4024
  "js-yaml": "^4.1.0",
4025
  "jschardet": "^3.0.0",
4026
+ "koa": "^2.15.4",
4027
  "koa-bodyparser": "^4.4.0",
4028
  "koa-compose": "^4.1.0",
4029
  "libmagic-ffi": "^0.1.4",
 
11932
  "url": "https://github.com/sponsors/ljharb"
11933
  }
11934
  },
11935
+ "node_modules/svg2png-wasm": {
11936
+ "version": "1.4.1",
11937
+ "resolved": "https://registry.npmjs.org/svg2png-wasm/-/svg2png-wasm-1.4.1.tgz",
11938
+ "integrity": "sha512-ZFy1NtwZVAsslaTQoI+/QqX2sg0vjmgJ/jGAuLZZvYcRlndI54hLPiwLC9JzXlFBerfxN5JiS7kpEUG0mrXS3Q==",
11939
+ "license": "MIT"
11940
+ },
11941
  "node_modules/tar": {
11942
  "version": "6.2.1",
11943
  "resolved": "https://registry.npmjs.org/tar/-/tar-6.2.1.tgz",
package.json CHANGED
@@ -26,7 +26,7 @@
26
  "axios": "^1.3.3",
27
  "bcrypt": "^5.1.0",
28
  "busboy": "^1.6.0",
29
- "civkit": "^0.9.0-f7b0ca7",
30
  "core-js": "^3.37.1",
31
  "cors": "^2.8.5",
32
  "dayjs": "^1.11.9",
@@ -35,6 +35,7 @@
35
  "firebase-functions": "^6.1.1",
36
  "htmlparser2": "^9.0.0",
37
  "jose": "^5.1.0",
 
38
  "langdetect": "^0.2.1",
39
  "linkedom": "^0.18.4",
40
  "lru-cache": "^11.0.2",
@@ -50,6 +51,7 @@
50
  "set-cookie-parser": "^2.6.0",
51
  "simple-zstd": "^1.4.2",
52
  "stripe": "^11.11.0",
 
53
  "tiktoken": "^1.0.16",
54
  "tld-extract": "^2.1.0",
55
  "turndown": "^7.1.3",
@@ -71,7 +73,6 @@
71
  "eslint-config-google": "^0.14.0",
72
  "eslint-plugin-import": "^2.25.4",
73
  "firebase-functions-test": "^3.0.0",
74
- "koa": "^2.16.0",
75
  "pino-pretty": "^13.0.0",
76
  "replicate": "^0.16.1",
77
  "typescript": "^5.5.4"
 
26
  "axios": "^1.3.3",
27
  "bcrypt": "^5.1.0",
28
  "busboy": "^1.6.0",
29
+ "civkit": "^0.9.0-848ef4e",
30
  "core-js": "^3.37.1",
31
  "cors": "^2.8.5",
32
  "dayjs": "^1.11.9",
 
35
  "firebase-functions": "^6.1.1",
36
  "htmlparser2": "^9.0.0",
37
  "jose": "^5.1.0",
38
+ "koa": "^2.16.0",
39
  "langdetect": "^0.2.1",
40
  "linkedom": "^0.18.4",
41
  "lru-cache": "^11.0.2",
 
51
  "set-cookie-parser": "^2.6.0",
52
  "simple-zstd": "^1.4.2",
53
  "stripe": "^11.11.0",
54
+ "svg2png-wasm": "^1.4.1",
55
  "tiktoken": "^1.0.16",
56
  "tld-extract": "^2.1.0",
57
  "turndown": "^7.1.3",
 
73
  "eslint-config-google": "^0.14.0",
74
  "eslint-plugin-import": "^2.25.4",
75
  "firebase-functions-test": "^3.0.0",
 
76
  "pino-pretty": "^13.0.0",
77
  "replicate": "^0.16.1",
78
  "typescript": "^5.5.4"
src/services/alt-text.ts CHANGED
@@ -1,7 +1,7 @@
1
  import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
2
  import { singleton } from 'tsyringe';
3
  import { GlobalLogger } from './logger';
4
- import { CanvasService } from '../shared/services/canvas';
5
  import { ImageInterrogationManager } from '../shared/services/common-iminterrogate';
6
  import { ImgBrief } from './puppeteer';
7
  import { ImgAlt } from '../db/img-alt';
@@ -32,13 +32,20 @@ export class AltTextService extends AsyncService {
32
  async caption(url: string) {
33
  try {
34
  const img = await this.canvasService.loadImage(url);
 
 
 
 
35
  const resized = this.canvasService.fitImageToSquareBox(img, 1024);
36
  const exported = await this.canvasService.canvasToBuffer(resized, 'image/png');
37
 
38
- const r = await this.imageInterrogator.interrogate('vertex-gemini-1.5-flash-002', {
 
 
 
39
  image: exported,
40
- prompt: `Yield a concise image caption sentence in third person.`,
41
- system: 'You are BLIP2, an image caption model.',
42
  });
43
 
44
  return r.replaceAll(/[\n\"]|(\.\s*$)/g, '').trim();
@@ -73,7 +80,7 @@ export class AltTextService extends AsyncService {
73
 
74
  if (this.asyncLocalContext.ctx.DNT) {
75
  // Don't cache alt text if DNT is set
76
- return;
77
  }
78
 
79
  // Don't try again until the next day
 
1
  import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
2
  import { singleton } from 'tsyringe';
3
  import { GlobalLogger } from './logger';
4
+ import { CanvasService } from './canvas';
5
  import { ImageInterrogationManager } from '../shared/services/common-iminterrogate';
6
  import { ImgBrief } from './puppeteer';
7
  import { ImgAlt } from '../db/img-alt';
 
32
  async caption(url: string) {
33
  try {
34
  const img = await this.canvasService.loadImage(url);
35
+ const contentTypeHint = Reflect.get(img, 'contentType');
36
+ if (Math.min(img.naturalHeight, img.naturalWidth) < 64) {
37
+ throw new AssertionFailureError({ message: `Image is too small to generate alt text for url ${url}` });
38
+ }
39
  const resized = this.canvasService.fitImageToSquareBox(img, 1024);
40
  const exported = await this.canvasService.canvasToBuffer(resized, 'image/png');
41
 
42
+ const svgHint = contentTypeHint.includes('svg') ? `Beware this image is a SVG rendered on a gray background, the gray background is not part of the image.\n\n` : '';
43
+ const svgSystemHint = contentTypeHint.includes('svg') ? ` Sometimes the system renders SVG on a gray background. When this happens, you must not include the gray background in the description.` : '';
44
+
45
+ const r = await this.imageInterrogator.interrogate('vertex-gemini-2.0-flash', {
46
  image: exported,
47
+ prompt: `${svgHint}Give a concise image caption descriptive sentence in third person. Start directly with the description.`,
48
+ system: `You are BLIP2, an image caption model. You will generate Alt Text (in web pages) for any image for a11y purposes. You must not start with "This image is sth...", instead, start direly with "sth..."${svgSystemHint}`,
49
  });
50
 
51
  return r.replaceAll(/[\n\"]|(\.\s*$)/g, '').trim();
 
80
 
81
  if (this.asyncLocalContext.ctx.DNT) {
82
  // Don't cache alt text if DNT is set
83
+ return generatedCaption;
84
  }
85
 
86
  // Don't try again until the next day
src/services/canvas.ts ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { singleton, container } from 'tsyringe';
2
+ import { AsyncService, mimeOf, ParamValidationError, SubmittedDataMalformedError, /* downloadFile */ } from 'civkit';
3
+ import { readFile } from 'fs/promises';
4
+
5
+ import type canvas from '@napi-rs/canvas';
6
+ export type { Canvas, Image } from '@napi-rs/canvas';
7
+
8
+ import { GlobalLogger } from './logger';
9
+ import { TempFileManager } from './temp-file';
10
+
11
+ import { isMainThread } from 'worker_threads';
12
+ import type { svg2png } from 'svg2png-wasm' with { 'resolution-mode': 'import' };
13
+ import path from 'path';
14
+ import { Threaded } from './threaded';
15
+
/**
 * Downloads `uri` with the global `fetch` and returns the body plus the
 * `content-type` response header.
 *
 * The size limit is enforced twice: once against the declared
 * `content-length` header (cheap early exit) and once against the number of
 * bytes actually received, because the header may be absent (chunked
 * responses) or simply wrong.
 *
 * @param uri - Absolute URL to fetch.
 * @param maxBytes - Maximum accepted body size in bytes. Defaults to 100 MiB.
 * @returns `buff` holding the body bytes and `contentType` holding the
 *          `content-type` header value (or `null` when the header is missing).
 * @throws Error when the response is not OK, has no body, or exceeds `maxBytes`.
 */
const downloadFile = async (uri: string, maxBytes: number = 1024 * 1024 * 100) => {
    const resp = await fetch(uri);
    if (!(resp.ok && resp.body)) {
        // statusText is frequently empty (e.g. over HTTP/2), so include the numeric status too.
        throw new Error(`Unexpected response ${resp.status} ${resp.statusText}`.trim());
    }
    const contentLength = Number.parseInt(resp.headers.get('content-length') || '0', 10);
    if (contentLength > maxBytes) {
        throw new Error('File too large');
    }

    // Read incrementally so an oversized body is rejected without buffering all of it.
    const reader = resp.body.getReader();
    const chunks: Uint8Array[] = [];
    let received = 0;
    for (; ;) {
        const { done, value } = await reader.read();
        if (done) {
            break;
        }
        received += value.byteLength;
        if (received > maxBytes) {
            await reader.cancel();
            throw new Error('File too large');
        }
        chunks.push(value);
    }

    const merged = new Uint8Array(received);
    let offset = 0;
    for (const chunk of chunks) {
        merged.set(chunk, offset);
        offset += chunk.byteLength;
    }

    return { buff: merged.buffer, contentType: resp.headers.get('content-type') };
};
29
+
/**
 * Image loading and canvas manipulation helpers built on `@napi-rs/canvas`,
 * with SVG rasterization through `svg2png-wasm`.
 *
 * Images can be loaded from `data:` URLs, `http(s)` URLs or in-memory buffers.
 * SVG input is rendered to PNG first (on a light gray `#D3D3D3` background)
 * and only then handed to the canvas library.
 */
@singleton()
export class CanvasService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });
    svg2png!: typeof svg2png;
    canvas!: typeof canvas;

    constructor(
        protected temp: TempFileManager,
        protected globalLogger: GlobalLogger,
    ) {
        super(...arguments);
    }

    /**
     * Waits for dependencies, then lazily loads the native/wasm modules.
     *
     * The svg2png renderer (wasm binary plus the bundled Source Han Sans SC
     * font) is only initialized when NOT running on the main thread;
     * `@napi-rs/canvas` is loaded on every thread.
     */
    override async init() {
        await this.dependencyReady();
        if (!isMainThread) {
            const { createSvg2png, initialize } = require('svg2png-wasm');
            const wasmBuff = await readFile(path.resolve(path.dirname(require.resolve('svg2png-wasm')), '../svg2png_wasm_bg.wasm'));
            const fontBuff = await readFile(path.resolve(__dirname, '../../licensed/SourceHanSansSC-Regular.otf'));
            await initialize(wasmBuff);
            this.svg2png = createSvg2png({
                fonts: [Uint8Array.from(fontBuff)],
                // Every generic family maps to the single bundled font.
                defaultFontFamily: {
                    serifFamily: 'Source Han Sans SC',
                    sansSerifFamily: 'Source Han Sans SC',
                    cursiveFamily: 'Source Han Sans SC',
                    fantasyFamily: 'Source Han Sans SC',
                    monospaceFamily: 'Source Han Sans SC',
                }
            });
        }
        this.canvas = require('@napi-rs/canvas');

        this.emit('ready');
    }

    /**
     * Rasterizes SVG markup to PNG bytes on a `#D3D3D3` background.
     *
     * NOTE(review): `this.svg2png` is only assigned off the main thread (see
     * `init`); presumably `@Threaded()` routes this call to a worker — confirm.
     */
    @Threaded()
    async renderSvgToPng(svgContent: string,) {
        return this.svg2png(svgContent, { backgroundColor: '#D3D3D3' });
    }

    /**
     * Resolves `input` to raw bytes plus a content type, renders SVG to PNG
     * when the content type mentions `svg`, and loads the result as an image.
     * The detected content type is attached to the returned image under the
     * `contentType` property (it may be `null`/empty when unknown).
     *
     * @throws ParamValidationError when `input` is neither a `data:` URL, an
     *         `http…` URL nor a Buffer, or is a `data:` URL without a comma.
     */
    protected async _loadImage(input: string | Buffer) {
        let raw: Buffer | undefined;
        let contentType: string | null | undefined;

        if (typeof input === 'string') {
            if (input.startsWith('data:')) {
                const firstComma = input.indexOf(',');
                if (firstComma < 0) {
                    // No payload separator: not a usable data URL.
                    throw new ParamValidationError('Invalid input');
                }
                const [mediaPart, ...params] = input.slice(0, firstComma).split(';');
                const data = input.slice(firstComma + 1);
                contentType = mediaPart.split(':')[1];
                // `base64` may follow other parameters (e.g. `;charset=utf-8;base64`),
                // so inspect all of them rather than only the first.
                const isBase64 = params.some((p) => p.trim().toLowerCase().startsWith('base64'));
                raw = isBase64 ?
                    Buffer.from(data, 'base64') :
                    Buffer.from(decodeURIComponent(data), 'utf-8');
            } else if (input.startsWith('http')) {
                const r = await downloadFile(input);
                raw = Buffer.from(r.buff);
                contentType = r.contentType;
            }
        } else if (Buffer.isBuffer(input)) {
            raw = input;
            const mime = await mimeOf(raw);
            contentType = `${mime.mediaType}/${mime.subType}`;
        }

        if (!raw) {
            throw new ParamValidationError('Invalid input');
        }

        const decodable = contentType?.includes('svg') ?
            await this.renderSvgToPng(raw.toString('utf-8')) :
            raw;

        const img = await this.canvas.loadImage(decodable);
        Reflect.set(img, 'contentType', contentType);

        return img;
    }

    /**
     * Public wrapper around `_loadImage` that logs the load time and converts
     * "unsupported image" failures into `SubmittedDataMalformedError`.
     * Any other error is rethrown unchanged.
     */
    async loadImage(uri: string | Buffer) {
        const t0 = Date.now();
        try {
            const theImage = await this._loadImage(uri);
            const t1 = Date.now();
            this.logger.debug(`Image loaded in ${t1 - t0}ms`);

            return theImage;
        } catch (err: any) {
            if (err?.message?.includes('Unsupported image type') || err?.message?.includes('unsupported')) {
                // Never interpolate raw buffer bytes into messages; describe the buffer instead.
                const label = typeof uri === 'string' ? uri.slice(0, 128) : `<Buffer ${uri.length} bytes>`;
                this.logger.warn(`Failed to load image ${label}`, { err });
                throw new SubmittedDataMalformedError(`Unknown image format for ${label}`);
            }
            throw err;
        }
    }

    /**
     * Returns a canvas holding `image` scaled down (aspect ratio preserved) so
     * that it fits a `size` x `size` box. Images already within the box are not
     * scaled: a Canvas is returned as-is, an Image is copied onto a new canvas.
     */
    fitImageToSquareBox(image: canvas.Image | canvas.Canvas, size: number = 1024) {
        if (image.width <= size && image.height <= size) {
            if (image instanceof this.canvas.Canvas) {
                return image;
            }
            const canvasInstance = this.canvas.createCanvas(image.width, image.height);
            const ctx = canvasInstance.getContext('2d');
            ctx.drawImage(image, 0, 0, image.width, image.height, 0, 0, canvasInstance.width, canvasInstance.height);

            return canvasInstance;
        }

        const aspectRatio = image.width / image.height;
        const landscape = aspectRatio > 1;

        // Clamp to 1px so extreme aspect ratios never round down to a zero-sized canvas.
        const resizedWidth = Math.max(1, Math.round(landscape ? size : size * aspectRatio));
        const resizedHeight = Math.max(1, Math.round(landscape ? size / aspectRatio : size));

        const canvasInstance = this.canvas.createCanvas(resizedWidth, resizedHeight);
        const ctx = canvasInstance.getContext('2d');
        ctx.drawImage(image, 0, 0, image.width, image.height, 0, 0, resizedWidth, resizedHeight);

        return canvasInstance;
    }

    /**
     * Copies the `w` x `h` region at (`x`, `y`) of `image` onto a new canvas
     * of the same size. (The method name is kept as-is for callers;
     * presumably it was meant to read "crop".)
     */
    corpImage(image: canvas.Image | canvas.Canvas, x: number, y: number, w: number, h: number) {
        const canvasInstance = this.canvas.createCanvas(w, h);
        const ctx = canvasInstance.getContext('2d');
        ctx.drawImage(image, x, y, w, h, 0, 0, w, h);

        return canvasInstance;
    }

    /** Exports the canvas as a data URL; `mimeType` defaults to `image/png`. */
    canvasToDataUrl(canvas: canvas.Canvas, mimeType?: 'image/png' | 'image/jpeg') {
        return canvas.toDataURLAsync((mimeType || 'image/png') as 'image/png');
    }

    /** Exports the canvas as an encoded image buffer; `mimeType` defaults to `image/png`. */
    async canvasToBuffer(canvas: canvas.Canvas, mimeType?: 'image/png' | 'image/jpeg') {
        return canvas.toBuffer((mimeType || 'image/png') as 'image/png');
    }

}

// Shared instance resolved from the tsyringe container, exported as the module default.
const instance = container.resolve(CanvasService);
export default instance;
src/services/jsdom.ts CHANGED
@@ -169,10 +169,12 @@ export class JSDomControl extends AsyncService {
169
  Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
170
  .map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src'), x.getAttribute('alt')])
171
  .forEach(([u1, u2, alt]) => {
 
172
  if (u1) {
173
  try {
174
  const u1Txt = new URL(u1, snapshot.rebase || snapshot.href).toString();
175
  imgSet.add(u1Txt);
 
176
  } catch (err) {
177
  // void 0;
178
  }
@@ -181,14 +183,17 @@ export class JSDomControl extends AsyncService {
181
  try {
182
  const u2Txt = new URL(u2, snapshot.rebase || snapshot.href).toString();
183
  imgSet.add(u2Txt);
 
184
  } catch (err) {
185
  // void 0;
186
  }
187
  }
188
- rebuiltImgs.push({
189
- src: u1 || u2,
190
- alt
191
- });
 
 
192
  });
193
 
194
  const r = {
 
169
  Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
170
  .map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src'), x.getAttribute('alt')])
171
  .forEach(([u1, u2, alt]) => {
172
+ let absUrl: string | undefined;
173
  if (u1) {
174
  try {
175
  const u1Txt = new URL(u1, snapshot.rebase || snapshot.href).toString();
176
  imgSet.add(u1Txt);
177
+ absUrl = u1Txt;
178
  } catch (err) {
179
  // void 0;
180
  }
 
183
  try {
184
  const u2Txt = new URL(u2, snapshot.rebase || snapshot.href).toString();
185
  imgSet.add(u2Txt);
186
+ absUrl = u2Txt;
187
  } catch (err) {
188
  // void 0;
189
  }
190
  }
191
+ if (absUrl) {
192
+ rebuiltImgs.push({
193
+ src: absUrl,
194
+ alt
195
+ });
196
+ }
197
  });
198
 
199
  const r = {
src/services/puppeteer.ts CHANGED
@@ -395,7 +395,7 @@ function giveSnapshot(stopActiveSnapshot, overrideDomAnalysis) {
395
  description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
396
  href: document.location.href,
397
  html: document.documentElement?.outerHTML,
398
- htmlSignificantlyModifiedByJs: Boolean(Math.abs(thisElemCount - initialElemCount) / (initialElemCount + Number.EPSILON) > 0.1),
399
  text: document.body?.innerText,
400
  shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
401
  parsed: parsed,
@@ -407,16 +407,18 @@ function giveSnapshot(stopActiveSnapshot, overrideDomAnalysis) {
407
  if (document.baseURI !== r.href) {
408
  r.rebase = document.baseURI;
409
  }
410
- if (parsed && parsed.content) {
411
- const elem = document.createElement('div');
412
- elem.innerHTML = parsed.content;
413
- r.imgs = briefImgs(elem);
414
- } else {
415
- const allImgs = briefImgs();
416
- if (allImgs.length === 1) {
417
- r.imgs = allImgs;
418
  }
419
- }
 
 
 
 
 
420
 
421
  return r;
422
  }
@@ -756,7 +758,7 @@ export class PuppeteerControl extends AsyncService {
756
  dElem = delta /(previousElemCount + Number.EPSILON);
757
  }
758
 
759
- if (dt < 1500 && dElem < 0.1) {
760
  return;
761
  }
762
 
 
395
  description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
396
  href: document.location.href,
397
  html: document.documentElement?.outerHTML,
398
+ htmlSignificantlyModifiedByJs: Boolean(Math.abs(thisElemCount - initialElemCount) / (initialElemCount + Number.EPSILON) > 0.05),
399
  text: document.body?.innerText,
400
  shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
401
  parsed: parsed,
 
407
  if (document.baseURI !== r.href) {
408
  r.rebase = document.baseURI;
409
  }
410
+ r.imgs = briefImgs().filter((x)=> {
411
+ if (x.complete) {
412
+ if (Math.min(x.width, x.height, x.naturalWidth, x.naturalHeight) < 64) {
413
+ return false;
414
+ }
 
 
 
415
  }
416
+ const m = Math.min(x.width, x.height);
417
+ if (m && m < 64) {
418
+ return false;
419
+ }
420
+ return true;
421
+ });
422
 
423
  return r;
424
  }
 
758
  dElem = delta /(previousElemCount + Number.EPSILON);
759
  }
760
 
761
+ if (dt < 1200 && dElem < 0.05) {
762
  return;
763
  }
764
 
src/services/snapshot-formatter.ts CHANGED
@@ -213,6 +213,7 @@ export class SnapshotFormatter extends AsyncService {
213
  const imageSummary = {} as { [k: string]: string; };
214
  const imageIdxTrack = new Map<string, number[]>();
215
  const uid = this.threadLocal.get('uid');
 
216
  do {
217
  if (pdfMode) {
218
  contentText = (snapshot.parsed?.content || snapshot.text || '').trim();
@@ -229,10 +230,10 @@ export class SnapshotFormatter extends AsyncService {
229
  break;
230
  }
231
 
232
- const urlToAltMap: { [k: string]: string | undefined; } = {};
233
  const noGFMOpts = this.threadLocal.get('noGfm');
234
  const imageRetention = this.threadLocal.get('retainImages') as CrawlerOptions['retainImages'];
235
  let imgIdx = 0;
 
236
  const customRules: { [k: string]: Rule; } = {
237
  'img-retention': {
238
  filter: 'img',
@@ -267,41 +268,37 @@ export class SnapshotFormatter extends AsyncService {
267
  if (!src) {
268
  return '';
269
  }
270
- const mapped = urlToAltMap[originalSrc];
 
 
271
  const imgSerial = ++imgIdx;
272
- const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
273
  idxArr.push(imgSerial);
274
- imageIdxTrack.set(src, idxArr);
275
 
276
  if (mapped) {
277
- imageSummary[src] = mapped || alt;
278
 
279
  if (imageRetention === 'alt_p') {
280
- return `(Image ${imgIdx}: ${mapped || alt})`;
281
  }
282
 
283
- if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
284
- const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
285
- mappedUrl.protocol = 'blob:';
286
-
287
- return `![Image ${imgIdx}: ${mapped || alt}](${mappedUrl})`;
288
  }
289
 
290
- return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
291
  } else if (imageRetention === 'alt_p') {
292
- return alt ? `(Image ${imgIdx}: ${alt})` : '';
293
  }
294
 
295
- imageSummary[src] = alt || '';
296
-
297
- if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
298
- const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
299
- mappedUrl.protocol = 'blob:';
300
 
301
- return alt ? `![Image ${imgIdx}: ${alt}](${mappedUrl})` : `![Image ${imgIdx}](${mappedUrl})`;
 
302
  }
303
 
304
- return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`;
305
  }
306
  } as Rule
307
  };
@@ -343,7 +340,9 @@ export class SnapshotFormatter extends AsyncService {
343
  return undefined;
344
  });
345
  if (r && x.src) {
346
- urlToAltMap[x.src.trim()] = r;
 
 
347
  }
348
  });
349
 
@@ -416,13 +415,10 @@ export class SnapshotFormatter extends AsyncService {
416
  .toPairs()
417
  .map(
418
  ([url, alt], i) => {
419
- if (imgDataUrlToObjectUrl && url.startsWith('data:')) {
420
- const refUrl = new URL(formatted.url!);
421
- const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(url)}`);
422
 
423
- url = mappedUrl.toString();
424
- }
425
- return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
426
  }
427
  ).fromPairs()
428
  .value();
@@ -522,6 +518,13 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
522
  return f as FormattedPage;
523
  }
524
 
 
 
 
 
 
 
 
525
  async getGeneralSnapshotMixins(snapshot: PageSnapshot) {
526
  let inferred;
527
  const mixin: any = {};
@@ -534,10 +537,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
534
 
535
  for (const img of inferred.imgs) {
536
  const imgSerial = ++imgIdx;
537
- const idxArr = imageIdxTrack.has(img.src) ? imageIdxTrack.get(img.src)! : [];
 
538
  idxArr.push(imgSerial);
539
- imageIdxTrack.set(img.src, idxArr);
540
- imageSummary[img.src] = img.alt || '';
541
  }
542
 
543
  mixin.images =
@@ -545,7 +549,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
545
  .toPairs()
546
  .map(
547
  ([url, alt], i) => {
548
- return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
 
 
 
549
  }
550
  ).fromPairs()
551
  .value();
@@ -611,14 +618,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
611
  const src = (node.getAttribute('src') || '').trim();
612
  const alt = cleanAttribute(node.getAttribute('alt')) || '';
613
 
614
- if (options.url) {
615
- const refUrl = new URL(options.url);
616
- const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(src)}`);
617
-
618
- return `![${alt}](${mappedUrl})`;
619
- }
620
 
621
- return `![${alt}](blob:${md5Hasher.hash(src)})`;
622
  }
623
  });
624
  }
@@ -817,6 +819,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
817
  if (contentType.startsWith('image/')) {
818
  snapshot.html = `<html style="height: 100%;"><head><meta name="viewport" content="width=device-width, minimum-scale=0.1"><title>${fileName}</title></head><body style="margin: 0px; height: 100%; background-color: rgb(14, 14, 14);"><img style="display: block;-webkit-user-select: none;margin: auto;background-color: hsl(0, 0%, 90%);transition: background-color 300ms;" src="${url.href}"></body></html>`;
819
  snapshot.title = fileName;
 
820
 
821
  return snapshot;
822
  }
 
213
  const imageSummary = {} as { [k: string]: string; };
214
  const imageIdxTrack = new Map<string, number[]>();
215
  const uid = this.threadLocal.get('uid');
216
+
217
  do {
218
  if (pdfMode) {
219
  contentText = (snapshot.parsed?.content || snapshot.text || '').trim();
 
230
  break;
231
  }
232
 
 
233
  const noGFMOpts = this.threadLocal.get('noGfm');
234
  const imageRetention = this.threadLocal.get('retainImages') as CrawlerOptions['retainImages'];
235
  let imgIdx = 0;
236
+ const urlToAltMap: { [k: string]: string | undefined; } = {};
237
  const customRules: { [k: string]: Rule; } = {
238
  'img-retention': {
239
  filter: 'img',
 
268
  if (!src) {
269
  return '';
270
  }
271
+
272
+ const keySrc = (originalSrc.startsWith('data:') ? this.dataUrlToBlobUrl(originalSrc, snapshot.rebase) : src).trim();
273
+ const mapped = urlToAltMap[keySrc];
274
  const imgSerial = ++imgIdx;
275
+ const idxArr = imageIdxTrack.has(keySrc) ? imageIdxTrack.get(keySrc)! : [];
276
  idxArr.push(imgSerial);
277
+ imageIdxTrack.set(keySrc, idxArr);
278
 
279
  if (mapped) {
280
+ imageSummary[keySrc] = mapped || alt;
281
 
282
  if (imageRetention === 'alt_p') {
283
+ return `(Image ${imgSerial}: ${mapped || alt})`;
284
  }
285
 
286
+ if (imgDataUrlToObjectUrl) {
287
+ return `![Image ${imgSerial}: ${mapped || alt}](${keySrc})`;
 
 
 
288
  }
289
 
290
+ return `![Image ${imgSerial}: ${mapped || alt}](${src})`;
291
  } else if (imageRetention === 'alt_p') {
292
+ return alt ? `(Image ${imgSerial}: ${alt})` : '';
293
  }
294
 
295
+ imageSummary[keySrc] = alt || '';
 
 
 
 
296
 
297
+ if (imgDataUrlToObjectUrl) {
298
+ return alt ? `![Image ${imgSerial}: ${alt}](${keySrc})` : `![Image ${imgSerial}](${keySrc})`;
299
  }
300
 
301
+ return alt ? `![Image ${imgSerial}: ${alt}](${src})` : `![Image ${imgSerial}](${src})`;
302
  }
303
  } as Rule
304
  };
 
340
  return undefined;
341
  });
342
  if (r && x.src) {
343
+ // note x.src here is already rebased to absolute url by browser/upstream.
344
+ const keySrc = (x.src.startsWith('data:') ? this.dataUrlToBlobUrl(x.src, snapshot.rebase) : x.src).trim();
345
+ urlToAltMap[keySrc] = r;
346
  }
347
  });
348
 
 
415
  .toPairs()
416
  .map(
417
  ([url, alt], i) => {
418
+ const idxTrack = imageIdxTrack.get(url);
419
+ const tag = idxTrack?.length ? `Image ${_.uniq(idxTrack).join(',')}` : `Hidden Image ${i + 1}`;
 
420
 
421
+ return [`${tag}${alt ? `: ${alt}` : ''}`, url];
 
 
422
  }
423
  ).fromPairs()
424
  .value();
 
518
  return f as FormattedPage;
519
  }
520
 
/**
 * Builds a short `blob:<origin>/<hash>` URL that stands in for a `data:` URL.
 *
 * The origin is taken from `baseUrl`; the path is `md5Hasher.hash(dataUrl)`,
 * so the same data URL with the same base always maps to the same blob URL.
 *
 * @param dataUrl - The (potentially very long) data URL to replace.
 * @param baseUrl - URL whose origin is used. An omitted, empty or unparsable
 *                  value falls back to `http://localhost` instead of throwing.
 * @returns The serialized blob URL.
 */
dataUrlToBlobUrl(dataUrl: string, baseUrl: string = 'http://localhost/') {
    let origin: string;
    try {
        // `||` also covers the empty string, which the default parameter does not.
        origin = new URL(baseUrl || 'http://localhost/').origin;
    } catch {
        // Unparsable base (e.g. a relative URL): use the same fallback origin.
        origin = 'http://localhost';
    }

    return new URL(`blob:${origin}/${md5Hasher.hash(dataUrl)}`).href;
}
527
+
528
  async getGeneralSnapshotMixins(snapshot: PageSnapshot) {
529
  let inferred;
530
  const mixin: any = {};
 
537
 
538
  for (const img of inferred.imgs) {
539
  const imgSerial = ++imgIdx;
540
+ const keySrc = (img.src.startsWith('data:') ? this.dataUrlToBlobUrl(img.src, snapshot.rebase) : img.src).trim();
541
+ const idxArr = imageIdxTrack.has(keySrc) ? imageIdxTrack.get(keySrc)! : [];
542
  idxArr.push(imgSerial);
543
+ imageIdxTrack.set(keySrc, idxArr);
544
+ imageSummary[keySrc] = img.alt || '';
545
  }
546
 
547
  mixin.images =
 
549
  .toPairs()
550
  .map(
551
  ([url, alt], i) => {
552
+ const idxTrack = imageIdxTrack.get(url);
553
+ const tag = idxTrack?.length ? `Image ${_.uniq(idxTrack).join(',')}` : `Hidden Image ${i + 1}`;
554
+
555
+ return [`${tag}${alt ? `: ${alt}` : ''}`, url];
556
  }
557
  ).fromPairs()
558
  .value();
 
618
  const src = (node.getAttribute('src') || '').trim();
619
  const alt = cleanAttribute(node.getAttribute('alt')) || '';
620
 
621
+ const blobUrl = this.dataUrlToBlobUrl(src, options.url?.toString());
 
 
 
 
 
622
 
623
+ return `![${alt}](${blobUrl})`;
624
  }
625
  });
626
  }
 
819
  if (contentType.startsWith('image/')) {
820
  snapshot.html = `<html style="height: 100%;"><head><meta name="viewport" content="width=device-width, minimum-scale=0.1"><title>${fileName}</title></head><body style="margin: 0px; height: 100%; background-color: rgb(14, 14, 14);"><img style="display: block;-webkit-user-select: none;margin: auto;background-color: hsl(0, 0%, 90%);transition: background-color 300ms;" src="${url.href}"></body></html>`;
821
  snapshot.title = fileName;
822
+ snapshot.imgs = [{ src: url.href }];
823
 
824
  return snapshot;
825
  }
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 4b1061e6e9623bb98b82ac6f86004988c7211385
 
1
+ Subproject commit a2ebcb882fa92644cc3dfd6b8d8e66f06dd940e9