hanxiao nomagick committed on
Commit
b3fb4c5
·
unverified ·
1 Parent(s): 1837362

feat: add image captioning (#6)

Browse files

* Fix contentText assignment in CrawlerHost class

* fix: recover vscode configurations

* feat: add image captioning

* feat: add image captioning

* clean: vscode config

* chore: fix some ts warnings

* feat: auto alt text

* fix

* chore: improve prompt

* clean: unused config

* fix: failure condition

* fix: remove redundant code

* fix: catch parse error

* fix: catch parse error

---------

Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>

.gitignore CHANGED
@@ -1,4 +1,2 @@
1
  node_modules/
2
- .DS_Store
3
- .vscode
4
- .cache
 
1
  node_modules/
2
+ .DS_Store
 
 
.vscode/exensions.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "recommendations": [
3
+ "editorconfig.editorconfig",
4
+ "octref.vetur",
5
+ "redhat.vscode-yaml",
6
+ "dbaeumer.vscode-eslint",
7
+ "esbenp.prettier-vscode",
8
+ "streetsidesoftware.code-spell-checker"
9
+ ]
10
+ }
.vscode/launch.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "0.2.0",
3
+ "configurations": [
4
+ {
5
+ "name": "Debug Fullstack: attach",
6
+ "request": "attach",
7
+ "cwd": "${workspaceFolder}/backend/functions",
8
+ "skipFiles": [
9
+ "<node_internals>/**"
10
+ ],
11
+ "type": "node",
12
+ "preLaunchTask": "Fullstack:debug"
13
+ },
14
+ {
15
+ "name": "Debug Fullstack: attach: with proxy",
16
+ "request": "attach",
17
+ "cwd": "${workspaceFolder}/backend/functions",
18
+ "skipFiles": [
19
+ "<node_internals>/**"
20
+ ],
21
+ "type": "node",
22
+ "preLaunchTask": "Fullstack:debug:with-proxy"
23
+ },
24
+ {
25
+ "name": "Attach",
26
+ "port": 9229,
27
+ "request": "attach",
28
+ "skipFiles": [
29
+ "<node_internals>/**"
30
+ ],
31
+ "type": "node"
32
+ },
33
+ {
34
+ "name": "Attach by Process ID",
35
+ "processId": "${command:PickProcess}",
36
+ "request": "attach",
37
+ "skipFiles": [
38
+ "<node_internals>/**"
39
+ ],
40
+ "type": "node"
41
+ },
42
+ {
43
+ "name": "Debug Fullstack",
44
+ "request": "launch",
45
+ "runtimeArgs": [
46
+ "emulators:start",
47
+ "--import=../.firebase-emu",
48
+ "--export-on-exit=../.firebase-emu",
49
+ ],
50
+ "cwd": "${workspaceFolder}/backend/functions",
51
+ "runtimeExecutable": "${workspaceFolder}/node_modules/.bin/firebase",
52
+ "skipFiles": [
53
+ "<node_internals>/**"
54
+ ],
55
+ "type": "node",
56
+ "preLaunchTask": "Fullstack:prepare",
57
+ "killBehavior": "polite"
58
+ },
59
+ ]
60
+ }
.vscode/settings.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "editor.wordWrap": "on",
3
+ "editor.wordWrapColumn": 120,
4
+ "files.trimTrailingWhitespace": true,
5
+ "files.trimFinalNewlines": true,
6
+ "[javascript]": {
7
+ "editor.defaultFormatter": "vscode.typescript-language-features"
8
+ },
9
+ "[jsonc]": {
10
+ "editor.defaultFormatter": "vscode.json-language-features"
11
+ },
12
+ "[typescript]": {
13
+ "editor.defaultFormatter": "vscode.typescript-language-features"
14
+ },
15
+ "[json]": {
16
+ "editor.defaultFormatter": "vscode.json-language-features"
17
+ },
18
+ "[yaml]": {
19
+ "editor.defaultFormatter": "redhat.vscode-yaml"
20
+ },
21
+ "[markdown]": {
22
+ "files.trimTrailingWhitespace": false
23
+ },
24
+ "typescript.tsdk": "node_modules/typescript/lib",
25
+ "typescript.preferences.quoteStyle": "single",
26
+ "typescript.format.semicolons": "insert",
27
+ "typescript.preferences.importModuleSpecifier": "project-relative",
28
+ "typescript.locale": "en",
29
+ "cSpell.enabled": true,
30
+ "cSpell.words": [
31
+ ],
32
+ }
.vscode/tasks.json ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "2.0.0",
3
+ "tasks": [
4
+ {
5
+ "type": "npm",
6
+ "script": "build",
7
+ "group": "build",
8
+ "options": {
9
+ "cwd": "${workspaceFolder}/backend/functions"
10
+ },
11
+ "problemMatcher": [],
12
+ "label": "Backend:rebuild",
13
+ "detail": "Backend:rebuild"
14
+ },
15
+ {
16
+ "type": "npm",
17
+ "script": "emu:reset",
18
+ "group": "build",
19
+ "options": {
20
+ "cwd": "${workspaceFolder}/backend/functions"
21
+ },
22
+ "problemMatcher": [],
23
+ "label": "Backend:reset-emulator",
24
+ "detail": "Backend:reset-emulator"
25
+ },
26
+ {
27
+ "type": "typescript",
28
+ "options": {
29
+ "cwd": "${workspaceFolder}/backend/functions"
30
+ },
31
+ "tsconfig": "backend/functions/tsconfig.json",
32
+ "option": "watch",
33
+ "isBackground": true,
34
+ "problemMatcher": [
35
+ "$tsc-watch"
36
+ ],
37
+ "group": "build",
38
+ "label": "Backend:build:watch"
39
+ },
40
+ {
41
+ "type": "npm",
42
+ "script": "emu:debug",
43
+ "group": "none",
44
+ "options": {
45
+ "cwd": "${workspaceFolder}/backend/functions"
46
+ },
47
+ "problemMatcher": [
48
+ {
49
+ "base": "$tsc",
50
+ "background": {
51
+ "activeOnStart": false,
52
+ "beginsPattern": "shutdown requested|Starting emulators",
53
+ "endsPattern": "Debugger listening"
54
+ }
55
+ }
56
+ ],
57
+ "label": "Backend:start-emulator-debug",
58
+ "detail": "Backend:start-emulator-debug",
59
+ "dependsOn": [
60
+ "Backend:build:watch"
61
+ ],
62
+ "isBackground": true,
63
+ },
64
+ {
65
+ "type": "npm",
66
+ "script": "dev",
67
+ "options": {
68
+ "cwd": "${workspaceFolder}/webapp",
69
+ },
70
+ "group": "build",
71
+ "label": "Frontend:start:dev",
72
+ "detail": "Frontend:start:dev",
73
+ "isBackground": true,
74
+ "problemMatcher": {
75
+ "base": "$vite",
76
+ "background": {
77
+ "activeOnStart": true,
78
+ "endsPattern": "OK",
79
+ "beginsPattern": "vite"
80
+ }
81
+ },
82
+ },
83
+ {
84
+ "type": "npm",
85
+ "script": "dev",
86
+ "options": {
87
+ "cwd": "${workspaceFolder}/webapp",
88
+ "env": {
89
+ "FIREBASE_EMULATE": "true",
90
+ }
91
+ },
92
+ "group": "build",
93
+ "label": "Frontend:start:emu",
94
+ "detail": "Frontend:start:emu",
95
+ "isBackground": true,
96
+ "problemMatcher": {
97
+ "base": "$vite",
98
+ "background": {
99
+ "activeOnStart": true,
100
+ "endsPattern": "OK",
101
+ "beginsPattern": "vite"
102
+ }
103
+ },
104
+ },
105
+ {
106
+ "type": "npm",
107
+ "script": "emu:debug2",
108
+ "group": "none",
109
+ "options": {
110
+ "cwd": "${workspaceFolder}/backend/functions",
111
+ "env": {
112
+ "https_proxy": "http://127.0.0.1:7890",
113
+ "http_proxy": "http://127.0.0.1:7890",
114
+ "all_proxy": "socks5://127.0.0.1:7890"
115
+ }
116
+ },
117
+ "problemMatcher": [
118
+ {
119
+ "base": "$tsc",
120
+ "background": {
121
+ "activeOnStart": false,
122
+ "beginsPattern": "shutdown requested|Starting emulators",
123
+ "endsPattern": "Debugger listening"
124
+ }
125
+ }
126
+ ],
127
+ "label": "Backend:start-emulator-debug:with-proxy",
128
+ "detail": "Backend:start-emulator-debug:with-proxy",
129
+ "dependsOn": [
130
+ "Backend:build:watch"
131
+ ],
132
+ "isBackground": true,
133
+ },
134
+ {
135
+ "label": "Fullstack:prepare",
136
+ "dependsOn": [
137
+ "Frontend:start:emu",
138
+ "Backend:build:watch",
139
+ ],
140
+ },
141
+ {
142
+ "label": "Fullstack:debug",
143
+ "dependsOn": [
144
+ // "Frontend:start:emu",
145
+ "Backend:start-emulator-debug",
146
+ ],
147
+ },
148
+ {
149
+ "label": "Fullstack:debug:with-proxy",
150
+ "dependsOn": [
151
+ "Frontend:start:emu",
152
+ "Backend:start-emulator-debug:with-proxy",
153
+ ],
154
+ }
155
+ ]
156
+ }
backend/functions/.puppeteerrc.cjs CHANGED
@@ -1,14 +1,9 @@
1
  const { join } = require('path');
2
 
3
- let config = {};
4
- if (!process.env.FUNCTIONS_EMULATOR) {
5
- config = {
6
- // Changes the cache location for Puppeteer.
7
- cacheDirectory: join(__dirname, 'node_modules', 'puppeteer', 'walk-around-lame-gcp-build'),
8
- };
9
- }
10
-
11
  /**
12
  * @type {import("puppeteer").Configuration}
13
  */
14
- module.exports = config;
 
 
 
 
1
  const { join } = require('path');
2
 
 
 
 
 
 
 
 
 
3
  /**
4
  * @type {import("puppeteer").Configuration}
5
  */
6
+ module.exports = {
7
+ // Changes the cache location for Puppeteer.
8
+ cacheDirectory: join(__dirname, 'node_modules', 'puppeteer', 'walk-around-lame-gcp-build'),
9
+ };
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -3,9 +3,10 @@ import { singleton } from 'tsyringe';
3
  import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared';
4
  import _ from 'lodash';
5
  import { PageSnapshot, PuppeteerControl } from '../services/puppeteer';
6
- import TurnDownService from 'turndown';
7
  import { Request, Response } from 'express';
8
  import normalizeUrl from "@esm2cjs/normalize-url";
 
 
9
 
10
  function tidyMarkdown(markdown: string): string {
11
 
@@ -50,11 +51,14 @@ function tidyMarkdown(markdown: string): string {
50
  export class CrawlerHost extends RPCHost {
51
  logger = this.globalLogger.child({ service: this.constructor.name });
52
 
53
- turnDownService = new TurnDownService().use(require('turndown-plugin-gfm').gfm);
 
 
54
 
55
  constructor(
56
  protected globalLogger: Logger,
57
  protected puppeteerControl: PuppeteerControl,
 
58
  ) {
59
  super(...arguments);
60
  }
@@ -65,14 +69,57 @@ export class CrawlerHost extends RPCHost {
65
  this.emit('ready');
66
  }
67
 
68
- formatSnapshot(snapshot: PageSnapshot) {
69
-
70
  const toBeTurnedToMd = snapshot.parsed?.content;
71
- const turnedDown = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd).trim() : '';
 
 
 
72
 
73
- const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text?.trim();
 
 
 
 
 
 
 
 
 
 
 
74
 
75
- const cleanText = tidyMarkdown(contentText).trim();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  const formatted = {
78
  title: (snapshot.parsed?.title || snapshot.title || '').trim(),
@@ -142,7 +189,7 @@ ${this.content}
142
  continue;
143
  }
144
 
145
- const formatted = this.formatSnapshot(scrapped);
146
 
147
  if (scrapped.screenshot && screenshotEnabled) {
148
  sseStream.write({
@@ -177,7 +224,7 @@ ${this.content}
177
  continue;
178
  }
179
 
180
- const formatted = this.formatSnapshot(scrapped);
181
 
182
  return formatted;
183
  }
@@ -186,7 +233,7 @@ ${this.content}
186
  throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
187
  }
188
 
189
- return this.formatSnapshot(lastScrapped);
190
  }
191
 
192
  for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
@@ -195,7 +242,7 @@ ${this.content}
195
  continue;
196
  }
197
 
198
- const formatted = this.formatSnapshot(scrapped);
199
 
200
  return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
201
  }
@@ -204,8 +251,12 @@ ${this.content}
204
  throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
205
  }
206
 
207
- return `${this.formatSnapshot(lastScrapped)}`;
208
  }
209
 
210
 
211
  }
 
 
 
 
 
3
  import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared';
4
  import _ from 'lodash';
5
  import { PageSnapshot, PuppeteerControl } from '../services/puppeteer';
 
6
  import { Request, Response } from 'express';
7
  import normalizeUrl from "@esm2cjs/normalize-url";
8
+ import { AltTextService } from '../services/alt-text';
9
+ import TurndownService from 'turndown';
10
 
11
  function tidyMarkdown(markdown: string): string {
12
 
 
51
  export class CrawlerHost extends RPCHost {
52
  logger = this.globalLogger.child({ service: this.constructor.name });
53
 
54
+ turnDownPlugins = [require('turndown-plugin-gfm').gfm];
55
+
56
+ imageShortUrlPrefix?: string;
57
 
58
  constructor(
59
  protected globalLogger: Logger,
60
  protected puppeteerControl: PuppeteerControl,
61
+ protected altTextService: AltTextService,
62
  ) {
63
  super(...arguments);
64
  }
 
69
  this.emit('ready');
70
  }
71
 
72
+ async formatSnapshot(snapshot: PageSnapshot) {
 
73
  const toBeTurnedToMd = snapshot.parsed?.content;
74
+ let turnDownService = new TurndownService();
75
+ for (const plugin of this.turnDownPlugins) {
76
+ turnDownService = turnDownService.use(plugin);
77
+ }
78
 
79
+ let contentText = '';
80
+ if (toBeTurnedToMd) {
81
+ const urlToAltMap: { [k: string]: { shortDigest: string, alt?: string; }; } = {};
82
+ const tasks = (snapshot.imgs || []).map(async (x) => {
83
+ const r = await this.altTextService.getAltTextAndShortDigest(x).catch((err)=> {
84
+ this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
85
+ return undefined;
86
+ });
87
+ if (r) {
88
+ urlToAltMap[x.src.trim()] = r;
89
+ }
90
+ });
91
 
92
+ await Promise.all(tasks);
93
+ let imgIdx = 0;
94
+
95
+ turnDownService.addRule('img-generated-alt', {
96
+ filter: 'img',
97
+ replacement: (_content, node) => {
98
+ const src = (node.getAttribute('src') || '').trim();
99
+ const alt = cleanAttribute(node.getAttribute('alt'));
100
+ if (!src) {
101
+ return '';
102
+ }
103
+ const mapped = urlToAltMap[src];
104
+ imgIdx++;
105
+ if (mapped) {
106
+ return `![Image ${imgIdx}: ${mapped.alt || alt}](${this.imageShortUrlPrefix ? `${this.imageShortUrlPrefix}/${mapped.shortDigest}` : src})`;
107
+ }
108
+ return `![Image ${imgIdx}: ${alt}](${src})`;
109
+ }
110
+ });
111
+
112
+ contentText = turnDownService.turndown(toBeTurnedToMd).trim();
113
+ }
114
+
115
+ if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) {
116
+ contentText = turnDownService.turndown(snapshot.html);
117
+ }
118
+ if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) {
119
+ contentText = snapshot.text;
120
+ }
121
+
122
+ const cleanText = tidyMarkdown(contentText || '').trim();
123
 
124
  const formatted = {
125
  title: (snapshot.parsed?.title || snapshot.title || '').trim(),
 
189
  continue;
190
  }
191
 
192
+ const formatted = await this.formatSnapshot(scrapped);
193
 
194
  if (scrapped.screenshot && screenshotEnabled) {
195
  sseStream.write({
 
224
  continue;
225
  }
226
 
227
+ const formatted = await this.formatSnapshot(scrapped);
228
 
229
  return formatted;
230
  }
 
233
  throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
234
  }
235
 
236
+ return await this.formatSnapshot(lastScrapped);
237
  }
238
 
239
  for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
 
242
  continue;
243
  }
244
 
245
+ const formatted = await this.formatSnapshot(scrapped);
246
 
247
  return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
248
  }
 
251
  throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
252
  }
253
 
254
+ return `${await this.formatSnapshot(lastScrapped)}`;
255
  }
256
 
257
 
258
  }
259
+
260
/**
 * Normalises an HTML attribute value before it is embedded in Markdown.
 *
 * Each run of line breaks (together with any whitespace that follows a break)
 * is collapsed into a single `\n`.
 *
 * @param attribute - Raw attribute value. `Element.getAttribute()` yields `null`
 *   when the attribute is absent, so `null` and `undefined` are accepted.
 * @returns The collapsed value, or `''` when the attribute is missing or empty.
 */
function cleanAttribute(attribute: string | null | undefined): string {
    // Falsy covers null, undefined and the empty string alike.
    return attribute ? attribute.replace(/(\n+\s*)+/g, '\n') : '';
}
backend/functions/src/db/img-alt.ts ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Also, Prop } from 'civkit';
2
+ import { FirestoreRecord } from '../shared/lib/firestore';
3
+ import _ from 'lodash';
4
+
5
/**
 * Firestore record describing one image and its alt text, stored in the
 * `imgAlts` collection.
 *
 * `AltTextService.getAltTextAndShortDigest` writes these records with the
 * document id set to the base64url form of the MD5 digest of the image URL.
 */
@Also({ dictOf: Object })
export class ImgAlt extends FirestoreRecord {
    static override collectionName = 'imgAlts';

    /** Document id; the writer uses the short (base64url) URL digest. */
    override _id!: string;

    /** Image URL the record was created for. */
    @Prop({ required: true })
    src!: string;

    /** Hex MD5 digest of `src`. */
    @Prop({ required: true })
    urlDigest!: string;

    /** Written from the image's `naturalWidth` (0 when unknown). */
    @Prop()
    width?: number;

    /** Written from the image's `naturalHeight` (0 when unknown). */
    @Prop()
    height?: number;

    /** Machine-generated caption; written as `''` when none was produced. */
    @Prop()
    generatedAlt?: string;

    /** Alt text found on the page; written as `''` when there was none. */
    @Prop()
    originalAlt?: string;

    /** Timestamp set by the writer on every save. */
    @Prop()
    createdAt!: Date;

    // NOTE(review): nothing visible in this change sets `expireAt`; presumably
    // reserved for a TTL policy — confirm before relying on it.
    @Prop()
    expireAt?: Date;

    /** Open index signature: arbitrary extra fields are tolerated on the record. */
    [k: string]: any;
}
backend/functions/src/services/alt-text.ts ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
2
+ import { singleton } from 'tsyringe';
3
+ import { Logger } from '../shared/services/logger';
4
+ import { CanvasService } from '../shared/services/canvas';
5
+ import { ImageInterrogationManager } from '../shared/services/common-iminterrogate';
6
+ import { ImgBrief } from './puppeteer';
7
+ import { ImgAlt } from '../db/img-alt';
8
+
9
+
10
+ const md5Hasher = new HashManager('md5', 'hex');
11
+
12
/**
 * Generates alt text for images and persists the outcome as `ImgAlt` records.
 *
 * Records are keyed by the base64url encoding of the MD5 digest of the image
 * URL, so the same URL always maps to the same document.
 */
@singleton()
export class AltTextService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected imageInterrogator: ImageInterrogationManager,
        protected canvasService: CanvasService
    ) {
        super(...arguments);
    }

    /** Waits for the injected dependencies, then signals readiness. */
    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

    /**
     * Captions the image at `url` with the `blip2` interrogation model.
     *
     * The image is loaded through the canvas service, passed through
     * `fitImageToSquareBox(…, 1024)` and exported as PNG before interrogation.
     * Line breaks, double quotes and a trailing period are stripped from the
     * model output, which is then trimmed.
     *
     * @param url - Location of the image to caption.
     * @returns The cleaned caption text.
     * @throws AssertionFailureError wrapping any failure from loading,
     *   resizing, exporting or interrogating the image.
     */
    async caption(url: string) {
        try {
            const source = await this.canvasService.loadImage(url);
            const boxed = this.canvasService.fitImageToSquareBox(source, 1024);
            const png = await this.canvasService.canvasToBuffer(boxed, 'image/png');

            const rawCaption = await this.imageInterrogator.interrogate('blip2', {
                image: png,
                // prompt: `A formal caption in one sentence, concise and in the third person: HTML <img> alt text of this image. Return "**NSFW**" if you don't feel comfortable captioning it.`
            });

            const stripped = rawCaption.replaceAll(/[\n\"]|(\.\s*$)/g, '');

            return stripped.trim();
        } catch (err) {
            throw new AssertionFailureError({ message: `Could not generate alt text for url ${url}`, cause: err });
        }
    }

    /**
     * Resolves the short digest and, where available, generated alt text for an image.
     *
     * A stored record with a non-empty `generatedAlt` is returned as-is.
     * Otherwise a caption is generated only when the image carries no alt text
     * of its own, and the record is (re)written with merge semantics either way.
     * Caption failures are logged and do not fail the call.
     *
     * @param imgBrief - Image summary collected from the page.
     * @returns `undefined` when the image has no `src`; otherwise the short
     *   digest plus the generated alt text (`undefined` if none was generated).
     */
    async getAltTextAndShortDigest(imgBrief: ImgBrief) {
        const { src } = imgBrief;
        if (!src) {
            return undefined;
        }

        const digest = md5Hasher.hash(src);
        const shortDigest = Buffer.from(digest, 'hex').toString('base64url');

        const stored = await ImgAlt.fromFirestore(shortDigest);
        if (stored?.generatedAlt) {
            return { shortDigest, alt: stored.generatedAlt };
        }

        // Only spend an interrogation call when the page supplied no alt text.
        let freshCaption: string | undefined;
        if (!imgBrief.alt) {
            try {
                freshCaption = await this.caption(src);
            } catch (err) {
                this.logger.warn(`Unable to generate alt text for ${src}`, { err });
            }
        }

        const record = {
            _id: shortDigest,
            src: src || '',
            width: imgBrief.naturalWidth || 0,
            height: imgBrief.naturalHeight || 0,
            urlDigest: digest,
            originalAlt: imgBrief.alt || '',
            generatedAlt: freshCaption || '',
            createdAt: new Date()
        };
        await ImgAlt.COLLECTION.doc(shortDigest).set(record, { merge: true });

        return { shortDigest, alt: freshCaption };
    }
}
backend/functions/src/services/puppeteer.ts CHANGED
@@ -7,11 +7,19 @@ import os from 'os';
7
  import fs from 'fs';
8
  import { Crawled } from '../db/crawled';
9
  import puppeteer from 'puppeteer-extra';
10
- import puppeteerStealth from 'puppeteer-extra-plugin-stealth';
11
-
12
 
13
  const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
14
 
 
 
 
 
 
 
 
 
 
 
15
  export interface PageSnapshot {
16
  title: string;
17
  href: string;
@@ -30,13 +38,16 @@ export interface PageSnapshot {
30
  publishedTime: string;
31
  } | null;
32
  screenshot?: Buffer;
 
33
  }
34
  const md5Hasher = new HashManager('md5', 'hex');
35
 
 
36
  puppeteer.use(puppeteerStealth());
37
  // const puppeteerUAOverride = require('puppeteer-extra-plugin-stealth/evasions/user-agent-override');
38
  // puppeteer.use(puppeteerUAOverride({
39
- // userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`
 
40
  // }))
41
 
42
  @singleton()
@@ -84,7 +95,7 @@ export class PuppeteerControl extends AsyncService {
84
  this.browser = await puppeteer.launch({
85
  headless: true,
86
  timeout: 10_000
87
- }).catch((err) => {
88
  this.logger.error(`Unknown firebase issue, just die fast.`, { err });
89
  process.nextTick(() => {
90
  this.emit('error', err);
@@ -117,23 +128,42 @@ export class PuppeteerControl extends AsyncService {
117
  }));
118
  preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
119
  preparations.push(page.evaluateOnNewDocument(`
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  function giveSnapshot() {
121
- let parsedContent;
122
  try {
123
- // Attempt to parse the cloned document
124
- parsedContent = new Readability(document.cloneNode(true)).parse();
125
- } catch (error) {
126
- // If an error occurs, log it and set parsedContent to undefined
127
- parsedContent = undefined;
128
  }
129
 
130
- return {
131
  title: document.title,
132
  href: document.location.href,
133
  html: document.documentElement.outerHTML,
134
  text: document.body.innerText,
135
- parsed: parsedContent
 
136
  };
 
 
 
 
 
 
 
137
  }
138
  `));
139
  preparations.push(page.evaluateOnNewDocument(() => {
 
7
  import fs from 'fs';
8
  import { Crawled } from '../db/crawled';
9
  import puppeteer from 'puppeteer-extra';
 
 
10
 
11
  const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
12
 
13
/**
 * Summary of one `<img>` element, as collected in the page by the injected
 * `briefImgs` helper.
 */
export interface ImgBrief {
    /** The element's `src` property. */
    src: string;
    /** The element's `complete` flag at snapshot time. */
    loaded: boolean;
    /** The element's `width` property. */
    width: number;
    /** The element's `height` property. */
    height: number;
    /** The element's `naturalWidth` property. */
    naturalWidth: number;
    /** The element's `naturalHeight` property. */
    naturalHeight: number;
    /** The element's `alt`, falling back to its `title`. */
    alt?: string;
}
22
+
23
  export interface PageSnapshot {
24
  title: string;
25
  href: string;
 
38
  publishedTime: string;
39
  } | null;
40
  screenshot?: Buffer;
41
+ imgs?: ImgBrief[];
42
  }
43
  const md5Hasher = new HashManager('md5', 'hex');
44
 
45
+ const puppeteerStealth = require('puppeteer-extra-plugin-stealth');
46
  puppeteer.use(puppeteerStealth());
47
  // const puppeteerUAOverride = require('puppeteer-extra-plugin-stealth/evasions/user-agent-override');
48
  // puppeteer.use(puppeteerUAOverride({
49
+ // userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`,
50
+ // platform: `Linux`,
51
  // }))
52
 
53
  @singleton()
 
95
  this.browser = await puppeteer.launch({
96
  headless: true,
97
  timeout: 10_000
98
+ }).catch((err: any) => {
99
  this.logger.error(`Unknown firebase issue, just die fast.`, { err });
100
  process.nextTick(() => {
101
  this.emit('error', err);
 
128
  }));
129
  preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
130
  preparations.push(page.evaluateOnNewDocument(`
131
+ function briefImgs(elem) {
132
+ const imageTags = Array.from((elem || document).querySelectorAll('img[src]'));
133
+
134
+ return imageTags.map((x)=> ({
135
+ src: x.src,
136
+ loaded: x.complete,
137
+ width: x.width,
138
+ height: x.height,
139
+ naturalWidth: x.naturalWidth,
140
+ naturalHeight: x.naturalHeight,
141
+ alt: x.alt || x.title,
142
+ }));
143
+ }
144
  function giveSnapshot() {
145
+ let parsed;
146
  try {
147
+ parsed = new Readability(document.cloneNode(true)).parse();
148
+ } catch (err) {
149
+ void 0;
 
 
150
  }
151
 
152
+ const r = {
153
  title: document.title,
154
  href: document.location.href,
155
  html: document.documentElement.outerHTML,
156
  text: document.body.innerText,
157
+ parsed: parsed,
158
+ imgs: [],
159
  };
160
+ if (parsed && parsed.content) {
161
+ const elem = document.createElement('div');
162
+ elem.innerHTML = parsed.content;
163
+ r.imgs = briefImgs(elem);
164
+ }
165
+
166
+ return r;
167
  }
168
  `));
169
  preparations.push(page.evaluateOnNewDocument(() => {
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 9f0fa1dd7f8cfcea4c8d79252319b151fae6ed19
 
1
+ Subproject commit bea967a371581c1109dc0101dbcab196e9ed9ade