Spaces:
Build error
Build error
feat: add image captioning (#6)
Browse files
* Fix contentText assignment in CrawlerHost class
* fix: recover vscode configurations
* feat: add image captioning
* feat: add image captioning
* clean: vscode config
* chore: fix some ts warnings
* feat: auto alt text
* fix
* chore: improve prompt
* clean: unused config
* fix: failure condition
* fix: remove redundant code
* fix: catch parse error
* fix: catch parse error
---------
Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>
- .gitignore +1 -3
- .vscode/exensions.json +10 -0
- .vscode/launch.json +60 -0
- .vscode/settings.json +32 -0
- .vscode/tasks.json +156 -0
- backend/functions/.puppeteerrc.cjs +4 -9
- backend/functions/src/cloud-functions/crawler.ts +63 -12
- backend/functions/src/db/img-alt.ts +42 -0
- backend/functions/src/services/alt-text.ts +91 -0
- backend/functions/src/services/puppeteer.ts +42 -12
- thinapps-shared +1 -1
.gitignore
CHANGED
|
@@ -1,4 +1,2 @@
|
|
| 1 |
node_modules/
|
| 2 |
-
.DS_Store
|
| 3 |
-
.vscode
|
| 4 |
-
.cache
|
|
|
|
| 1 |
node_modules/
|
| 2 |
+
.DS_Store
|
|
|
|
|
|
.vscode/exensions.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"recommendations": [
|
| 3 |
+
"editorconfig.editorconfig",
|
| 4 |
+
"octref.vetur",
|
| 5 |
+
"redhat.vscode-yaml",
|
| 6 |
+
"dbaeumer.vscode-eslint",
|
| 7 |
+
"esbenp.prettier-vscode",
|
| 8 |
+
"streetsidesoftware.code-spell-checker"
|
| 9 |
+
]
|
| 10 |
+
}
|
.vscode/launch.json
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": "0.2.0",
|
| 3 |
+
"configurations": [
|
| 4 |
+
{
|
| 5 |
+
"name": "Debug Fullstack: attach",
|
| 6 |
+
"request": "attach",
|
| 7 |
+
"cwd": "${workspaceFolder}/backend/functions",
|
| 8 |
+
"skipFiles": [
|
| 9 |
+
"<node_internals>/**"
|
| 10 |
+
],
|
| 11 |
+
"type": "node",
|
| 12 |
+
"preLaunchTask": "Fullstack:debug"
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"name": "Debug Fullstack: attach: with proxy",
|
| 16 |
+
"request": "attach",
|
| 17 |
+
"cwd": "${workspaceFolder}/backend/functions",
|
| 18 |
+
"skipFiles": [
|
| 19 |
+
"<node_internals>/**"
|
| 20 |
+
],
|
| 21 |
+
"type": "node",
|
| 22 |
+
"preLaunchTask": "Fullstack:debug:with-proxy"
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"name": "Attach",
|
| 26 |
+
"port": 9229,
|
| 27 |
+
"request": "attach",
|
| 28 |
+
"skipFiles": [
|
| 29 |
+
"<node_internals>/**"
|
| 30 |
+
],
|
| 31 |
+
"type": "node"
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"name": "Attach by Process ID",
|
| 35 |
+
"processId": "${command:PickProcess}",
|
| 36 |
+
"request": "attach",
|
| 37 |
+
"skipFiles": [
|
| 38 |
+
"<node_internals>/**"
|
| 39 |
+
],
|
| 40 |
+
"type": "node"
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"name": "Debug Fullstack",
|
| 44 |
+
"request": "launch",
|
| 45 |
+
"runtimeArgs": [
|
| 46 |
+
"emulators:start",
|
| 47 |
+
"--import=../.firebase-emu",
|
| 48 |
+
"--export-on-exit=../.firebase-emu",
|
| 49 |
+
],
|
| 50 |
+
"cwd": "${workspaceFolder}/backend/functions",
|
| 51 |
+
"runtimeExecutable": "${workspaceFolder}/node_modules/.bin/firebase",
|
| 52 |
+
"skipFiles": [
|
| 53 |
+
"<node_internals>/**"
|
| 54 |
+
],
|
| 55 |
+
"type": "node",
|
| 56 |
+
"preLaunchTask": "Fullstack:prepare",
|
| 57 |
+
"killBehavior": "polite"
|
| 58 |
+
},
|
| 59 |
+
]
|
| 60 |
+
}
|
.vscode/settings.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"editor.wordWrap": "on",
|
| 3 |
+
"editor.wordWrapColumn": 120,
|
| 4 |
+
"files.trimTrailingWhitespace": true,
|
| 5 |
+
"files.trimFinalNewlines": true,
|
| 6 |
+
"[javascript]": {
|
| 7 |
+
"editor.defaultFormatter": "vscode.typescript-language-features"
|
| 8 |
+
},
|
| 9 |
+
"[jsonc]": {
|
| 10 |
+
"editor.defaultFormatter": "vscode.json-language-features"
|
| 11 |
+
},
|
| 12 |
+
"[typescript]": {
|
| 13 |
+
"editor.defaultFormatter": "vscode.typescript-language-features"
|
| 14 |
+
},
|
| 15 |
+
"[json]": {
|
| 16 |
+
"editor.defaultFormatter": "vscode.json-language-features"
|
| 17 |
+
},
|
| 18 |
+
"[yaml]": {
|
| 19 |
+
"editor.defaultFormatter": "redhat.vscode-yaml"
|
| 20 |
+
},
|
| 21 |
+
"[markdown]": {
|
| 22 |
+
"files.trimTrailingWhitespace": false
|
| 23 |
+
},
|
| 24 |
+
"typescript.tsdk": "node_modules/typescript/lib",
|
| 25 |
+
"typescript.preferences.quoteStyle": "single",
|
| 26 |
+
"typescript.format.semicolons": "insert",
|
| 27 |
+
"typescript.preferences.importModuleSpecifier": "project-relative",
|
| 28 |
+
"typescript.locale": "en",
|
| 29 |
+
"cSpell.enabled": true,
|
| 30 |
+
"cSpell.words": [
|
| 31 |
+
],
|
| 32 |
+
}
|
.vscode/tasks.json
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": "2.0.0",
|
| 3 |
+
"tasks": [
|
| 4 |
+
{
|
| 5 |
+
"type": "npm",
|
| 6 |
+
"script": "build",
|
| 7 |
+
"group": "build",
|
| 8 |
+
"options": {
|
| 9 |
+
"cwd": "${workspaceFolder}/backend/functions"
|
| 10 |
+
},
|
| 11 |
+
"problemMatcher": [],
|
| 12 |
+
"label": "Backend:rebuild",
|
| 13 |
+
"detail": "Backend:rebuild"
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"type": "npm",
|
| 17 |
+
"script": "emu:reset",
|
| 18 |
+
"group": "build",
|
| 19 |
+
"options": {
|
| 20 |
+
"cwd": "${workspaceFolder}/backend/functions"
|
| 21 |
+
},
|
| 22 |
+
"problemMatcher": [],
|
| 23 |
+
"label": "Backend:reset-emulator",
|
| 24 |
+
"detail": "Backend:reset-emulator"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"type": "typescript",
|
| 28 |
+
"options": {
|
| 29 |
+
"cwd": "${workspaceFolder}/backend/functions"
|
| 30 |
+
},
|
| 31 |
+
"tsconfig": "backend/functions/tsconfig.json",
|
| 32 |
+
"option": "watch",
|
| 33 |
+
"isBackground": true,
|
| 34 |
+
"problemMatcher": [
|
| 35 |
+
"$tsc-watch"
|
| 36 |
+
],
|
| 37 |
+
"group": "build",
|
| 38 |
+
"label": "Backend:build:watch"
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"type": "npm",
|
| 42 |
+
"script": "emu:debug",
|
| 43 |
+
"group": "none",
|
| 44 |
+
"options": {
|
| 45 |
+
"cwd": "${workspaceFolder}/backend/functions"
|
| 46 |
+
},
|
| 47 |
+
"problemMatcher": [
|
| 48 |
+
{
|
| 49 |
+
"base": "$tsc",
|
| 50 |
+
"background": {
|
| 51 |
+
"activeOnStart": false,
|
| 52 |
+
"beginsPattern": "shutdown requested|Starting emulators",
|
| 53 |
+
"endsPattern": "Debugger listening"
|
| 54 |
+
}
|
| 55 |
+
}
|
| 56 |
+
],
|
| 57 |
+
"label": "Backend:start-emulator-debug",
|
| 58 |
+
"detail": "Backend:start-emulator-debug",
|
| 59 |
+
"dependsOn": [
|
| 60 |
+
"Backend:build:watch"
|
| 61 |
+
],
|
| 62 |
+
"isBackground": true,
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"type": "npm",
|
| 66 |
+
"script": "dev",
|
| 67 |
+
"options": {
|
| 68 |
+
"cwd": "${workspaceFolder}/webapp",
|
| 69 |
+
},
|
| 70 |
+
"group": "build",
|
| 71 |
+
"label": "Frontend:start:dev",
|
| 72 |
+
"detail": "Frontend:start:dev",
|
| 73 |
+
"isBackground": true,
|
| 74 |
+
"problemMatcher": {
|
| 75 |
+
"base": "$vite",
|
| 76 |
+
"background": {
|
| 77 |
+
"activeOnStart": true,
|
| 78 |
+
"endsPattern": "OK",
|
| 79 |
+
"beginsPattern": "vite"
|
| 80 |
+
}
|
| 81 |
+
},
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"type": "npm",
|
| 85 |
+
"script": "dev",
|
| 86 |
+
"options": {
|
| 87 |
+
"cwd": "${workspaceFolder}/webapp",
|
| 88 |
+
"env": {
|
| 89 |
+
"FIREBASE_EMULATE": "true",
|
| 90 |
+
}
|
| 91 |
+
},
|
| 92 |
+
"group": "build",
|
| 93 |
+
"label": "Frontend:start:emu",
|
| 94 |
+
"detail": "Frontend:start:emu",
|
| 95 |
+
"isBackground": true,
|
| 96 |
+
"problemMatcher": {
|
| 97 |
+
"base": "$vite",
|
| 98 |
+
"background": {
|
| 99 |
+
"activeOnStart": true,
|
| 100 |
+
"endsPattern": "OK",
|
| 101 |
+
"beginsPattern": "vite"
|
| 102 |
+
}
|
| 103 |
+
},
|
| 104 |
+
},
|
| 105 |
+
{
|
| 106 |
+
"type": "npm",
|
| 107 |
+
"script": "emu:debug2",
|
| 108 |
+
"group": "none",
|
| 109 |
+
"options": {
|
| 110 |
+
"cwd": "${workspaceFolder}/backend/functions",
|
| 111 |
+
"env": {
|
| 112 |
+
"https_proxy": "http://127.0.0.1:7890",
|
| 113 |
+
"http_proxy": "http://127.0.0.1:7890",
|
| 114 |
+
"all_proxy": "socks5://127.0.0.1:7890"
|
| 115 |
+
}
|
| 116 |
+
},
|
| 117 |
+
"problemMatcher": [
|
| 118 |
+
{
|
| 119 |
+
"base": "$tsc",
|
| 120 |
+
"background": {
|
| 121 |
+
"activeOnStart": false,
|
| 122 |
+
"beginsPattern": "shutdown requested|Starting emulators",
|
| 123 |
+
"endsPattern": "Debugger listening"
|
| 124 |
+
}
|
| 125 |
+
}
|
| 126 |
+
],
|
| 127 |
+
"label": "Backend:start-emulator-debug:with-proxy",
|
| 128 |
+
"detail": "Backend:start-emulator-debug:with-proxy",
|
| 129 |
+
"dependsOn": [
|
| 130 |
+
"Backend:build:watch"
|
| 131 |
+
],
|
| 132 |
+
"isBackground": true,
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"label": "Fullstack:prepare",
|
| 136 |
+
"dependsOn": [
|
| 137 |
+
"Frontend:start:emu",
|
| 138 |
+
"Backend:build:watch",
|
| 139 |
+
],
|
| 140 |
+
},
|
| 141 |
+
{
|
| 142 |
+
"label": "Fullstack:debug",
|
| 143 |
+
"dependsOn": [
|
| 144 |
+
// "Frontend:start:emu",
|
| 145 |
+
"Backend:start-emulator-debug",
|
| 146 |
+
],
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"label": "Fullstack:debug:with-proxy",
|
| 150 |
+
"dependsOn": [
|
| 151 |
+
"Frontend:start:emu",
|
| 152 |
+
"Backend:start-emulator-debug:with-proxy",
|
| 153 |
+
],
|
| 154 |
+
}
|
| 155 |
+
]
|
| 156 |
+
}
|
backend/functions/.puppeteerrc.cjs
CHANGED
|
@@ -1,14 +1,9 @@
|
|
| 1 |
const { join } = require('path');
|
| 2 |
|
| 3 |
-
let config = {};
|
| 4 |
-
if (!process.env.FUNCTIONS_EMULATOR) {
|
| 5 |
-
config = {
|
| 6 |
-
// Changes the cache location for Puppeteer.
|
| 7 |
-
cacheDirectory: join(__dirname, 'node_modules', 'puppeteer', 'walk-around-lame-gcp-build'),
|
| 8 |
-
};
|
| 9 |
-
}
|
| 10 |
-
|
| 11 |
/**
|
| 12 |
* @type {import("puppeteer").Configuration}
|
| 13 |
*/
|
| 14 |
-
module.exports = config;
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
const { join } = require('path');
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
/**
|
| 4 |
* @type {import("puppeteer").Configuration}
|
| 5 |
*/
|
| 6 |
+
module.exports = {
|
| 7 |
+
// Changes the cache location for Puppeteer.
|
| 8 |
+
cacheDirectory: join(__dirname, 'node_modules', 'puppeteer', 'walk-around-lame-gcp-build'),
|
| 9 |
+
};
|
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -3,9 +3,10 @@ import { singleton } from 'tsyringe';
|
|
| 3 |
import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
| 4 |
import _ from 'lodash';
|
| 5 |
import { PageSnapshot, PuppeteerControl } from '../services/puppeteer';
|
| 6 |
-
import TurnDownService from 'turndown';
|
| 7 |
import { Request, Response } from 'express';
|
| 8 |
import normalizeUrl from "@esm2cjs/normalize-url";
|
|
|
|
|
|
|
| 9 |
|
| 10 |
function tidyMarkdown(markdown: string): string {
|
| 11 |
|
|
@@ -50,11 +51,14 @@ function tidyMarkdown(markdown: string): string {
|
|
| 50 |
export class CrawlerHost extends RPCHost {
|
| 51 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 52 |
|
| 53 |
-
|
|
|
|
|
|
|
| 54 |
|
| 55 |
constructor(
|
| 56 |
protected globalLogger: Logger,
|
| 57 |
protected puppeteerControl: PuppeteerControl,
|
|
|
|
| 58 |
) {
|
| 59 |
super(...arguments);
|
| 60 |
}
|
|
@@ -65,14 +69,57 @@ export class CrawlerHost extends RPCHost {
|
|
| 65 |
this.emit('ready');
|
| 66 |
}
|
| 67 |
|
| 68 |
-
formatSnapshot(snapshot: PageSnapshot) {
|
| 69 |
-
|
| 70 |
const toBeTurnedToMd = snapshot.parsed?.content;
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
const formatted = {
|
| 78 |
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
|
@@ -142,7 +189,7 @@ ${this.content}
|
|
| 142 |
continue;
|
| 143 |
}
|
| 144 |
|
| 145 |
-
const formatted = this.formatSnapshot(scrapped);
|
| 146 |
|
| 147 |
if (scrapped.screenshot && screenshotEnabled) {
|
| 148 |
sseStream.write({
|
|
@@ -177,7 +224,7 @@ ${this.content}
|
|
| 177 |
continue;
|
| 178 |
}
|
| 179 |
|
| 180 |
-
const formatted = this.formatSnapshot(scrapped);
|
| 181 |
|
| 182 |
return formatted;
|
| 183 |
}
|
|
@@ -186,7 +233,7 @@ ${this.content}
|
|
| 186 |
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
| 187 |
}
|
| 188 |
|
| 189 |
-
return this.formatSnapshot(lastScrapped);
|
| 190 |
}
|
| 191 |
|
| 192 |
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
|
|
@@ -195,7 +242,7 @@ ${this.content}
|
|
| 195 |
continue;
|
| 196 |
}
|
| 197 |
|
| 198 |
-
const formatted = this.formatSnapshot(scrapped);
|
| 199 |
|
| 200 |
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
| 201 |
}
|
|
@@ -204,8 +251,12 @@ ${this.content}
|
|
| 204 |
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
| 205 |
}
|
| 206 |
|
| 207 |
-
return `${this.formatSnapshot(lastScrapped)}`;
|
| 208 |
}
|
| 209 |
|
| 210 |
|
| 211 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
| 4 |
import _ from 'lodash';
|
| 5 |
import { PageSnapshot, PuppeteerControl } from '../services/puppeteer';
|
|
|
|
| 6 |
import { Request, Response } from 'express';
|
| 7 |
import normalizeUrl from "@esm2cjs/normalize-url";
|
| 8 |
+
import { AltTextService } from '../services/alt-text';
|
| 9 |
+
import TurndownService from 'turndown';
|
| 10 |
|
| 11 |
function tidyMarkdown(markdown: string): string {
|
| 12 |
|
|
|
|
| 51 |
export class CrawlerHost extends RPCHost {
|
| 52 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 53 |
|
| 54 |
+
turnDownPlugins = [require('turndown-plugin-gfm').gfm];
|
| 55 |
+
|
| 56 |
+
imageShortUrlPrefix?: string;
|
| 57 |
|
| 58 |
constructor(
|
| 59 |
protected globalLogger: Logger,
|
| 60 |
protected puppeteerControl: PuppeteerControl,
|
| 61 |
+
protected altTextService: AltTextService,
|
| 62 |
) {
|
| 63 |
super(...arguments);
|
| 64 |
}
|
|
|
|
| 69 |
this.emit('ready');
|
| 70 |
}
|
| 71 |
|
| 72 |
+
async formatSnapshot(snapshot: PageSnapshot) {
|
|
|
|
| 73 |
const toBeTurnedToMd = snapshot.parsed?.content;
|
| 74 |
+
let turnDownService = new TurndownService();
|
| 75 |
+
for (const plugin of this.turnDownPlugins) {
|
| 76 |
+
turnDownService = turnDownService.use(plugin);
|
| 77 |
+
}
|
| 78 |
|
| 79 |
+
let contentText = '';
|
| 80 |
+
if (toBeTurnedToMd) {
|
| 81 |
+
const urlToAltMap: { [k: string]: { shortDigest: string, alt?: string; }; } = {};
|
| 82 |
+
const tasks = (snapshot.imgs || []).map(async (x) => {
|
| 83 |
+
const r = await this.altTextService.getAltTextAndShortDigest(x).catch((err)=> {
|
| 84 |
+
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
| 85 |
+
return undefined;
|
| 86 |
+
});
|
| 87 |
+
if (r) {
|
| 88 |
+
urlToAltMap[x.src.trim()] = r;
|
| 89 |
+
}
|
| 90 |
+
});
|
| 91 |
|
| 92 |
+
await Promise.all(tasks);
|
| 93 |
+
let imgIdx = 0;
|
| 94 |
+
|
| 95 |
+
turnDownService.addRule('img-generated-alt', {
|
| 96 |
+
filter: 'img',
|
| 97 |
+
replacement: (_content, node) => {
|
| 98 |
+
const src = (node.getAttribute('src') || '').trim();
|
| 99 |
+
const alt = cleanAttribute(node.getAttribute('alt'));
|
| 100 |
+
if (!src) {
|
| 101 |
+
return '';
|
| 102 |
+
}
|
| 103 |
+
const mapped = urlToAltMap[src];
|
| 104 |
+
imgIdx++;
|
| 105 |
+
if (mapped) {
|
| 106 |
+
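return `![Image ${imgIdx}: ${mapped.alt || alt}](${this.imageShortUrlPrefix ? `${this.imageShortUrlPrefix}/${mapped.shortDigest}` : src})`;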
|
| 107 |
+
}
|
| 108 |
+
return `![Image ${imgIdx}: ${alt}](${src})`;
|
| 109 |
+
}
|
| 110 |
+
});
|
| 111 |
+
|
| 112 |
+
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) {
|
| 116 |
+
contentText = turnDownService.turndown(snapshot.html);
|
| 117 |
+
}
|
| 118 |
+
if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) {
|
| 119 |
+
contentText = snapshot.text;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
const cleanText = tidyMarkdown(contentText || '').trim();
|
| 123 |
|
| 124 |
const formatted = {
|
| 125 |
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
|
|
|
| 189 |
continue;
|
| 190 |
}
|
| 191 |
|
| 192 |
+
const formatted = await this.formatSnapshot(scrapped);
|
| 193 |
|
| 194 |
if (scrapped.screenshot && screenshotEnabled) {
|
| 195 |
sseStream.write({
|
|
|
|
| 224 |
continue;
|
| 225 |
}
|
| 226 |
|
| 227 |
+
const formatted = await this.formatSnapshot(scrapped);
|
| 228 |
|
| 229 |
return formatted;
|
| 230 |
}
|
|
|
|
| 233 |
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
| 234 |
}
|
| 235 |
|
| 236 |
+
return await this.formatSnapshot(lastScrapped);
|
| 237 |
}
|
| 238 |
|
| 239 |
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
|
|
|
|
| 242 |
continue;
|
| 243 |
}
|
| 244 |
|
| 245 |
+
const formatted = await this.formatSnapshot(scrapped);
|
| 246 |
|
| 247 |
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
| 248 |
}
|
|
|
|
| 251 |
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
| 252 |
}
|
| 253 |
|
| 254 |
+
return `${await this.formatSnapshot(lastScrapped)}`;
|
| 255 |
}
|
| 256 |
|
| 257 |
|
| 258 |
}
|
| 259 |
+
|
| 260 |
+
function cleanAttribute(attribute: string) {
|
| 261 |
+
return attribute ? attribute.replace(/(\n+\s*)+/g, '\n') : '';
|
| 262 |
+
}
|
backend/functions/src/db/img-alt.ts
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { Also, Prop } from 'civkit';
|
| 2 |
+
import { FirestoreRecord } from '../shared/lib/firestore';
|
| 3 |
+
import _ from 'lodash';
|
| 4 |
+
|
| 5 |
+
@Also({
|
| 6 |
+
dictOf: Object
|
| 7 |
+
})
|
| 8 |
+
export class ImgAlt extends FirestoreRecord {
|
| 9 |
+
static override collectionName = 'imgAlts';
|
| 10 |
+
|
| 11 |
+
override _id!: string;
|
| 12 |
+
|
| 13 |
+
@Prop({
|
| 14 |
+
required: true
|
| 15 |
+
})
|
| 16 |
+
src!: string;
|
| 17 |
+
|
| 18 |
+
@Prop({
|
| 19 |
+
required: true
|
| 20 |
+
})
|
| 21 |
+
urlDigest!: string;
|
| 22 |
+
|
| 23 |
+
@Prop()
|
| 24 |
+
width?: number;
|
| 25 |
+
|
| 26 |
+
@Prop()
|
| 27 |
+
height?: number;
|
| 28 |
+
|
| 29 |
+
@Prop()
|
| 30 |
+
generatedAlt?: string;
|
| 31 |
+
|
| 32 |
+
@Prop()
|
| 33 |
+
originalAlt?: string;
|
| 34 |
+
|
| 35 |
+
@Prop()
|
| 36 |
+
createdAt!: Date;
|
| 37 |
+
|
| 38 |
+
@Prop()
|
| 39 |
+
expireAt?: Date;
|
| 40 |
+
|
| 41 |
+
[k: string]: any;
|
| 42 |
+
}
|
backend/functions/src/services/alt-text.ts
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
|
| 2 |
+
import { singleton } from 'tsyringe';
|
| 3 |
+
import { Logger } from '../shared/services/logger';
|
| 4 |
+
import { CanvasService } from '../shared/services/canvas';
|
| 5 |
+
import { ImageInterrogationManager } from '../shared/services/common-iminterrogate';
|
| 6 |
+
import { ImgBrief } from './puppeteer';
|
| 7 |
+
import { ImgAlt } from '../db/img-alt';
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
const md5Hasher = new HashManager('md5', 'hex');
|
| 11 |
+
|
| 12 |
+
@singleton()
|
| 13 |
+
export class AltTextService extends AsyncService {
|
| 14 |
+
|
| 15 |
+
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 16 |
+
|
| 17 |
+
constructor(
|
| 18 |
+
protected globalLogger: Logger,
|
| 19 |
+
protected imageInterrogator: ImageInterrogationManager,
|
| 20 |
+
protected canvasService: CanvasService
|
| 21 |
+
) {
|
| 22 |
+
super(...arguments);
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
override async init() {
|
| 26 |
+
await this.dependencyReady();
|
| 27 |
+
this.emit('ready');
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
async caption(url: string) {
|
| 31 |
+
try {
|
| 32 |
+
const img = await this.canvasService.loadImage(url);
|
| 33 |
+
const resized = this.canvasService.fitImageToSquareBox(img, 1024);
|
| 34 |
+
const exported = await this.canvasService.canvasToBuffer(resized, 'image/png');
|
| 35 |
+
|
| 36 |
+
const r = await this.imageInterrogator.interrogate('blip2', {
|
| 37 |
+
image: exported,
|
| 38 |
+
// prompt: `A formal caption in one sentence, concise and in the third person: HTML <img> alt text of this image. Return "**NSFW**" if you don't feel comfortable captioning it.`
|
| 39 |
+
});
|
| 40 |
+
|
| 41 |
+
return r.replaceAll(/[\n\"]|(\.\s*$)/g, '').trim();
|
| 42 |
+
} catch (err) {
|
| 43 |
+
throw new AssertionFailureError({ message: `Could not generate alt text for url ${url}`, cause: err });
|
| 44 |
+
}
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
async getAltTextAndShortDigest(imgBrief: ImgBrief) {
|
| 48 |
+
if (!imgBrief.src) {
|
| 49 |
+
return undefined;
|
| 50 |
+
}
|
| 51 |
+
const digest = md5Hasher.hash(imgBrief.src);
|
| 52 |
+
const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
|
| 53 |
+
|
| 54 |
+
const existing = await ImgAlt.fromFirestore(shortDigest);
|
| 55 |
+
|
| 56 |
+
if (existing?.generatedAlt) {
|
| 57 |
+
return {
|
| 58 |
+
shortDigest,
|
| 59 |
+
alt: existing.generatedAlt,
|
| 60 |
+
};
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
let generatedCaption;
|
| 64 |
+
|
| 65 |
+
if (!imgBrief.alt) {
|
| 66 |
+
try {
|
| 67 |
+
generatedCaption = await this.caption(imgBrief.src);
|
| 68 |
+
} catch (err) {
|
| 69 |
+
this.logger.warn(`Unable to generate alt text for ${imgBrief.src}`, { err });
|
| 70 |
+
}
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
await ImgAlt.COLLECTION.doc(shortDigest).set(
|
| 74 |
+
{
|
| 75 |
+
_id: shortDigest,
|
| 76 |
+
src: imgBrief.src || '',
|
| 77 |
+
width: imgBrief.naturalWidth || 0,
|
| 78 |
+
height: imgBrief.naturalHeight || 0,
|
| 79 |
+
urlDigest: digest,
|
| 80 |
+
originalAlt: imgBrief.alt || '',
|
| 81 |
+
generatedAlt: generatedCaption || '',
|
| 82 |
+
createdAt: new Date()
|
| 83 |
+
}, { merge: true }
|
| 84 |
+
);
|
| 85 |
+
|
| 86 |
+
return {
|
| 87 |
+
shortDigest,
|
| 88 |
+
alt: generatedCaption,
|
| 89 |
+
};
|
| 90 |
+
}
|
| 91 |
+
}
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -7,11 +7,19 @@ import os from 'os';
|
|
| 7 |
import fs from 'fs';
|
| 8 |
import { Crawled } from '../db/crawled';
|
| 9 |
import puppeteer from 'puppeteer-extra';
|
| 10 |
-
import puppeteerStealth from 'puppeteer-extra-plugin-stealth';
|
| 11 |
-
|
| 12 |
|
| 13 |
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
export interface PageSnapshot {
|
| 16 |
title: string;
|
| 17 |
href: string;
|
|
@@ -30,13 +38,16 @@ export interface PageSnapshot {
|
|
| 30 |
publishedTime: string;
|
| 31 |
} | null;
|
| 32 |
screenshot?: Buffer;
|
|
|
|
| 33 |
}
|
| 34 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 35 |
|
|
|
|
| 36 |
puppeteer.use(puppeteerStealth());
|
| 37 |
// const puppeteerUAOverride = require('puppeteer-extra-plugin-stealth/evasions/user-agent-override');
|
| 38 |
// puppeteer.use(puppeteerUAOverride({
|
| 39 |
-
// userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`
|
|
|
|
| 40 |
// }))
|
| 41 |
|
| 42 |
@singleton()
|
|
@@ -84,7 +95,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 84 |
this.browser = await puppeteer.launch({
|
| 85 |
headless: true,
|
| 86 |
timeout: 10_000
|
| 87 |
-
}).catch((err) => {
|
| 88 |
this.logger.error(`Unknown firebase issue, just die fast.`, { err });
|
| 89 |
process.nextTick(() => {
|
| 90 |
this.emit('error', err);
|
|
@@ -117,23 +128,42 @@ export class PuppeteerControl extends AsyncService {
|
|
| 117 |
}));
|
| 118 |
preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
|
| 119 |
preparations.push(page.evaluateOnNewDocument(`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
function giveSnapshot() {
|
| 121 |
-
let
|
| 122 |
try {
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
// If an error occurs, log it and set parsedContent to undefined
|
| 127 |
-
parsedContent = undefined;
|
| 128 |
}
|
| 129 |
|
| 130 |
-
|
| 131 |
title: document.title,
|
| 132 |
href: document.location.href,
|
| 133 |
html: document.documentElement.outerHTML,
|
| 134 |
text: document.body.innerText,
|
| 135 |
-
parsed:
|
|
|
|
| 136 |
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
}
|
| 138 |
`));
|
| 139 |
preparations.push(page.evaluateOnNewDocument(() => {
|
|
|
|
| 7 |
import fs from 'fs';
|
| 8 |
import { Crawled } from '../db/crawled';
|
| 9 |
import puppeteer from 'puppeteer-extra';
|
|
|
|
|
|
|
| 10 |
|
| 11 |
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
| 12 |
|
| 13 |
+
export interface ImgBrief {
|
| 14 |
+
src: string;
|
| 15 |
+
loaded: boolean;
|
| 16 |
+
width: number;
|
| 17 |
+
height: number;
|
| 18 |
+
naturalWidth: number;
|
| 19 |
+
naturalHeight: number;
|
| 20 |
+
alt?: string;
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
export interface PageSnapshot {
|
| 24 |
title: string;
|
| 25 |
href: string;
|
|
|
|
| 38 |
publishedTime: string;
|
| 39 |
} | null;
|
| 40 |
screenshot?: Buffer;
|
| 41 |
+
imgs?: ImgBrief[];
|
| 42 |
}
|
| 43 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 44 |
|
| 45 |
+
const puppeteerStealth = require('puppeteer-extra-plugin-stealth');
|
| 46 |
puppeteer.use(puppeteerStealth());
|
| 47 |
// const puppeteerUAOverride = require('puppeteer-extra-plugin-stealth/evasions/user-agent-override');
|
| 48 |
// puppeteer.use(puppeteerUAOverride({
|
| 49 |
+
// userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`,
|
| 50 |
+
// platform: `Linux`,
|
| 51 |
// }))
|
| 52 |
|
| 53 |
@singleton()
|
|
|
|
| 95 |
this.browser = await puppeteer.launch({
|
| 96 |
headless: true,
|
| 97 |
timeout: 10_000
|
| 98 |
+
}).catch((err: any) => {
|
| 99 |
this.logger.error(`Unknown firebase issue, just die fast.`, { err });
|
| 100 |
process.nextTick(() => {
|
| 101 |
this.emit('error', err);
|
|
|
|
| 128 |
}));
|
| 129 |
preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
|
| 130 |
preparations.push(page.evaluateOnNewDocument(`
|
| 131 |
+
function briefImgs(elem) {
|
| 132 |
+
const imageTags = Array.from((elem || document).querySelectorAll('img[src]'));
|
| 133 |
+
|
| 134 |
+
return imageTags.map((x)=> ({
|
| 135 |
+
src: x.src,
|
| 136 |
+
loaded: x.complete,
|
| 137 |
+
width: x.width,
|
| 138 |
+
height: x.height,
|
| 139 |
+
naturalWidth: x.naturalWidth,
|
| 140 |
+
naturalHeight: x.naturalHeight,
|
| 141 |
+
alt: x.alt || x.title,
|
| 142 |
+
}));
|
| 143 |
+
}
|
| 144 |
function giveSnapshot() {
|
| 145 |
+
let parsed;
|
| 146 |
try {
|
| 147 |
+
parsed = new Readability(document.cloneNode(true)).parse();
|
| 148 |
+
} catch (err) {
|
| 149 |
+
void 0;
|
|
|
|
|
|
|
| 150 |
}
|
| 151 |
|
| 152 |
+
const r = {
|
| 153 |
title: document.title,
|
| 154 |
href: document.location.href,
|
| 155 |
html: document.documentElement.outerHTML,
|
| 156 |
text: document.body.innerText,
|
| 157 |
+
parsed: parsed,
|
| 158 |
+
imgs: [],
|
| 159 |
};
|
| 160 |
+
if (parsed && parsed.content) {
|
| 161 |
+
const elem = document.createElement('div');
|
| 162 |
+
elem.innerHTML = parsed.content;
|
| 163 |
+
r.imgs = briefImgs(elem);
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
return r;
|
| 167 |
}
|
| 168 |
`));
|
| 169 |
preparations.push(page.evaluateOnNewDocument(() => {
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit bea967a371581c1109dc0101dbcab196e9ed9ade
|