Spaces:
Build error
Build error
fix: performance issue of jsdom
Browse files- backend/functions/package-lock.json +267 -392
- backend/functions/package.json +5 -5
- backend/functions/src/cloud-functions/crawler.ts +16 -490
- backend/functions/src/cloud-functions/data-crunching.ts +4 -2
- backend/functions/src/cloud-functions/searcher.ts +4 -2
- backend/functions/src/services/jsdom.ts +29 -20
- backend/functions/src/services/puppeteer.ts +9 -49
- backend/functions/src/services/snapshot-formatter.ts +539 -0
backend/functions/package-lock.json
CHANGED
|
@@ -14,7 +14,7 @@
|
|
| 14 |
"archiver": "^6.0.1",
|
| 15 |
"axios": "^1.3.3",
|
| 16 |
"bcrypt": "^5.1.0",
|
| 17 |
-
"civkit": "^0.
|
| 18 |
"core-js": "^3.37.1",
|
| 19 |
"cors": "^2.8.5",
|
| 20 |
"dayjs": "^1.11.9",
|
|
@@ -23,13 +23,13 @@
|
|
| 23 |
"firebase-functions": "^4.9.0",
|
| 24 |
"htmlparser2": "^9.0.0",
|
| 25 |
"jose": "^5.1.0",
|
| 26 |
-
"jsdom": "^24.0.0",
|
| 27 |
"langdetect": "^0.2.1",
|
|
|
|
| 28 |
"maxmind": "^4.3.18",
|
| 29 |
"minio": "^7.1.3",
|
| 30 |
"openai": "^4.20.0",
|
| 31 |
"pdfjs-dist": "^4.2.67",
|
| 32 |
-
"puppeteer": "^
|
| 33 |
"puppeteer-extra": "^3.3.6",
|
| 34 |
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
| 35 |
"puppeteer-extra-plugin-page-proxy": "^2.0.0",
|
|
@@ -48,7 +48,7 @@
|
|
| 48 |
"@types/bcrypt": "^5.0.0",
|
| 49 |
"@types/cors": "^2.8.17",
|
| 50 |
"@types/generic-pool": "^3.8.1",
|
| 51 |
-
"@types/node": "^
|
| 52 |
"@types/set-cookie-parser": "^2.4.7",
|
| 53 |
"@typescript-eslint/eslint-plugin": "^5.12.0",
|
| 54 |
"@typescript-eslint/parser": "^5.12.0",
|
|
@@ -57,7 +57,7 @@
|
|
| 57 |
"eslint-plugin-import": "^2.25.4",
|
| 58 |
"firebase-functions-test": "^3.0.0",
|
| 59 |
"replicate": "^0.16.1",
|
| 60 |
-
"typescript": "^5.
|
| 61 |
},
|
| 62 |
"engines": {
|
| 63 |
"node": "20"
|
|
@@ -1564,10 +1564,9 @@
|
|
| 1564 |
}
|
| 1565 |
},
|
| 1566 |
"node_modules/@mongodb-js/saslprep": {
|
| 1567 |
-
"version": "1.1.
|
| 1568 |
-
"resolved": "https://registry.npmjs.org/@mongodb-js/saslprep/-/saslprep-1.1.
|
| 1569 |
-
"integrity": "sha512-
|
| 1570 |
-
"optional": true,
|
| 1571 |
"peer": true,
|
| 1572 |
"dependencies": {
|
| 1573 |
"sparse-bitfield": "^3.0.3"
|
|
@@ -1977,18 +1976,18 @@
|
|
| 1977 |
"integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw=="
|
| 1978 |
},
|
| 1979 |
"node_modules/@puppeteer/browsers": {
|
| 1980 |
-
"version": "2.
|
| 1981 |
-
"resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.
|
| 1982 |
-
"integrity": "sha512-
|
| 1983 |
-
"dependencies": {
|
| 1984 |
-
"debug": "4.3.
|
| 1985 |
-
"extract-zip": "2.0.1",
|
| 1986 |
-
"progress": "2.0.3",
|
| 1987 |
-
"proxy-agent": "6.4.0",
|
| 1988 |
-
"semver": "7.6.
|
| 1989 |
-
"tar-fs": "3.0.
|
| 1990 |
-
"unbzip2-stream": "1.4.3",
|
| 1991 |
-
"yargs": "17.7.2"
|
| 1992 |
},
|
| 1993 |
"bin": {
|
| 1994 |
"browsers": "lib/cjs/main-cli.js"
|
|
@@ -2299,9 +2298,9 @@
|
|
| 2299 |
"integrity": "sha512-nG96G3Wp6acyAgJqGasjODb+acrI7KltPiRxzHPXnP3NgI28bpQDRv53olbqGXbfcgF5aiiHmO3xpwEpS5Ld9g=="
|
| 2300 |
},
|
| 2301 |
"node_modules/@types/node": {
|
| 2302 |
-
"version": "
|
| 2303 |
-
"resolved": "https://registry.npmjs.org/@types/node/-/node-
|
| 2304 |
-
"integrity": "sha512-
|
| 2305 |
"dependencies": {
|
| 2306 |
"undici-types": "~5.26.4"
|
| 2307 |
}
|
|
@@ -2424,12 +2423,11 @@
|
|
| 2424 |
"peer": true
|
| 2425 |
},
|
| 2426 |
"node_modules/@types/whatwg-url": {
|
| 2427 |
-
"version": "
|
| 2428 |
-
"resolved": "https://registry.npmjs.org/@types/whatwg-url/-/whatwg-url-
|
| 2429 |
-
"integrity": "sha512-
|
| 2430 |
"peer": true,
|
| 2431 |
"dependencies": {
|
| 2432 |
-
"@types/node": "*",
|
| 2433 |
"@types/webidl-conversions": "*"
|
| 2434 |
}
|
| 2435 |
},
|
|
@@ -3227,31 +3225,41 @@
|
|
| 3227 |
"optional": true
|
| 3228 |
},
|
| 3229 |
"node_modules/bare-fs": {
|
| 3230 |
-
"version": "2.
|
| 3231 |
-
"resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-2.
|
| 3232 |
-
"integrity": "sha512-
|
| 3233 |
"optional": true,
|
| 3234 |
"dependencies": {
|
| 3235 |
"bare-events": "^2.0.0",
|
| 3236 |
"bare-path": "^2.0.0",
|
| 3237 |
-
"
|
| 3238 |
}
|
| 3239 |
},
|
| 3240 |
"node_modules/bare-os": {
|
| 3241 |
-
"version": "2.
|
| 3242 |
-
"resolved": "https://registry.npmjs.org/bare-os/-/bare-os-2.2.
|
| 3243 |
-
"integrity": "sha512-
|
| 3244 |
"optional": true
|
| 3245 |
},
|
| 3246 |
"node_modules/bare-path": {
|
| 3247 |
-
"version": "2.1.
|
| 3248 |
-
"resolved": "https://registry.npmjs.org/bare-path/-/bare-path-2.1.
|
| 3249 |
-
"integrity": "sha512-
|
| 3250 |
"optional": true,
|
| 3251 |
"dependencies": {
|
| 3252 |
"bare-os": "^2.1.0"
|
| 3253 |
}
|
| 3254 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3255 |
"node_modules/base32.js": {
|
| 3256 |
"version": "0.1.0",
|
| 3257 |
"resolved": "https://registry.npmjs.org/base32.js/-/base32.js-0.1.0.tgz",
|
|
@@ -3374,6 +3382,11 @@
|
|
| 3374 |
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
|
| 3375 |
"integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A=="
|
| 3376 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3377 |
"node_modules/brace-expansion": {
|
| 3378 |
"version": "1.1.11",
|
| 3379 |
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
|
|
@@ -3444,12 +3457,12 @@
|
|
| 3444 |
}
|
| 3445 |
},
|
| 3446 |
"node_modules/bson": {
|
| 3447 |
-
"version": "
|
| 3448 |
-
"resolved": "https://registry.npmjs.org/bson/-/bson-
|
| 3449 |
-
"integrity": "sha512-
|
| 3450 |
"peer": true,
|
| 3451 |
"engines": {
|
| 3452 |
-
"node": ">=
|
| 3453 |
}
|
| 3454 |
},
|
| 3455 |
"node_modules/buffer": {
|
|
@@ -3659,13 +3672,13 @@
|
|
| 3659 |
}
|
| 3660 |
},
|
| 3661 |
"node_modules/chromium-bidi": {
|
| 3662 |
-
"version": "0.
|
| 3663 |
-
"resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-0.5.
|
| 3664 |
-
"integrity": "sha512-
|
| 3665 |
"dependencies": {
|
| 3666 |
"mitt": "3.0.1",
|
| 3667 |
"urlpattern-polyfill": "10.0.0",
|
| 3668 |
-
"zod": "3.
|
| 3669 |
},
|
| 3670 |
"peerDependencies": {
|
| 3671 |
"devtools-protocol": "*"
|
|
@@ -3688,9 +3701,9 @@
|
|
| 3688 |
}
|
| 3689 |
},
|
| 3690 |
"node_modules/civkit": {
|
| 3691 |
-
"version": "0.
|
| 3692 |
-
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.
|
| 3693 |
-
"integrity": "sha512-
|
| 3694 |
"dependencies": {
|
| 3695 |
"lodash": "^4.17.21",
|
| 3696 |
"tslib": "^2.5.0"
|
|
@@ -3719,11 +3732,13 @@
|
|
| 3719 |
"pino": "^8.11.0",
|
| 3720 |
"reflect-metadata": "^0.1.13",
|
| 3721 |
"smtp-server": "^3.11.0",
|
| 3722 |
-
"tld-extract": "^2.1.0"
|
|
|
|
|
|
|
| 3723 |
},
|
| 3724 |
"peerDependencies": {
|
| 3725 |
-
"mongodb": "^
|
| 3726 |
-
"tsyringe": "^4
|
| 3727 |
}
|
| 3728 |
},
|
| 3729 |
"node_modules/cjs-module-lexer": {
|
|
@@ -4049,17 +4064,37 @@
|
|
| 4049 |
"node": ">= 8"
|
| 4050 |
}
|
| 4051 |
},
|
| 4052 |
-
"node_modules/
|
| 4053 |
-
"version": "
|
| 4054 |
-
"resolved": "https://registry.npmjs.org/
|
| 4055 |
-
"integrity": "sha512-
|
| 4056 |
"dependencies": {
|
| 4057 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4058 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4059 |
"engines": {
|
| 4060 |
-
"node": ">=
|
|
|
|
|
|
|
|
|
|
| 4061 |
}
|
| 4062 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4063 |
"node_modules/data-uri-to-buffer": {
|
| 4064 |
"version": "6.0.2",
|
| 4065 |
"resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
|
|
@@ -4068,41 +4103,6 @@
|
|
| 4068 |
"node": ">= 14"
|
| 4069 |
}
|
| 4070 |
},
|
| 4071 |
-
"node_modules/data-urls": {
|
| 4072 |
-
"version": "5.0.0",
|
| 4073 |
-
"resolved": "https://registry.npmjs.org/data-urls/-/data-urls-5.0.0.tgz",
|
| 4074 |
-
"integrity": "sha512-ZYP5VBHshaDAiVZxjbRVcFJpc+4xGgT0bK3vzy1HLN8jTO975HEbuYzZJcHoQEY5K1a0z8YayJkyVETa08eNTg==",
|
| 4075 |
-
"dependencies": {
|
| 4076 |
-
"whatwg-mimetype": "^4.0.0",
|
| 4077 |
-
"whatwg-url": "^14.0.0"
|
| 4078 |
-
},
|
| 4079 |
-
"engines": {
|
| 4080 |
-
"node": ">=18"
|
| 4081 |
-
}
|
| 4082 |
-
},
|
| 4083 |
-
"node_modules/data-urls/node_modules/tr46": {
|
| 4084 |
-
"version": "5.0.0",
|
| 4085 |
-
"resolved": "https://registry.npmjs.org/tr46/-/tr46-5.0.0.tgz",
|
| 4086 |
-
"integrity": "sha512-tk2G5R2KRwBd+ZN0zaEXpmzdKyOYksXwywulIX95MBODjSzMIuQnQ3m8JxgbhnL1LeVo7lqQKsYa1O3Htl7K5g==",
|
| 4087 |
-
"dependencies": {
|
| 4088 |
-
"punycode": "^2.3.1"
|
| 4089 |
-
},
|
| 4090 |
-
"engines": {
|
| 4091 |
-
"node": ">=18"
|
| 4092 |
-
}
|
| 4093 |
-
},
|
| 4094 |
-
"node_modules/data-urls/node_modules/whatwg-url": {
|
| 4095 |
-
"version": "14.0.0",
|
| 4096 |
-
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.0.0.tgz",
|
| 4097 |
-
"integrity": "sha512-1lfMEm2IEr7RIV+f4lUNPOqfFL+pO+Xw3fJSqmjX9AbXcXcYOkCe1P6+9VBZB6n94af16NfZf+sSk0JCBZC9aw==",
|
| 4098 |
-
"dependencies": {
|
| 4099 |
-
"tr46": "^5.0.0",
|
| 4100 |
-
"webidl-conversions": "^7.0.0"
|
| 4101 |
-
},
|
| 4102 |
-
"engines": {
|
| 4103 |
-
"node": ">=18"
|
| 4104 |
-
}
|
| 4105 |
-
},
|
| 4106 |
"node_modules/data-view-buffer": {
|
| 4107 |
"version": "1.0.1",
|
| 4108 |
"resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.1.tgz",
|
|
@@ -4160,11 +4160,11 @@
|
|
| 4160 |
"integrity": "sha512-vjAczensTgRcqDERK0SR2XMwsF/tSvnvlv6VcF2GIhg6Sx4yOIt/irsr1RDJsKiIyBzJDpCoXiWWq28MqH2cnQ=="
|
| 4161 |
},
|
| 4162 |
"node_modules/debug": {
|
| 4163 |
-
"version": "4.3.
|
| 4164 |
-
"resolved": "https://registry.npmjs.org/debug/-/debug-4.3.
|
| 4165 |
-
"integrity": "sha512-
|
| 4166 |
"dependencies": {
|
| 4167 |
-
"ms": "2.1.
|
| 4168 |
},
|
| 4169 |
"engines": {
|
| 4170 |
"node": ">=6.0"
|
|
@@ -4175,11 +4175,6 @@
|
|
| 4175 |
}
|
| 4176 |
}
|
| 4177 |
},
|
| 4178 |
-
"node_modules/decimal.js": {
|
| 4179 |
-
"version": "10.4.3",
|
| 4180 |
-
"resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.4.3.tgz",
|
| 4181 |
-
"integrity": "sha512-VBBaLc1MgL5XpzgIP7ny5Z6Nx3UrRkIViUkPUdtl9aya5amy3De1gsUUSB1g3+3sExYNjCAsAznmukyxCb1GRA=="
|
| 4182 |
-
},
|
| 4183 |
"node_modules/decode-uri-component": {
|
| 4184 |
"version": "0.2.2",
|
| 4185 |
"resolved": "https://registry.npmjs.org/decode-uri-component/-/decode-uri-component-0.2.2.tgz",
|
|
@@ -4359,9 +4354,9 @@
|
|
| 4359 |
}
|
| 4360 |
},
|
| 4361 |
"node_modules/devtools-protocol": {
|
| 4362 |
-
"version": "0.0.
|
| 4363 |
-
"resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.
|
| 4364 |
-
"integrity": "sha512-
|
| 4365 |
},
|
| 4366 |
"node_modules/diff-sequences": {
|
| 4367 |
"version": "29.6.3",
|
|
@@ -5461,14 +5456,6 @@
|
|
| 5461 |
"@google-cloud/storage": "^7.7.0"
|
| 5462 |
}
|
| 5463 |
},
|
| 5464 |
-
"node_modules/firebase-admin/node_modules/@types/node": {
|
| 5465 |
-
"version": "20.12.7",
|
| 5466 |
-
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.7.tgz",
|
| 5467 |
-
"integrity": "sha512-wq0cICSkRLVaf3UGLMGItu/PtdY7oaXaI/RVU+xliKVOtRna3PRY57ZDfztpDL0n11vfymMUnXv8QwYCO7L1wg==",
|
| 5468 |
-
"dependencies": {
|
| 5469 |
-
"undici-types": "~5.26.4"
|
| 5470 |
-
}
|
| 5471 |
-
},
|
| 5472 |
"node_modules/firebase-functions": {
|
| 5473 |
"version": "4.9.0",
|
| 5474 |
"resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-4.9.0.tgz",
|
|
@@ -5793,15 +5780,33 @@
|
|
| 5793 |
}
|
| 5794 |
},
|
| 5795 |
"node_modules/gcp-metadata": {
|
| 5796 |
-
"version": "
|
| 5797 |
-
"resolved": "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-
|
| 5798 |
-
"integrity": "sha512-
|
|
|
|
|
|
|
| 5799 |
"dependencies": {
|
| 5800 |
-
"gaxios": "^
|
| 5801 |
"json-bigint": "^1.0.0"
|
| 5802 |
},
|
| 5803 |
"engines": {
|
| 5804 |
-
"node": ">=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5805 |
}
|
| 5806 |
},
|
| 5807 |
"node_modules/generic-pool": {
|
|
@@ -6023,6 +6028,18 @@
|
|
| 6023 |
"node": ">=14"
|
| 6024 |
}
|
| 6025 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6026 |
"node_modules/google-gax": {
|
| 6027 |
"version": "4.3.2",
|
| 6028 |
"resolved": "https://registry.npmjs.org/google-gax/-/google-gax-4.3.2.tgz",
|
|
@@ -6184,17 +6201,6 @@
|
|
| 6184 |
"node": ">= 0.4"
|
| 6185 |
}
|
| 6186 |
},
|
| 6187 |
-
"node_modules/html-encoding-sniffer": {
|
| 6188 |
-
"version": "4.0.0",
|
| 6189 |
-
"resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-4.0.0.tgz",
|
| 6190 |
-
"integrity": "sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==",
|
| 6191 |
-
"dependencies": {
|
| 6192 |
-
"whatwg-encoding": "^3.1.1"
|
| 6193 |
-
},
|
| 6194 |
-
"engines": {
|
| 6195 |
-
"node": ">=18"
|
| 6196 |
-
}
|
| 6197 |
-
},
|
| 6198 |
"node_modules/html-escaper": {
|
| 6199 |
"version": "2.0.2",
|
| 6200 |
"resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz",
|
|
@@ -6383,6 +6389,7 @@
|
|
| 6383 |
"version": "0.6.3",
|
| 6384 |
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
|
| 6385 |
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
|
|
|
|
| 6386 |
"dependencies": {
|
| 6387 |
"safer-buffer": ">= 2.1.2 < 3.0.0"
|
| 6388 |
},
|
|
@@ -6780,11 +6787,6 @@
|
|
| 6780 |
"node": ">=0.10.0"
|
| 6781 |
}
|
| 6782 |
},
|
| 6783 |
-
"node_modules/is-potential-custom-element-name": {
|
| 6784 |
-
"version": "1.0.1",
|
| 6785 |
-
"resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz",
|
| 6786 |
-
"integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ=="
|
| 6787 |
-
},
|
| 6788 |
"node_modules/is-regex": {
|
| 6789 |
"version": "1.1.4",
|
| 6790 |
"resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz",
|
|
@@ -7653,91 +7655,6 @@
|
|
| 7653 |
"node": ">=0.1.90"
|
| 7654 |
}
|
| 7655 |
},
|
| 7656 |
-
"node_modules/jsdom": {
|
| 7657 |
-
"version": "24.0.0",
|
| 7658 |
-
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-24.0.0.tgz",
|
| 7659 |
-
"integrity": "sha512-UDS2NayCvmXSXVP6mpTj+73JnNQadZlr9N68189xib2tx5Mls7swlTNao26IoHv46BZJFvXygyRtyXd1feAk1A==",
|
| 7660 |
-
"dependencies": {
|
| 7661 |
-
"cssstyle": "^4.0.1",
|
| 7662 |
-
"data-urls": "^5.0.0",
|
| 7663 |
-
"decimal.js": "^10.4.3",
|
| 7664 |
-
"form-data": "^4.0.0",
|
| 7665 |
-
"html-encoding-sniffer": "^4.0.0",
|
| 7666 |
-
"http-proxy-agent": "^7.0.0",
|
| 7667 |
-
"https-proxy-agent": "^7.0.2",
|
| 7668 |
-
"is-potential-custom-element-name": "^1.0.1",
|
| 7669 |
-
"nwsapi": "^2.2.7",
|
| 7670 |
-
"parse5": "^7.1.2",
|
| 7671 |
-
"rrweb-cssom": "^0.6.0",
|
| 7672 |
-
"saxes": "^6.0.0",
|
| 7673 |
-
"symbol-tree": "^3.2.4",
|
| 7674 |
-
"tough-cookie": "^4.1.3",
|
| 7675 |
-
"w3c-xmlserializer": "^5.0.0",
|
| 7676 |
-
"webidl-conversions": "^7.0.0",
|
| 7677 |
-
"whatwg-encoding": "^3.1.1",
|
| 7678 |
-
"whatwg-mimetype": "^4.0.0",
|
| 7679 |
-
"whatwg-url": "^14.0.0",
|
| 7680 |
-
"ws": "^8.16.0",
|
| 7681 |
-
"xml-name-validator": "^5.0.0"
|
| 7682 |
-
},
|
| 7683 |
-
"engines": {
|
| 7684 |
-
"node": ">=18"
|
| 7685 |
-
},
|
| 7686 |
-
"peerDependencies": {
|
| 7687 |
-
"canvas": "^2.11.2"
|
| 7688 |
-
},
|
| 7689 |
-
"peerDependenciesMeta": {
|
| 7690 |
-
"canvas": {
|
| 7691 |
-
"optional": true
|
| 7692 |
-
}
|
| 7693 |
-
}
|
| 7694 |
-
},
|
| 7695 |
-
"node_modules/jsdom/node_modules/agent-base": {
|
| 7696 |
-
"version": "7.1.1",
|
| 7697 |
-
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.1.tgz",
|
| 7698 |
-
"integrity": "sha512-H0TSyFNDMomMNJQBn8wFV5YC/2eJ+VXECwOadZJT554xP6cODZHPX3H9QMQECxvrgiSOP1pHjy1sMWQVYJOUOA==",
|
| 7699 |
-
"dependencies": {
|
| 7700 |
-
"debug": "^4.3.4"
|
| 7701 |
-
},
|
| 7702 |
-
"engines": {
|
| 7703 |
-
"node": ">= 14"
|
| 7704 |
-
}
|
| 7705 |
-
},
|
| 7706 |
-
"node_modules/jsdom/node_modules/https-proxy-agent": {
|
| 7707 |
-
"version": "7.0.4",
|
| 7708 |
-
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.4.tgz",
|
| 7709 |
-
"integrity": "sha512-wlwpilI7YdjSkWaQ/7omYBMTliDcmCN8OLihO6I9B86g06lMyAoqgoDpV0XqoaPOKj+0DIdAvnsWfyAAhmimcg==",
|
| 7710 |
-
"dependencies": {
|
| 7711 |
-
"agent-base": "^7.0.2",
|
| 7712 |
-
"debug": "4"
|
| 7713 |
-
},
|
| 7714 |
-
"engines": {
|
| 7715 |
-
"node": ">= 14"
|
| 7716 |
-
}
|
| 7717 |
-
},
|
| 7718 |
-
"node_modules/jsdom/node_modules/tr46": {
|
| 7719 |
-
"version": "5.0.0",
|
| 7720 |
-
"resolved": "https://registry.npmjs.org/tr46/-/tr46-5.0.0.tgz",
|
| 7721 |
-
"integrity": "sha512-tk2G5R2KRwBd+ZN0zaEXpmzdKyOYksXwywulIX95MBODjSzMIuQnQ3m8JxgbhnL1LeVo7lqQKsYa1O3Htl7K5g==",
|
| 7722 |
-
"dependencies": {
|
| 7723 |
-
"punycode": "^2.3.1"
|
| 7724 |
-
},
|
| 7725 |
-
"engines": {
|
| 7726 |
-
"node": ">=18"
|
| 7727 |
-
}
|
| 7728 |
-
},
|
| 7729 |
-
"node_modules/jsdom/node_modules/whatwg-url": {
|
| 7730 |
-
"version": "14.0.0",
|
| 7731 |
-
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.0.0.tgz",
|
| 7732 |
-
"integrity": "sha512-1lfMEm2IEr7RIV+f4lUNPOqfFL+pO+Xw3fJSqmjX9AbXcXcYOkCe1P6+9VBZB6n94af16NfZf+sSk0JCBZC9aw==",
|
| 7733 |
-
"dependencies": {
|
| 7734 |
-
"tr46": "^5.0.0",
|
| 7735 |
-
"webidl-conversions": "^7.0.0"
|
| 7736 |
-
},
|
| 7737 |
-
"engines": {
|
| 7738 |
-
"node": ">=18"
|
| 7739 |
-
}
|
| 7740 |
-
},
|
| 7741 |
"node_modules/jsesc": {
|
| 7742 |
"version": "2.5.2",
|
| 7743 |
"resolved": "https://registry.npmjs.org/jsesc/-/jsesc-2.5.2.tgz",
|
|
@@ -8156,6 +8073,23 @@
|
|
| 8156 |
"resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz",
|
| 8157 |
"integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg=="
|
| 8158 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8159 |
"node_modules/locate-path": {
|
| 8160 |
"version": "6.0.0",
|
| 8161 |
"resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz",
|
|
@@ -8335,7 +8269,6 @@
|
|
| 8335 |
"version": "1.5.0",
|
| 8336 |
"resolved": "https://registry.npmjs.org/memory-pager/-/memory-pager-1.5.0.tgz",
|
| 8337 |
"integrity": "sha512-ZS4Bp4r/Zoeq6+NLJpP+0Zzm0pR8whtGPf1XExKLJBAczGMnSi3It14OiNCStjQjM6NU1okjQGSxgEZN8eBYKg==",
|
| 8338 |
-
"optional": true,
|
| 8339 |
"peer": true
|
| 8340 |
},
|
| 8341 |
"node_modules/merge-deep": {
|
|
@@ -8564,27 +8497,26 @@
|
|
| 8564 |
}
|
| 8565 |
},
|
| 8566 |
"node_modules/mongodb": {
|
| 8567 |
-
"version": "
|
| 8568 |
-
"resolved": "https://registry.npmjs.org/mongodb/-/mongodb-
|
| 8569 |
-
"integrity": "sha512-
|
| 8570 |
"peer": true,
|
| 8571 |
"dependencies": {
|
| 8572 |
-
"
|
| 8573 |
-
"
|
| 8574 |
-
"
|
| 8575 |
},
|
| 8576 |
"engines": {
|
| 8577 |
-
"node": ">=
|
| 8578 |
-
},
|
| 8579 |
-
"optionalDependencies": {
|
| 8580 |
-
"@mongodb-js/saslprep": "^1.1.0"
|
| 8581 |
},
|
| 8582 |
"peerDependencies": {
|
| 8583 |
"@aws-sdk/credential-providers": "^3.188.0",
|
| 8584 |
-
"@mongodb-js/zstd": "^1.
|
| 8585 |
-
"
|
| 8586 |
-
"
|
| 8587 |
-
"
|
|
|
|
|
|
|
| 8588 |
},
|
| 8589 |
"peerDependenciesMeta": {
|
| 8590 |
"@aws-sdk/credential-providers": {
|
|
@@ -8593,6 +8525,9 @@
|
|
| 8593 |
"@mongodb-js/zstd": {
|
| 8594 |
"optional": true
|
| 8595 |
},
|
|
|
|
|
|
|
|
|
|
| 8596 |
"kerberos": {
|
| 8597 |
"optional": true
|
| 8598 |
},
|
|
@@ -8601,23 +8536,26 @@
|
|
| 8601 |
},
|
| 8602 |
"snappy": {
|
| 8603 |
"optional": true
|
|
|
|
|
|
|
|
|
|
| 8604 |
}
|
| 8605 |
}
|
| 8606 |
},
|
| 8607 |
"node_modules/mongodb-connection-string-url": {
|
| 8608 |
-
"version": "
|
| 8609 |
-
"resolved": "https://registry.npmjs.org/mongodb-connection-string-url/-/mongodb-connection-string-url-
|
| 8610 |
-
"integrity": "sha512-
|
| 8611 |
"peer": true,
|
| 8612 |
"dependencies": {
|
| 8613 |
-
"@types/whatwg-url": "^
|
| 8614 |
-
"whatwg-url": "^
|
| 8615 |
}
|
| 8616 |
},
|
| 8617 |
"node_modules/ms": {
|
| 8618 |
-
"version": "2.1.
|
| 8619 |
-
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.
|
| 8620 |
-
"integrity": "sha512-
|
| 8621 |
},
|
| 8622 |
"node_modules/napi-build-utils": {
|
| 8623 |
"version": "1.0.2",
|
|
@@ -8836,10 +8774,16 @@
|
|
| 8836 |
"set-blocking": "^2.0.0"
|
| 8837 |
}
|
| 8838 |
},
|
| 8839 |
-
"node_modules/
|
| 8840 |
-
"version": "2.
|
| 8841 |
-
"resolved": "https://registry.npmjs.org/
|
| 8842 |
-
"integrity": "sha512-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8843 |
},
|
| 8844 |
"node_modules/object-assign": {
|
| 8845 |
"version": "4.1.1",
|
|
@@ -9009,6 +8953,14 @@
|
|
| 9009 |
"openai": "bin/cli"
|
| 9010 |
}
|
| 9011 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9012 |
"node_modules/optionator": {
|
| 9013 |
"version": "0.9.3",
|
| 9014 |
"resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.3.tgz",
|
|
@@ -9155,17 +9107,6 @@
|
|
| 9155 |
"url": "https://github.com/sponsors/sindresorhus"
|
| 9156 |
}
|
| 9157 |
},
|
| 9158 |
-
"node_modules/parse5": {
|
| 9159 |
-
"version": "7.1.2",
|
| 9160 |
-
"resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz",
|
| 9161 |
-
"integrity": "sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==",
|
| 9162 |
-
"dependencies": {
|
| 9163 |
-
"entities": "^4.4.0"
|
| 9164 |
-
},
|
| 9165 |
-
"funding": {
|
| 9166 |
-
"url": "https://github.com/inikulin/parse5?sponsor=1"
|
| 9167 |
-
}
|
| 9168 |
-
},
|
| 9169 |
"node_modules/parseurl": {
|
| 9170 |
"version": "1.3.3",
|
| 9171 |
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
|
|
@@ -9691,33 +9632,36 @@
|
|
| 9691 |
}
|
| 9692 |
},
|
| 9693 |
"node_modules/puppeteer": {
|
| 9694 |
-
"version": "
|
| 9695 |
-
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-
|
| 9696 |
-
"integrity": "sha512-
|
| 9697 |
"hasInstallScript": true,
|
| 9698 |
"dependencies": {
|
| 9699 |
-
"@puppeteer/browsers": "2.
|
| 9700 |
-
"
|
| 9701 |
-
"
|
| 9702 |
-
"
|
|
|
|
|
|
|
| 9703 |
},
|
| 9704 |
"bin": {
|
| 9705 |
-
"puppeteer": "lib/
|
| 9706 |
},
|
| 9707 |
"engines": {
|
| 9708 |
"node": ">=18"
|
| 9709 |
}
|
| 9710 |
},
|
| 9711 |
"node_modules/puppeteer-core": {
|
| 9712 |
-
"version": "
|
| 9713 |
-
"resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-
|
| 9714 |
-
"integrity": "sha512-
|
| 9715 |
"dependencies": {
|
| 9716 |
-
"@puppeteer/browsers": "2.
|
| 9717 |
-
"chromium-bidi": "0.
|
| 9718 |
-
"debug": "4.3.
|
| 9719 |
-
"devtools-protocol": "0.0.
|
| 9720 |
-
"
|
|
|
|
| 9721 |
},
|
| 9722 |
"engines": {
|
| 9723 |
"node": ">=18"
|
|
@@ -10378,11 +10322,6 @@
|
|
| 10378 |
"url": "https://github.com/sponsors/isaacs"
|
| 10379 |
}
|
| 10380 |
},
|
| 10381 |
-
"node_modules/rrweb-cssom": {
|
| 10382 |
-
"version": "0.6.0",
|
| 10383 |
-
"resolved": "https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.6.0.tgz",
|
| 10384 |
-
"integrity": "sha512-APM0Gt1KoXBz0iIkkdB/kfvGOwC4UuJFeG/c+yV7wSc7q96cG/kJ0HiYCnzivD9SB53cLV1MlHFNfOuPaadYSw=="
|
| 10385 |
-
},
|
| 10386 |
"node_modules/run-parallel": {
|
| 10387 |
"version": "1.2.0",
|
| 10388 |
"resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz",
|
|
@@ -10479,24 +10418,10 @@
|
|
| 10479 |
"resolved": "https://registry.npmjs.org/sax/-/sax-1.3.0.tgz",
|
| 10480 |
"integrity": "sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA=="
|
| 10481 |
},
|
| 10482 |
-
"node_modules/saxes": {
|
| 10483 |
-
"version": "6.0.0",
|
| 10484 |
-
"resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz",
|
| 10485 |
-
"integrity": "sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==",
|
| 10486 |
-
"dependencies": {
|
| 10487 |
-
"xmlchars": "^2.2.0"
|
| 10488 |
-
},
|
| 10489 |
-
"engines": {
|
| 10490 |
-
"node": ">=v12.22.7"
|
| 10491 |
-
}
|
| 10492 |
-
},
|
| 10493 |
"node_modules/semver": {
|
| 10494 |
-
"version": "7.6.
|
| 10495 |
-
"resolved": "https://registry.npmjs.org/semver/-/semver-7.6.
|
| 10496 |
-
"integrity": "sha512-
|
| 10497 |
-
"dependencies": {
|
| 10498 |
-
"lru-cache": "^6.0.0"
|
| 10499 |
-
},
|
| 10500 |
"bin": {
|
| 10501 |
"semver": "bin/semver.js"
|
| 10502 |
},
|
|
@@ -10504,22 +10429,6 @@
|
|
| 10504 |
"node": ">=10"
|
| 10505 |
}
|
| 10506 |
},
|
| 10507 |
-
"node_modules/semver/node_modules/lru-cache": {
|
| 10508 |
-
"version": "6.0.0",
|
| 10509 |
-
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz",
|
| 10510 |
-
"integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==",
|
| 10511 |
-
"dependencies": {
|
| 10512 |
-
"yallist": "^4.0.0"
|
| 10513 |
-
},
|
| 10514 |
-
"engines": {
|
| 10515 |
-
"node": ">=10"
|
| 10516 |
-
}
|
| 10517 |
-
},
|
| 10518 |
-
"node_modules/semver/node_modules/yallist": {
|
| 10519 |
-
"version": "4.0.0",
|
| 10520 |
-
"resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz",
|
| 10521 |
-
"integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A=="
|
| 10522 |
-
},
|
| 10523 |
"node_modules/send": {
|
| 10524 |
"version": "0.18.0",
|
| 10525 |
"resolved": "https://registry.npmjs.org/send/-/send-0.18.0.tgz",
|
|
@@ -10567,11 +10476,6 @@
|
|
| 10567 |
"node": ">=4"
|
| 10568 |
}
|
| 10569 |
},
|
| 10570 |
-
"node_modules/send/node_modules/ms": {
|
| 10571 |
-
"version": "2.1.3",
|
| 10572 |
-
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
|
| 10573 |
-
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="
|
| 10574 |
-
},
|
| 10575 |
"node_modules/serve-static": {
|
| 10576 |
"version": "1.15.0",
|
| 10577 |
"resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.15.0.tgz",
|
|
@@ -10876,7 +10780,6 @@
|
|
| 10876 |
"version": "3.0.3",
|
| 10877 |
"resolved": "https://registry.npmjs.org/sparse-bitfield/-/sparse-bitfield-3.0.3.tgz",
|
| 10878 |
"integrity": "sha512-kvzhi7vqKTfkh0PZU+2D2PIllw2ymqJKujUcyPMd9Y75Nv4nPbGJZXNhxsgdQab2BmlDct1YnfQCguEvHr7VsQ==",
|
| 10879 |
-
"optional": true,
|
| 10880 |
"peer": true,
|
| 10881 |
"dependencies": {
|
| 10882 |
"memory-pager": "^1.0.2"
|
|
@@ -10958,12 +10861,13 @@
|
|
| 10958 |
}
|
| 10959 |
},
|
| 10960 |
"node_modules/streamx": {
|
| 10961 |
-
"version": "2.
|
| 10962 |
-
"resolved": "https://registry.npmjs.org/streamx/-/streamx-2.
|
| 10963 |
-
"integrity": "sha512-
|
| 10964 |
"dependencies": {
|
| 10965 |
-
"fast-fifo": "^1.
|
| 10966 |
-
"queue-tick": "^1.0.1"
|
|
|
|
| 10967 |
},
|
| 10968 |
"optionalDependencies": {
|
| 10969 |
"bare-events": "^2.2.0"
|
|
@@ -11150,11 +11054,6 @@
|
|
| 11150 |
"url": "https://github.com/sponsors/ljharb"
|
| 11151 |
}
|
| 11152 |
},
|
| 11153 |
-
"node_modules/symbol-tree": {
|
| 11154 |
-
"version": "3.2.4",
|
| 11155 |
-
"resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz",
|
| 11156 |
-
"integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw=="
|
| 11157 |
-
},
|
| 11158 |
"node_modules/tar": {
|
| 11159 |
"version": "6.2.1",
|
| 11160 |
"resolved": "https://registry.npmjs.org/tar/-/tar-6.2.1.tgz",
|
|
@@ -11172,9 +11071,9 @@
|
|
| 11172 |
}
|
| 11173 |
},
|
| 11174 |
"node_modules/tar-fs": {
|
| 11175 |
-
"version": "3.0.
|
| 11176 |
-
"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.0.
|
| 11177 |
-
"integrity": "sha512-
|
| 11178 |
"dependencies": {
|
| 11179 |
"pump": "^3.0.0",
|
| 11180 |
"tar-stream": "^3.1.5"
|
|
@@ -11263,6 +11162,14 @@
|
|
| 11263 |
"url": "https://github.com/sponsors/isaacs"
|
| 11264 |
}
|
| 11265 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11266 |
"node_modules/text-table": {
|
| 11267 |
"version": "0.2.0",
|
| 11268 |
"resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz",
|
|
@@ -11369,15 +11276,15 @@
|
|
| 11369 |
}
|
| 11370 |
},
|
| 11371 |
"node_modules/tr46": {
|
| 11372 |
-
"version": "
|
| 11373 |
-
"resolved": "https://registry.npmjs.org/tr46/-/tr46-
|
| 11374 |
-
"integrity": "sha512-
|
| 11375 |
"peer": true,
|
| 11376 |
"dependencies": {
|
| 11377 |
-
"punycode": "^2.
|
| 11378 |
},
|
| 11379 |
"engines": {
|
| 11380 |
-
"node": ">=
|
| 11381 |
}
|
| 11382 |
},
|
| 11383 |
"node_modules/ts-deepmerge": {
|
|
@@ -11613,10 +11520,15 @@
|
|
| 11613 |
"url": "https://github.com/sponsors/ljharb"
|
| 11614 |
}
|
| 11615 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11616 |
"node_modules/typescript": {
|
| 11617 |
-
"version": "5.
|
| 11618 |
-
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.
|
| 11619 |
-
"integrity": "sha512-
|
| 11620 |
"devOptional": true,
|
| 11621 |
"bin": {
|
| 11622 |
"tsc": "bin/tsc",
|
|
@@ -11626,6 +11538,11 @@
|
|
| 11626 |
"node": ">=14.17"
|
| 11627 |
}
|
| 11628 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11629 |
"node_modules/unbox-primitive": {
|
| 11630 |
"version": "1.0.2",
|
| 11631 |
"resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.0.2.tgz",
|
|
@@ -11802,17 +11719,6 @@
|
|
| 11802 |
"node": ">= 0.8"
|
| 11803 |
}
|
| 11804 |
},
|
| 11805 |
-
"node_modules/w3c-xmlserializer": {
|
| 11806 |
-
"version": "5.0.0",
|
| 11807 |
-
"resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-5.0.0.tgz",
|
| 11808 |
-
"integrity": "sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA==",
|
| 11809 |
-
"dependencies": {
|
| 11810 |
-
"xml-name-validator": "^5.0.0"
|
| 11811 |
-
},
|
| 11812 |
-
"engines": {
|
| 11813 |
-
"node": ">=18"
|
| 11814 |
-
}
|
| 11815 |
-
},
|
| 11816 |
"node_modules/walker": {
|
| 11817 |
"version": "1.0.8",
|
| 11818 |
"resolved": "https://registry.npmjs.org/walker/-/walker-1.0.8.tgz",
|
|
@@ -11846,6 +11752,7 @@
|
|
| 11846 |
"version": "7.0.0",
|
| 11847 |
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz",
|
| 11848 |
"integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==",
|
|
|
|
| 11849 |
"engines": {
|
| 11850 |
"node": ">=12"
|
| 11851 |
}
|
|
@@ -11871,36 +11778,17 @@
|
|
| 11871 |
"node": ">=0.8.0"
|
| 11872 |
}
|
| 11873 |
},
|
| 11874 |
-
"node_modules/whatwg-encoding": {
|
| 11875 |
-
"version": "3.1.1",
|
| 11876 |
-
"resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz",
|
| 11877 |
-
"integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==",
|
| 11878 |
-
"dependencies": {
|
| 11879 |
-
"iconv-lite": "0.6.3"
|
| 11880 |
-
},
|
| 11881 |
-
"engines": {
|
| 11882 |
-
"node": ">=18"
|
| 11883 |
-
}
|
| 11884 |
-
},
|
| 11885 |
-
"node_modules/whatwg-mimetype": {
|
| 11886 |
-
"version": "4.0.0",
|
| 11887 |
-
"resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz",
|
| 11888 |
-
"integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==",
|
| 11889 |
-
"engines": {
|
| 11890 |
-
"node": ">=18"
|
| 11891 |
-
}
|
| 11892 |
-
},
|
| 11893 |
"node_modules/whatwg-url": {
|
| 11894 |
-
"version": "
|
| 11895 |
-
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-
|
| 11896 |
-
"integrity": "sha512-
|
| 11897 |
"peer": true,
|
| 11898 |
"dependencies": {
|
| 11899 |
-
"tr46": "^
|
| 11900 |
"webidl-conversions": "^7.0.0"
|
| 11901 |
},
|
| 11902 |
"engines": {
|
| 11903 |
-
"node": ">=
|
| 11904 |
}
|
| 11905 |
},
|
| 11906 |
"node_modules/which": {
|
|
@@ -11996,9 +11884,9 @@
|
|
| 11996 |
}
|
| 11997 |
},
|
| 11998 |
"node_modules/ws": {
|
| 11999 |
-
"version": "8.
|
| 12000 |
-
"resolved": "https://registry.npmjs.org/ws/-/ws-8.
|
| 12001 |
-
"integrity": "sha512-
|
| 12002 |
"engines": {
|
| 12003 |
"node": ">=10.0.0"
|
| 12004 |
},
|
|
@@ -12020,14 +11908,6 @@
|
|
| 12020 |
"resolved": "https://registry.npmjs.org/xml/-/xml-1.0.1.tgz",
|
| 12021 |
"integrity": "sha512-huCv9IH9Tcf95zuYCsQraZtWnJvBtLVE0QHMOs8bWyZAFZNDcYjsPq1nEx8jKA9y+Beo9v+7OBPRisQTjinQMw=="
|
| 12022 |
},
|
| 12023 |
-
"node_modules/xml-name-validator": {
|
| 12024 |
-
"version": "5.0.0",
|
| 12025 |
-
"resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-5.0.0.tgz",
|
| 12026 |
-
"integrity": "sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==",
|
| 12027 |
-
"engines": {
|
| 12028 |
-
"node": ">=18"
|
| 12029 |
-
}
|
| 12030 |
-
},
|
| 12031 |
"node_modules/xml2js": {
|
| 12032 |
"version": "0.5.0",
|
| 12033 |
"resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz",
|
|
@@ -12048,11 +11928,6 @@
|
|
| 12048 |
"node": ">=4.0"
|
| 12049 |
}
|
| 12050 |
},
|
| 12051 |
-
"node_modules/xmlchars": {
|
| 12052 |
-
"version": "2.2.0",
|
| 12053 |
-
"resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz",
|
| 12054 |
-
"integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw=="
|
| 12055 |
-
},
|
| 12056 |
"node_modules/y18n": {
|
| 12057 |
"version": "5.0.8",
|
| 12058 |
"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
|
|
@@ -12137,9 +12012,9 @@
|
|
| 12137 |
}
|
| 12138 |
},
|
| 12139 |
"node_modules/zod": {
|
| 12140 |
-
"version": "3.
|
| 12141 |
-
"resolved": "https://registry.npmjs.org/zod/-/zod-3.
|
| 12142 |
-
"integrity": "sha512-
|
| 12143 |
"funding": {
|
| 12144 |
"url": "https://github.com/sponsors/colinhacks"
|
| 12145 |
}
|
|
|
|
| 14 |
"archiver": "^6.0.1",
|
| 15 |
"axios": "^1.3.3",
|
| 16 |
"bcrypt": "^5.1.0",
|
| 17 |
+
"civkit": "^0.7.0-0f8889a",
|
| 18 |
"core-js": "^3.37.1",
|
| 19 |
"cors": "^2.8.5",
|
| 20 |
"dayjs": "^1.11.9",
|
|
|
|
| 23 |
"firebase-functions": "^4.9.0",
|
| 24 |
"htmlparser2": "^9.0.0",
|
| 25 |
"jose": "^5.1.0",
|
|
|
|
| 26 |
"langdetect": "^0.2.1",
|
| 27 |
+
"linkedom": "^0.18.4",
|
| 28 |
"maxmind": "^4.3.18",
|
| 29 |
"minio": "^7.1.3",
|
| 30 |
"openai": "^4.20.0",
|
| 31 |
"pdfjs-dist": "^4.2.67",
|
| 32 |
+
"puppeteer": "^23.3.0",
|
| 33 |
"puppeteer-extra": "^3.3.6",
|
| 34 |
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
| 35 |
"puppeteer-extra-plugin-page-proxy": "^2.0.0",
|
|
|
|
| 48 |
"@types/bcrypt": "^5.0.0",
|
| 49 |
"@types/cors": "^2.8.17",
|
| 50 |
"@types/generic-pool": "^3.8.1",
|
| 51 |
+
"@types/node": "^20.14.13",
|
| 52 |
"@types/set-cookie-parser": "^2.4.7",
|
| 53 |
"@typescript-eslint/eslint-plugin": "^5.12.0",
|
| 54 |
"@typescript-eslint/parser": "^5.12.0",
|
|
|
|
| 57 |
"eslint-plugin-import": "^2.25.4",
|
| 58 |
"firebase-functions-test": "^3.0.0",
|
| 59 |
"replicate": "^0.16.1",
|
| 60 |
+
"typescript": "^5.5.4"
|
| 61 |
},
|
| 62 |
"engines": {
|
| 63 |
"node": "20"
|
|
|
|
| 1564 |
}
|
| 1565 |
},
|
| 1566 |
"node_modules/@mongodb-js/saslprep": {
|
| 1567 |
+
"version": "1.1.9",
|
| 1568 |
+
"resolved": "https://registry.npmjs.org/@mongodb-js/saslprep/-/saslprep-1.1.9.tgz",
|
| 1569 |
+
"integrity": "sha512-tVkljjeEaAhCqTzajSdgbQ6gE6f3oneVwa3iXR6csiEwXXOFsiC6Uh9iAjAhXPtqa/XMDHWjjeNH/77m/Yq2dw==",
|
|
|
|
| 1570 |
"peer": true,
|
| 1571 |
"dependencies": {
|
| 1572 |
"sparse-bitfield": "^3.0.3"
|
|
|
|
| 1976 |
"integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw=="
|
| 1977 |
},
|
| 1978 |
"node_modules/@puppeteer/browsers": {
|
| 1979 |
+
"version": "2.4.0",
|
| 1980 |
+
"resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.4.0.tgz",
|
| 1981 |
+
"integrity": "sha512-x8J1csfIygOwf6D6qUAZ0ASk3z63zPb7wkNeHRerCMh82qWKUrOgkuP005AJC8lDL6/evtXETGEJVcwykKT4/g==",
|
| 1982 |
+
"dependencies": {
|
| 1983 |
+
"debug": "^4.3.6",
|
| 1984 |
+
"extract-zip": "^2.0.1",
|
| 1985 |
+
"progress": "^2.0.3",
|
| 1986 |
+
"proxy-agent": "^6.4.0",
|
| 1987 |
+
"semver": "^7.6.3",
|
| 1988 |
+
"tar-fs": "^3.0.6",
|
| 1989 |
+
"unbzip2-stream": "^1.4.3",
|
| 1990 |
+
"yargs": "^17.7.2"
|
| 1991 |
},
|
| 1992 |
"bin": {
|
| 1993 |
"browsers": "lib/cjs/main-cli.js"
|
|
|
|
| 2298 |
"integrity": "sha512-nG96G3Wp6acyAgJqGasjODb+acrI7KltPiRxzHPXnP3NgI28bpQDRv53olbqGXbfcgF5aiiHmO3xpwEpS5Ld9g=="
|
| 2299 |
},
|
| 2300 |
"node_modules/@types/node": {
|
| 2301 |
+
"version": "20.14.13",
|
| 2302 |
+
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.13.tgz",
|
| 2303 |
+
"integrity": "sha512-+bHoGiZb8UiQ0+WEtmph2IWQCjIqg8MDZMAV+ppRRhUZnquF5mQkP/9vpSwJClEiSM/C7fZZExPzfU0vJTyp8w==",
|
| 2304 |
"dependencies": {
|
| 2305 |
"undici-types": "~5.26.4"
|
| 2306 |
}
|
|
|
|
| 2423 |
"peer": true
|
| 2424 |
},
|
| 2425 |
"node_modules/@types/whatwg-url": {
|
| 2426 |
+
"version": "11.0.5",
|
| 2427 |
+
"resolved": "https://registry.npmjs.org/@types/whatwg-url/-/whatwg-url-11.0.5.tgz",
|
| 2428 |
+
"integrity": "sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==",
|
| 2429 |
"peer": true,
|
| 2430 |
"dependencies": {
|
|
|
|
| 2431 |
"@types/webidl-conversions": "*"
|
| 2432 |
}
|
| 2433 |
},
|
|
|
|
| 3225 |
"optional": true
|
| 3226 |
},
|
| 3227 |
"node_modules/bare-fs": {
|
| 3228 |
+
"version": "2.3.3",
|
| 3229 |
+
"resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-2.3.3.tgz",
|
| 3230 |
+
"integrity": "sha512-7RYKL+vZVCyAsMLi5SPu7QGauGGT8avnP/HO571ndEuV4MYdGXvLhtW67FuLPeEI8EiIY7zbbRR9x7x7HU0kgw==",
|
| 3231 |
"optional": true,
|
| 3232 |
"dependencies": {
|
| 3233 |
"bare-events": "^2.0.0",
|
| 3234 |
"bare-path": "^2.0.0",
|
| 3235 |
+
"bare-stream": "^2.0.0"
|
| 3236 |
}
|
| 3237 |
},
|
| 3238 |
"node_modules/bare-os": {
|
| 3239 |
+
"version": "2.4.2",
|
| 3240 |
+
"resolved": "https://registry.npmjs.org/bare-os/-/bare-os-2.4.2.tgz",
|
| 3241 |
+
"integrity": "sha512-HZoJwzC+rZ9lqEemTMiO0luOePoGYNBgsLLgegKR/cljiJvcDNhDZQkzC+NC5Oh0aHbdBNSOHpghwMuB5tqhjg==",
|
| 3242 |
"optional": true
|
| 3243 |
},
|
| 3244 |
"node_modules/bare-path": {
|
| 3245 |
+
"version": "2.1.3",
|
| 3246 |
+
"resolved": "https://registry.npmjs.org/bare-path/-/bare-path-2.1.3.tgz",
|
| 3247 |
+
"integrity": "sha512-lh/eITfU8hrj9Ru5quUp0Io1kJWIk1bTjzo7JH1P5dWmQ2EL4hFUlfI8FonAhSlgIfhn63p84CDY/x+PisgcXA==",
|
| 3248 |
"optional": true,
|
| 3249 |
"dependencies": {
|
| 3250 |
"bare-os": "^2.1.0"
|
| 3251 |
}
|
| 3252 |
},
|
| 3253 |
+
"node_modules/bare-stream": {
|
| 3254 |
+
"version": "2.2.1",
|
| 3255 |
+
"resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.2.1.tgz",
|
| 3256 |
+
"integrity": "sha512-YTB47kHwBW9zSG8LD77MIBAAQXjU2WjAkMHeeb7hUplVs6+IoM5I7uEVQNPMB7lj9r8I76UMdoMkGnCodHOLqg==",
|
| 3257 |
+
"optional": true,
|
| 3258 |
+
"dependencies": {
|
| 3259 |
+
"b4a": "^1.6.6",
|
| 3260 |
+
"streamx": "^2.18.0"
|
| 3261 |
+
}
|
| 3262 |
+
},
|
| 3263 |
"node_modules/base32.js": {
|
| 3264 |
"version": "0.1.0",
|
| 3265 |
"resolved": "https://registry.npmjs.org/base32.js/-/base32.js-0.1.0.tgz",
|
|
|
|
| 3382 |
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
|
| 3383 |
"integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A=="
|
| 3384 |
},
|
| 3385 |
+
"node_modules/boolbase": {
|
| 3386 |
+
"version": "1.0.0",
|
| 3387 |
+
"resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
|
| 3388 |
+
"integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="
|
| 3389 |
+
},
|
| 3390 |
"node_modules/brace-expansion": {
|
| 3391 |
"version": "1.1.11",
|
| 3392 |
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
|
|
|
|
| 3457 |
}
|
| 3458 |
},
|
| 3459 |
"node_modules/bson": {
|
| 3460 |
+
"version": "6.8.0",
|
| 3461 |
+
"resolved": "https://registry.npmjs.org/bson/-/bson-6.8.0.tgz",
|
| 3462 |
+
"integrity": "sha512-iOJg8pr7wq2tg/zSlCCHMi3hMm5JTOxLTagf3zxhcenHsFp+c6uOs6K7W5UE7A4QIJGtqh/ZovFNMP4mOPJynQ==",
|
| 3463 |
"peer": true,
|
| 3464 |
"engines": {
|
| 3465 |
+
"node": ">=16.20.1"
|
| 3466 |
}
|
| 3467 |
},
|
| 3468 |
"node_modules/buffer": {
|
|
|
|
| 3672 |
}
|
| 3673 |
},
|
| 3674 |
"node_modules/chromium-bidi": {
|
| 3675 |
+
"version": "0.6.5",
|
| 3676 |
+
"resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-0.6.5.tgz",
|
| 3677 |
+
"integrity": "sha512-RuLrmzYrxSb0s9SgpB+QN5jJucPduZQ/9SIe76MDxYJuecPW5mxMdacJ1f4EtgiV+R0p3sCkznTMvH0MPGFqjA==",
|
| 3678 |
"dependencies": {
|
| 3679 |
"mitt": "3.0.1",
|
| 3680 |
"urlpattern-polyfill": "10.0.0",
|
| 3681 |
+
"zod": "3.23.8"
|
| 3682 |
},
|
| 3683 |
"peerDependencies": {
|
| 3684 |
"devtools-protocol": "*"
|
|
|
|
| 3701 |
}
|
| 3702 |
},
|
| 3703 |
"node_modules/civkit": {
|
| 3704 |
+
"version": "0.7.0-0f8889a",
|
| 3705 |
+
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.7.0-0f8889a.tgz",
|
| 3706 |
+
"integrity": "sha512-T14Jk3loghFqluWUUQmuEn0hO2pwWw+tsfdG4++NPqvS2W/lclZoA1EyBIZ8Uk0MYqEp02O6BwwbAoq+g++hMw==",
|
| 3707 |
"dependencies": {
|
| 3708 |
"lodash": "^4.17.21",
|
| 3709 |
"tslib": "^2.5.0"
|
|
|
|
| 3732 |
"pino": "^8.11.0",
|
| 3733 |
"reflect-metadata": "^0.1.13",
|
| 3734 |
"smtp-server": "^3.11.0",
|
| 3735 |
+
"tld-extract": "^2.1.0",
|
| 3736 |
+
"zod": "*",
|
| 3737 |
+
"zod-openai": "*"
|
| 3738 |
},
|
| 3739 |
"peerDependencies": {
|
| 3740 |
+
"mongodb": "^6",
|
| 3741 |
+
"tsyringe": "^4"
|
| 3742 |
}
|
| 3743 |
},
|
| 3744 |
"node_modules/cjs-module-lexer": {
|
|
|
|
| 4064 |
"node": ">= 8"
|
| 4065 |
}
|
| 4066 |
},
|
| 4067 |
+
"node_modules/css-select": {
|
| 4068 |
+
"version": "5.1.0",
|
| 4069 |
+
"resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz",
|
| 4070 |
+
"integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==",
|
| 4071 |
"dependencies": {
|
| 4072 |
+
"boolbase": "^1.0.0",
|
| 4073 |
+
"css-what": "^6.1.0",
|
| 4074 |
+
"domhandler": "^5.0.2",
|
| 4075 |
+
"domutils": "^3.0.1",
|
| 4076 |
+
"nth-check": "^2.0.1"
|
| 4077 |
},
|
| 4078 |
+
"funding": {
|
| 4079 |
+
"url": "https://github.com/sponsors/fb55"
|
| 4080 |
+
}
|
| 4081 |
+
},
|
| 4082 |
+
"node_modules/css-what": {
|
| 4083 |
+
"version": "6.1.0",
|
| 4084 |
+
"resolved": "https://registry.npmjs.org/css-what/-/css-what-6.1.0.tgz",
|
| 4085 |
+
"integrity": "sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==",
|
| 4086 |
"engines": {
|
| 4087 |
+
"node": ">= 6"
|
| 4088 |
+
},
|
| 4089 |
+
"funding": {
|
| 4090 |
+
"url": "https://github.com/sponsors/fb55"
|
| 4091 |
}
|
| 4092 |
},
|
| 4093 |
+
"node_modules/cssom": {
|
| 4094 |
+
"version": "0.5.0",
|
| 4095 |
+
"resolved": "https://registry.npmjs.org/cssom/-/cssom-0.5.0.tgz",
|
| 4096 |
+
"integrity": "sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw=="
|
| 4097 |
+
},
|
| 4098 |
"node_modules/data-uri-to-buffer": {
|
| 4099 |
"version": "6.0.2",
|
| 4100 |
"resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
|
|
|
|
| 4103 |
"node": ">= 14"
|
| 4104 |
}
|
| 4105 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4106 |
"node_modules/data-view-buffer": {
|
| 4107 |
"version": "1.0.1",
|
| 4108 |
"resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.1.tgz",
|
|
|
|
| 4160 |
"integrity": "sha512-vjAczensTgRcqDERK0SR2XMwsF/tSvnvlv6VcF2GIhg6Sx4yOIt/irsr1RDJsKiIyBzJDpCoXiWWq28MqH2cnQ=="
|
| 4161 |
},
|
| 4162 |
"node_modules/debug": {
|
| 4163 |
+
"version": "4.3.7",
|
| 4164 |
+
"resolved": "https://registry.npmjs.org/debug/-/debug-4.3.7.tgz",
|
| 4165 |
+
"integrity": "sha512-Er2nc/H7RrMXZBFCEim6TCmMk02Z8vLC2Rbi1KEBggpo0fS6l0S1nnapwmIi3yW/+GOJap1Krg4w0Hg80oCqgQ==",
|
| 4166 |
"dependencies": {
|
| 4167 |
+
"ms": "^2.1.3"
|
| 4168 |
},
|
| 4169 |
"engines": {
|
| 4170 |
"node": ">=6.0"
|
|
|
|
| 4175 |
}
|
| 4176 |
}
|
| 4177 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4178 |
"node_modules/decode-uri-component": {
|
| 4179 |
"version": "0.2.2",
|
| 4180 |
"resolved": "https://registry.npmjs.org/decode-uri-component/-/decode-uri-component-0.2.2.tgz",
|
|
|
|
| 4354 |
}
|
| 4355 |
},
|
| 4356 |
"node_modules/devtools-protocol": {
|
| 4357 |
+
"version": "0.0.1330662",
|
| 4358 |
+
"resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1330662.tgz",
|
| 4359 |
+
"integrity": "sha512-pzh6YQ8zZfz3iKlCvgzVCu22NdpZ8hNmwU6WnQjNVquh0A9iVosPtNLWDwaWVGyrntQlltPFztTMK5Cg6lfCuw=="
|
| 4360 |
},
|
| 4361 |
"node_modules/diff-sequences": {
|
| 4362 |
"version": "29.6.3",
|
|
|
|
| 5456 |
"@google-cloud/storage": "^7.7.0"
|
| 5457 |
}
|
| 5458 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5459 |
"node_modules/firebase-functions": {
|
| 5460 |
"version": "4.9.0",
|
| 5461 |
"resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-4.9.0.tgz",
|
|
|
|
| 5780 |
}
|
| 5781 |
},
|
| 5782 |
"node_modules/gcp-metadata": {
|
| 5783 |
+
"version": "5.3.0",
|
| 5784 |
+
"resolved": "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-5.3.0.tgz",
|
| 5785 |
+
"integrity": "sha512-FNTkdNEnBdlqF2oatizolQqNANMrcqJt6AAYt99B3y1aLLC8Hc5IOBb+ZnnzllodEEf6xMBp6wRcBbc16fa65w==",
|
| 5786 |
+
"optional": true,
|
| 5787 |
+
"peer": true,
|
| 5788 |
"dependencies": {
|
| 5789 |
+
"gaxios": "^5.0.0",
|
| 5790 |
"json-bigint": "^1.0.0"
|
| 5791 |
},
|
| 5792 |
"engines": {
|
| 5793 |
+
"node": ">=12"
|
| 5794 |
+
}
|
| 5795 |
+
},
|
| 5796 |
+
"node_modules/gcp-metadata/node_modules/gaxios": {
|
| 5797 |
+
"version": "5.1.3",
|
| 5798 |
+
"resolved": "https://registry.npmjs.org/gaxios/-/gaxios-5.1.3.tgz",
|
| 5799 |
+
"integrity": "sha512-95hVgBRgEIRQQQHIbnxBXeHbW4TqFk4ZDJW7wmVtvYar72FdhRIo1UGOLS2eRAKCPEdPBWu+M7+A33D9CdX9rA==",
|
| 5800 |
+
"optional": true,
|
| 5801 |
+
"peer": true,
|
| 5802 |
+
"dependencies": {
|
| 5803 |
+
"extend": "^3.0.2",
|
| 5804 |
+
"https-proxy-agent": "^5.0.0",
|
| 5805 |
+
"is-stream": "^2.0.0",
|
| 5806 |
+
"node-fetch": "^2.6.9"
|
| 5807 |
+
},
|
| 5808 |
+
"engines": {
|
| 5809 |
+
"node": ">=12"
|
| 5810 |
}
|
| 5811 |
},
|
| 5812 |
"node_modules/generic-pool": {
|
|
|
|
| 6028 |
"node": ">=14"
|
| 6029 |
}
|
| 6030 |
},
|
| 6031 |
+
"node_modules/google-auth-library/node_modules/gcp-metadata": {
|
| 6032 |
+
"version": "6.1.0",
|
| 6033 |
+
"resolved": "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-6.1.0.tgz",
|
| 6034 |
+
"integrity": "sha512-Jh/AIwwgaxan+7ZUUmRLCjtchyDiqh4KjBJ5tW3plBZb5iL/BPcso8A5DlzeD9qlw0duCamnNdpFjxwaT0KyKg==",
|
| 6035 |
+
"dependencies": {
|
| 6036 |
+
"gaxios": "^6.0.0",
|
| 6037 |
+
"json-bigint": "^1.0.0"
|
| 6038 |
+
},
|
| 6039 |
+
"engines": {
|
| 6040 |
+
"node": ">=14"
|
| 6041 |
+
}
|
| 6042 |
+
},
|
| 6043 |
"node_modules/google-gax": {
|
| 6044 |
"version": "4.3.2",
|
| 6045 |
"resolved": "https://registry.npmjs.org/google-gax/-/google-gax-4.3.2.tgz",
|
|
|
|
| 6201 |
"node": ">= 0.4"
|
| 6202 |
}
|
| 6203 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6204 |
"node_modules/html-escaper": {
|
| 6205 |
"version": "2.0.2",
|
| 6206 |
"resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz",
|
|
|
|
| 6389 |
"version": "0.6.3",
|
| 6390 |
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
|
| 6391 |
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
|
| 6392 |
+
"optional": true,
|
| 6393 |
"dependencies": {
|
| 6394 |
"safer-buffer": ">= 2.1.2 < 3.0.0"
|
| 6395 |
},
|
|
|
|
| 6787 |
"node": ">=0.10.0"
|
| 6788 |
}
|
| 6789 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6790 |
"node_modules/is-regex": {
|
| 6791 |
"version": "1.1.4",
|
| 6792 |
"resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz",
|
|
|
|
| 7655 |
"node": ">=0.1.90"
|
| 7656 |
}
|
| 7657 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7658 |
"node_modules/jsesc": {
|
| 7659 |
"version": "2.5.2",
|
| 7660 |
"resolved": "https://registry.npmjs.org/jsesc/-/jsesc-2.5.2.tgz",
|
|
|
|
| 8073 |
"resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz",
|
| 8074 |
"integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg=="
|
| 8075 |
},
|
| 8076 |
+
"node_modules/linkedom": {
|
| 8077 |
+
"version": "0.18.4",
|
| 8078 |
+
"resolved": "https://registry.npmjs.org/linkedom/-/linkedom-0.18.4.tgz",
|
| 8079 |
+
"integrity": "sha512-JhLErxMIEOKByMi3fURXgI1fYOzR87L1Cn0+MI9GlMckFrqFZpV1SUGox1jcKtsKN3y6JgclcQf0FzZT//BuGw==",
|
| 8080 |
+
"dependencies": {
|
| 8081 |
+
"css-select": "^5.1.0",
|
| 8082 |
+
"cssom": "^0.5.0",
|
| 8083 |
+
"html-escaper": "^3.0.3",
|
| 8084 |
+
"htmlparser2": "^9.1.0",
|
| 8085 |
+
"uhyphen": "^0.2.0"
|
| 8086 |
+
}
|
| 8087 |
+
},
|
| 8088 |
+
"node_modules/linkedom/node_modules/html-escaper": {
|
| 8089 |
+
"version": "3.0.3",
|
| 8090 |
+
"resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-3.0.3.tgz",
|
| 8091 |
+
"integrity": "sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ=="
|
| 8092 |
+
},
|
| 8093 |
"node_modules/locate-path": {
|
| 8094 |
"version": "6.0.0",
|
| 8095 |
"resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz",
|
|
|
|
| 8269 |
"version": "1.5.0",
|
| 8270 |
"resolved": "https://registry.npmjs.org/memory-pager/-/memory-pager-1.5.0.tgz",
|
| 8271 |
"integrity": "sha512-ZS4Bp4r/Zoeq6+NLJpP+0Zzm0pR8whtGPf1XExKLJBAczGMnSi3It14OiNCStjQjM6NU1okjQGSxgEZN8eBYKg==",
|
|
|
|
| 8272 |
"peer": true
|
| 8273 |
},
|
| 8274 |
"node_modules/merge-deep": {
|
|
|
|
| 8497 |
}
|
| 8498 |
},
|
| 8499 |
"node_modules/mongodb": {
|
| 8500 |
+
"version": "6.8.0",
|
| 8501 |
+
"resolved": "https://registry.npmjs.org/mongodb/-/mongodb-6.8.0.tgz",
|
| 8502 |
+
"integrity": "sha512-HGQ9NWDle5WvwMnrvUxsFYPd3JEbqD3RgABHBQRuoCEND0qzhsd0iH5ypHsf1eJ+sXmvmyKpP+FLOKY8Il7jMw==",
|
| 8503 |
"peer": true,
|
| 8504 |
"dependencies": {
|
| 8505 |
+
"@mongodb-js/saslprep": "^1.1.5",
|
| 8506 |
+
"bson": "^6.7.0",
|
| 8507 |
+
"mongodb-connection-string-url": "^3.0.0"
|
| 8508 |
},
|
| 8509 |
"engines": {
|
| 8510 |
+
"node": ">=16.20.1"
|
|
|
|
|
|
|
|
|
|
| 8511 |
},
|
| 8512 |
"peerDependencies": {
|
| 8513 |
"@aws-sdk/credential-providers": "^3.188.0",
|
| 8514 |
+
"@mongodb-js/zstd": "^1.1.0",
|
| 8515 |
+
"gcp-metadata": "^5.2.0",
|
| 8516 |
+
"kerberos": "^2.0.1",
|
| 8517 |
+
"mongodb-client-encryption": ">=6.0.0 <7",
|
| 8518 |
+
"snappy": "^7.2.2",
|
| 8519 |
+
"socks": "^2.7.1"
|
| 8520 |
},
|
| 8521 |
"peerDependenciesMeta": {
|
| 8522 |
"@aws-sdk/credential-providers": {
|
|
|
|
| 8525 |
"@mongodb-js/zstd": {
|
| 8526 |
"optional": true
|
| 8527 |
},
|
| 8528 |
+
"gcp-metadata": {
|
| 8529 |
+
"optional": true
|
| 8530 |
+
},
|
| 8531 |
"kerberos": {
|
| 8532 |
"optional": true
|
| 8533 |
},
|
|
|
|
| 8536 |
},
|
| 8537 |
"snappy": {
|
| 8538 |
"optional": true
|
| 8539 |
+
},
|
| 8540 |
+
"socks": {
|
| 8541 |
+
"optional": true
|
| 8542 |
}
|
| 8543 |
}
|
| 8544 |
},
|
| 8545 |
"node_modules/mongodb-connection-string-url": {
|
| 8546 |
+
"version": "3.0.1",
|
| 8547 |
+
"resolved": "https://registry.npmjs.org/mongodb-connection-string-url/-/mongodb-connection-string-url-3.0.1.tgz",
|
| 8548 |
+
"integrity": "sha512-XqMGwRX0Lgn05TDB4PyG2h2kKO/FfWJyCzYQbIhXUxz7ETt0I/FqHjUeqj37irJ+Dl1ZtU82uYyj14u2XsZKfg==",
|
| 8549 |
"peer": true,
|
| 8550 |
"dependencies": {
|
| 8551 |
+
"@types/whatwg-url": "^11.0.2",
|
| 8552 |
+
"whatwg-url": "^13.0.0"
|
| 8553 |
}
|
| 8554 |
},
|
| 8555 |
"node_modules/ms": {
|
| 8556 |
+
"version": "2.1.3",
|
| 8557 |
+
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
|
| 8558 |
+
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="
|
| 8559 |
},
|
| 8560 |
"node_modules/napi-build-utils": {
|
| 8561 |
"version": "1.0.2",
|
|
|
|
| 8774 |
"set-blocking": "^2.0.0"
|
| 8775 |
}
|
| 8776 |
},
|
| 8777 |
+
"node_modules/nth-check": {
|
| 8778 |
+
"version": "2.1.1",
|
| 8779 |
+
"resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz",
|
| 8780 |
+
"integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==",
|
| 8781 |
+
"dependencies": {
|
| 8782 |
+
"boolbase": "^1.0.0"
|
| 8783 |
+
},
|
| 8784 |
+
"funding": {
|
| 8785 |
+
"url": "https://github.com/fb55/nth-check?sponsor=1"
|
| 8786 |
+
}
|
| 8787 |
},
|
| 8788 |
"node_modules/object-assign": {
|
| 8789 |
"version": "4.1.1",
|
|
|
|
| 8953 |
"openai": "bin/cli"
|
| 8954 |
}
|
| 8955 |
},
|
| 8956 |
+
"node_modules/openai/node_modules/@types/node": {
|
| 8957 |
+
"version": "18.19.42",
|
| 8958 |
+
"resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.42.tgz",
|
| 8959 |
+
"integrity": "sha512-d2ZFc/3lnK2YCYhos8iaNIYu9Vfhr92nHiyJHRltXWjXUBjEE+A4I58Tdbnw4VhggSW+2j5y5gTrLs4biNnubg==",
|
| 8960 |
+
"dependencies": {
|
| 8961 |
+
"undici-types": "~5.26.4"
|
| 8962 |
+
}
|
| 8963 |
+
},
|
| 8964 |
"node_modules/optionator": {
|
| 8965 |
"version": "0.9.3",
|
| 8966 |
"resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.3.tgz",
|
|
|
|
| 9107 |
"url": "https://github.com/sponsors/sindresorhus"
|
| 9108 |
}
|
| 9109 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9110 |
"node_modules/parseurl": {
|
| 9111 |
"version": "1.3.3",
|
| 9112 |
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
|
|
|
|
| 9632 |
}
|
| 9633 |
},
|
| 9634 |
"node_modules/puppeteer": {
|
| 9635 |
+
"version": "23.3.0",
|
| 9636 |
+
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-23.3.0.tgz",
|
| 9637 |
+
"integrity": "sha512-e2jY8cdWSUGsrLxqGm3hIbJq/UIk1uOY8XY7SM51leXkH7shrIyE91lK90Q9byX6tte+cyL3HKqlWBEd6TjWTA==",
|
| 9638 |
"hasInstallScript": true,
|
| 9639 |
"dependencies": {
|
| 9640 |
+
"@puppeteer/browsers": "2.4.0",
|
| 9641 |
+
"chromium-bidi": "0.6.5",
|
| 9642 |
+
"cosmiconfig": "^9.0.0",
|
| 9643 |
+
"devtools-protocol": "0.0.1330662",
|
| 9644 |
+
"puppeteer-core": "23.3.0",
|
| 9645 |
+
"typed-query-selector": "^2.12.0"
|
| 9646 |
},
|
| 9647 |
"bin": {
|
| 9648 |
+
"puppeteer": "lib/cjs/puppeteer/node/cli.js"
|
| 9649 |
},
|
| 9650 |
"engines": {
|
| 9651 |
"node": ">=18"
|
| 9652 |
}
|
| 9653 |
},
|
| 9654 |
"node_modules/puppeteer-core": {
|
| 9655 |
+
"version": "23.3.0",
|
| 9656 |
+
"resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-23.3.0.tgz",
|
| 9657 |
+
"integrity": "sha512-sB2SsVMFs4gKad5OCdv6w5vocvtEUrRl0zQqSyRPbo/cj1Ktbarmhxy02Zyb9R9HrssBcJDZbkrvBnbaesPyYg==",
|
| 9658 |
"dependencies": {
|
| 9659 |
+
"@puppeteer/browsers": "2.4.0",
|
| 9660 |
+
"chromium-bidi": "0.6.5",
|
| 9661 |
+
"debug": "^4.3.6",
|
| 9662 |
+
"devtools-protocol": "0.0.1330662",
|
| 9663 |
+
"typed-query-selector": "^2.12.0",
|
| 9664 |
+
"ws": "^8.18.0"
|
| 9665 |
},
|
| 9666 |
"engines": {
|
| 9667 |
"node": ">=18"
|
|
|
|
| 10322 |
"url": "https://github.com/sponsors/isaacs"
|
| 10323 |
}
|
| 10324 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10325 |
"node_modules/run-parallel": {
|
| 10326 |
"version": "1.2.0",
|
| 10327 |
"resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz",
|
|
|
|
| 10418 |
"resolved": "https://registry.npmjs.org/sax/-/sax-1.3.0.tgz",
|
| 10419 |
"integrity": "sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA=="
|
| 10420 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10421 |
"node_modules/semver": {
|
| 10422 |
+
"version": "7.6.3",
|
| 10423 |
+
"resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz",
|
| 10424 |
+
"integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==",
|
|
|
|
|
|
|
|
|
|
| 10425 |
"bin": {
|
| 10426 |
"semver": "bin/semver.js"
|
| 10427 |
},
|
|
|
|
| 10429 |
"node": ">=10"
|
| 10430 |
}
|
| 10431 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10432 |
"node_modules/send": {
|
| 10433 |
"version": "0.18.0",
|
| 10434 |
"resolved": "https://registry.npmjs.org/send/-/send-0.18.0.tgz",
|
|
|
|
| 10476 |
"node": ">=4"
|
| 10477 |
}
|
| 10478 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10479 |
"node_modules/serve-static": {
|
| 10480 |
"version": "1.15.0",
|
| 10481 |
"resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.15.0.tgz",
|
|
|
|
| 10780 |
"version": "3.0.3",
|
| 10781 |
"resolved": "https://registry.npmjs.org/sparse-bitfield/-/sparse-bitfield-3.0.3.tgz",
|
| 10782 |
"integrity": "sha512-kvzhi7vqKTfkh0PZU+2D2PIllw2ymqJKujUcyPMd9Y75Nv4nPbGJZXNhxsgdQab2BmlDct1YnfQCguEvHr7VsQ==",
|
|
|
|
| 10783 |
"peer": true,
|
| 10784 |
"dependencies": {
|
| 10785 |
"memory-pager": "^1.0.2"
|
|
|
|
| 10861 |
}
|
| 10862 |
},
|
| 10863 |
"node_modules/streamx": {
|
| 10864 |
+
"version": "2.20.0",
|
| 10865 |
+
"resolved": "https://registry.npmjs.org/streamx/-/streamx-2.20.0.tgz",
|
| 10866 |
+
"integrity": "sha512-ZGd1LhDeGFucr1CUCTBOS58ZhEendd0ttpGT3usTvosS4ntIwKN9LJFp+OeCSprsCPL14BXVRZlHGRY1V9PVzQ==",
|
| 10867 |
"dependencies": {
|
| 10868 |
+
"fast-fifo": "^1.3.2",
|
| 10869 |
+
"queue-tick": "^1.0.1",
|
| 10870 |
+
"text-decoder": "^1.1.0"
|
| 10871 |
},
|
| 10872 |
"optionalDependencies": {
|
| 10873 |
"bare-events": "^2.2.0"
|
|
|
|
| 11054 |
"url": "https://github.com/sponsors/ljharb"
|
| 11055 |
}
|
| 11056 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11057 |
"node_modules/tar": {
|
| 11058 |
"version": "6.2.1",
|
| 11059 |
"resolved": "https://registry.npmjs.org/tar/-/tar-6.2.1.tgz",
|
|
|
|
| 11071 |
}
|
| 11072 |
},
|
| 11073 |
"node_modules/tar-fs": {
|
| 11074 |
+
"version": "3.0.6",
|
| 11075 |
+
"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.0.6.tgz",
|
| 11076 |
+
"integrity": "sha512-iokBDQQkUyeXhgPYaZxmczGPhnhXZ0CmrqI+MOb/WFGS9DW5wnfrLgtjUJBvz50vQ3qfRwJ62QVoCFu8mPVu5w==",
|
| 11077 |
"dependencies": {
|
| 11078 |
"pump": "^3.0.0",
|
| 11079 |
"tar-stream": "^3.1.5"
|
|
|
|
| 11162 |
"url": "https://github.com/sponsors/isaacs"
|
| 11163 |
}
|
| 11164 |
},
|
| 11165 |
+
"node_modules/text-decoder": {
|
| 11166 |
+
"version": "1.1.1",
|
| 11167 |
+
"resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.1.1.tgz",
|
| 11168 |
+
"integrity": "sha512-8zll7REEv4GDD3x4/0pW+ppIxSNs7H1J10IKFZsuOMscumCdM2a+toDGLPA3T+1+fLBql4zbt5z83GEQGGV5VA==",
|
| 11169 |
+
"dependencies": {
|
| 11170 |
+
"b4a": "^1.6.4"
|
| 11171 |
+
}
|
| 11172 |
+
},
|
| 11173 |
"node_modules/text-table": {
|
| 11174 |
"version": "0.2.0",
|
| 11175 |
"resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz",
|
|
|
|
| 11276 |
}
|
| 11277 |
},
|
| 11278 |
"node_modules/tr46": {
|
| 11279 |
+
"version": "4.1.1",
|
| 11280 |
+
"resolved": "https://registry.npmjs.org/tr46/-/tr46-4.1.1.tgz",
|
| 11281 |
+
"integrity": "sha512-2lv/66T7e5yNyhAAC4NaKe5nVavzuGJQVVtRYLyQ2OI8tsJ61PMLlelehb0wi2Hx6+hT/OJUWZcw8MjlSRnxvw==",
|
| 11282 |
"peer": true,
|
| 11283 |
"dependencies": {
|
| 11284 |
+
"punycode": "^2.3.0"
|
| 11285 |
},
|
| 11286 |
"engines": {
|
| 11287 |
+
"node": ">=14"
|
| 11288 |
}
|
| 11289 |
},
|
| 11290 |
"node_modules/ts-deepmerge": {
|
|
|
|
| 11520 |
"url": "https://github.com/sponsors/ljharb"
|
| 11521 |
}
|
| 11522 |
},
|
| 11523 |
+
"node_modules/typed-query-selector": {
|
| 11524 |
+
"version": "2.12.0",
|
| 11525 |
+
"resolved": "https://registry.npmjs.org/typed-query-selector/-/typed-query-selector-2.12.0.tgz",
|
| 11526 |
+
"integrity": "sha512-SbklCd1F0EiZOyPiW192rrHZzZ5sBijB6xM+cpmrwDqObvdtunOHHIk9fCGsoK5JVIYXoyEp4iEdE3upFH3PAg=="
|
| 11527 |
+
},
|
| 11528 |
"node_modules/typescript": {
|
| 11529 |
+
"version": "5.5.4",
|
| 11530 |
+
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.4.tgz",
|
| 11531 |
+
"integrity": "sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==",
|
| 11532 |
"devOptional": true,
|
| 11533 |
"bin": {
|
| 11534 |
"tsc": "bin/tsc",
|
|
|
|
| 11538 |
"node": ">=14.17"
|
| 11539 |
}
|
| 11540 |
},
|
| 11541 |
+
"node_modules/uhyphen": {
|
| 11542 |
+
"version": "0.2.0",
|
| 11543 |
+
"resolved": "https://registry.npmjs.org/uhyphen/-/uhyphen-0.2.0.tgz",
|
| 11544 |
+
"integrity": "sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA=="
|
| 11545 |
+
},
|
| 11546 |
"node_modules/unbox-primitive": {
|
| 11547 |
"version": "1.0.2",
|
| 11548 |
"resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.0.2.tgz",
|
|
|
|
| 11719 |
"node": ">= 0.8"
|
| 11720 |
}
|
| 11721 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11722 |
"node_modules/walker": {
|
| 11723 |
"version": "1.0.8",
|
| 11724 |
"resolved": "https://registry.npmjs.org/walker/-/walker-1.0.8.tgz",
|
|
|
|
| 11752 |
"version": "7.0.0",
|
| 11753 |
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz",
|
| 11754 |
"integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==",
|
| 11755 |
+
"peer": true,
|
| 11756 |
"engines": {
|
| 11757 |
"node": ">=12"
|
| 11758 |
}
|
|
|
|
| 11778 |
"node": ">=0.8.0"
|
| 11779 |
}
|
| 11780 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11781 |
"node_modules/whatwg-url": {
|
| 11782 |
+
"version": "13.0.0",
|
| 11783 |
+
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-13.0.0.tgz",
|
| 11784 |
+
"integrity": "sha512-9WWbymnqj57+XEuqADHrCJ2eSXzn8WXIW/YSGaZtb2WKAInQ6CHfaUUcTyyver0p8BDg5StLQq8h1vtZuwmOig==",
|
| 11785 |
"peer": true,
|
| 11786 |
"dependencies": {
|
| 11787 |
+
"tr46": "^4.1.1",
|
| 11788 |
"webidl-conversions": "^7.0.0"
|
| 11789 |
},
|
| 11790 |
"engines": {
|
| 11791 |
+
"node": ">=16"
|
| 11792 |
}
|
| 11793 |
},
|
| 11794 |
"node_modules/which": {
|
|
|
|
| 11884 |
}
|
| 11885 |
},
|
| 11886 |
"node_modules/ws": {
|
| 11887 |
+
"version": "8.18.0",
|
| 11888 |
+
"resolved": "https://registry.npmjs.org/ws/-/ws-8.18.0.tgz",
|
| 11889 |
+
"integrity": "sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==",
|
| 11890 |
"engines": {
|
| 11891 |
"node": ">=10.0.0"
|
| 11892 |
},
|
|
|
|
| 11908 |
"resolved": "https://registry.npmjs.org/xml/-/xml-1.0.1.tgz",
|
| 11909 |
"integrity": "sha512-huCv9IH9Tcf95zuYCsQraZtWnJvBtLVE0QHMOs8bWyZAFZNDcYjsPq1nEx8jKA9y+Beo9v+7OBPRisQTjinQMw=="
|
| 11910 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11911 |
"node_modules/xml2js": {
|
| 11912 |
"version": "0.5.0",
|
| 11913 |
"resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz",
|
|
|
|
| 11928 |
"node": ">=4.0"
|
| 11929 |
}
|
| 11930 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11931 |
"node_modules/y18n": {
|
| 11932 |
"version": "5.0.8",
|
| 11933 |
"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
|
|
|
|
| 12012 |
}
|
| 12013 |
},
|
| 12014 |
"node_modules/zod": {
|
| 12015 |
+
"version": "3.23.8",
|
| 12016 |
+
"resolved": "https://registry.npmjs.org/zod/-/zod-3.23.8.tgz",
|
| 12017 |
+
"integrity": "sha512-XBx9AXhXktjUqnepgTiE5flcKIYWi/rme0Eaj+5Y0lftuGBq+jyRu/md4WnuxqgP1ubdpNCsYEYPxrzVHD8d6g==",
|
| 12018 |
"funding": {
|
| 12019 |
"url": "https://github.com/sponsors/colinhacks"
|
| 12020 |
}
|
backend/functions/package.json
CHANGED
|
@@ -34,7 +34,7 @@
|
|
| 34 |
"archiver": "^6.0.1",
|
| 35 |
"axios": "^1.3.3",
|
| 36 |
"bcrypt": "^5.1.0",
|
| 37 |
-
"civkit": "^0.
|
| 38 |
"core-js": "^3.37.1",
|
| 39 |
"cors": "^2.8.5",
|
| 40 |
"dayjs": "^1.11.9",
|
|
@@ -43,13 +43,13 @@
|
|
| 43 |
"firebase-functions": "^4.9.0",
|
| 44 |
"htmlparser2": "^9.0.0",
|
| 45 |
"jose": "^5.1.0",
|
| 46 |
-
"jsdom": "^24.0.0",
|
| 47 |
"langdetect": "^0.2.1",
|
|
|
|
| 48 |
"maxmind": "^4.3.18",
|
| 49 |
"minio": "^7.1.3",
|
| 50 |
"openai": "^4.20.0",
|
| 51 |
"pdfjs-dist": "^4.2.67",
|
| 52 |
-
"puppeteer": "^
|
| 53 |
"puppeteer-extra": "^3.3.6",
|
| 54 |
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
| 55 |
"puppeteer-extra-plugin-page-proxy": "^2.0.0",
|
|
@@ -68,7 +68,7 @@
|
|
| 68 |
"@types/bcrypt": "^5.0.0",
|
| 69 |
"@types/cors": "^2.8.17",
|
| 70 |
"@types/generic-pool": "^3.8.1",
|
| 71 |
-
"@types/node": "^
|
| 72 |
"@types/set-cookie-parser": "^2.4.7",
|
| 73 |
"@typescript-eslint/eslint-plugin": "^5.12.0",
|
| 74 |
"@typescript-eslint/parser": "^5.12.0",
|
|
@@ -77,7 +77,7 @@
|
|
| 77 |
"eslint-plugin-import": "^2.25.4",
|
| 78 |
"firebase-functions-test": "^3.0.0",
|
| 79 |
"replicate": "^0.16.1",
|
| 80 |
-
"typescript": "^5.
|
| 81 |
},
|
| 82 |
"private": true,
|
| 83 |
"exports": {
|
|
|
|
| 34 |
"archiver": "^6.0.1",
|
| 35 |
"axios": "^1.3.3",
|
| 36 |
"bcrypt": "^5.1.0",
|
| 37 |
+
"civkit": "^0.7.0-0f8889a",
|
| 38 |
"core-js": "^3.37.1",
|
| 39 |
"cors": "^2.8.5",
|
| 40 |
"dayjs": "^1.11.9",
|
|
|
|
| 43 |
"firebase-functions": "^4.9.0",
|
| 44 |
"htmlparser2": "^9.0.0",
|
| 45 |
"jose": "^5.1.0",
|
|
|
|
| 46 |
"langdetect": "^0.2.1",
|
| 47 |
+
"linkedom": "^0.18.4",
|
| 48 |
"maxmind": "^4.3.18",
|
| 49 |
"minio": "^7.1.3",
|
| 50 |
"openai": "^4.20.0",
|
| 51 |
"pdfjs-dist": "^4.2.67",
|
| 52 |
+
"puppeteer": "^23.3.0",
|
| 53 |
"puppeteer-extra": "^3.3.6",
|
| 54 |
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
| 55 |
"puppeteer-extra-plugin-page-proxy": "^2.0.0",
|
|
|
|
| 68 |
"@types/bcrypt": "^5.0.0",
|
| 69 |
"@types/cors": "^2.8.17",
|
| 70 |
"@types/generic-pool": "^3.8.1",
|
| 71 |
+
"@types/node": "^20.14.13",
|
| 72 |
"@types/set-cookie-parser": "^2.4.7",
|
| 73 |
"@typescript-eslint/eslint-plugin": "^5.12.0",
|
| 74 |
"@typescript-eslint/parser": "^5.12.0",
|
|
|
|
| 77 |
"eslint-plugin-import": "^2.25.4",
|
| 78 |
"firebase-functions-test": "^3.0.0",
|
| 79 |
"replicate": "^0.16.1",
|
| 80 |
+
"typescript": "^5.5.4"
|
| 81 |
},
|
| 82 |
"private": true,
|
| 83 |
"exports": {
|
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
import {
|
| 2 |
assignTransferProtocolMeta, marshalErrorLike,
|
| 3 |
RPCHost, RPCReflection,
|
| 4 |
-
HashManager,
|
| 5 |
AssertionFailureError, ParamValidationError, Defer,
|
| 6 |
} from 'civkit';
|
| 7 |
import { singleton } from 'tsyringe';
|
|
@@ -11,22 +10,17 @@ import _ from 'lodash';
|
|
| 11 |
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
| 12 |
import { Request, Response } from 'express';
|
| 13 |
const pNormalizeUrl = import("@esm2cjs/normalize-url");
|
| 14 |
-
import { AltTextService } from '../services/alt-text';
|
| 15 |
-
import TurndownService from 'turndown';
|
| 16 |
import { Crawled } from '../db/crawled';
|
| 17 |
-
import { cleanAttribute } from '../utils/misc';
|
| 18 |
import { randomUUID } from 'crypto';
|
| 19 |
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
| 20 |
|
| 21 |
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
| 22 |
import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
|
| 23 |
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
| 24 |
-
import { PDFExtractor } from '../services/pdf-extract';
|
| 25 |
import { DomainBlockade } from '../db/domain-blockade';
|
| 26 |
import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
|
| 27 |
import { JSDomControl } from '../services/jsdom';
|
| 28 |
-
|
| 29 |
-
const md5Hasher = new HashManager('md5', 'hex');
|
| 30 |
|
| 31 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 32 |
withIframe?: boolean;
|
|
@@ -35,29 +29,6 @@ export interface ExtraScrappingOptions extends ScrappingOptions {
|
|
| 35 |
keepImgDataUrl?: boolean;
|
| 36 |
}
|
| 37 |
|
| 38 |
-
export interface FormattedPage {
|
| 39 |
-
title?: string;
|
| 40 |
-
description?: string;
|
| 41 |
-
url?: string;
|
| 42 |
-
content?: string;
|
| 43 |
-
publishedTime?: string;
|
| 44 |
-
html?: string;
|
| 45 |
-
text?: string;
|
| 46 |
-
screenshotUrl?: string;
|
| 47 |
-
screenshot?: Buffer;
|
| 48 |
-
pageshotUrl?: string;
|
| 49 |
-
pageshot?: Buffer;
|
| 50 |
-
links?: { [k: string]: string; };
|
| 51 |
-
images?: { [k: string]: string; };
|
| 52 |
-
usage?: {
|
| 53 |
-
total_tokens?: number;
|
| 54 |
-
totalTokens?: number;
|
| 55 |
-
tokens?: number;
|
| 56 |
-
};
|
| 57 |
-
|
| 58 |
-
toString: () => string;
|
| 59 |
-
}
|
| 60 |
-
|
| 61 |
const indexProto = {
|
| 62 |
toString: function (): string {
|
| 63 |
return _(this)
|
|
@@ -72,8 +43,6 @@ const indexProto = {
|
|
| 72 |
export class CrawlerHost extends RPCHost {
|
| 73 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 74 |
|
| 75 |
-
turnDownPlugins = [require('turndown-plugin-gfm').tables];
|
| 76 |
-
|
| 77 |
cacheRetentionMs = 1000 * 3600 * 24 * 7;
|
| 78 |
cacheValidMs = 1000 * 3600;
|
| 79 |
urlValidMs = 1000 * 3600 * 4;
|
|
@@ -83,8 +52,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 83 |
protected globalLogger: Logger,
|
| 84 |
protected puppeteerControl: PuppeteerControl,
|
| 85 |
protected jsdomControl: JSDomControl,
|
| 86 |
-
protected altTextService: AltTextService,
|
| 87 |
-
protected pdfExtractor: PDFExtractor,
|
| 88 |
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
| 89 |
protected rateLimitControl: RateLimitControl,
|
| 90 |
protected threadLocal: AsyncContext,
|
|
@@ -148,448 +116,6 @@ export class CrawlerHost extends RPCHost {
|
|
| 148 |
return indexObject;
|
| 149 |
}
|
| 150 |
|
| 151 |
-
getTurndown(options?: {
|
| 152 |
-
noRules?: boolean | string,
|
| 153 |
-
url?: string | URL;
|
| 154 |
-
imgDataUrlToObjectUrl?: boolean;
|
| 155 |
-
}) {
|
| 156 |
-
const turnDownService = new TurndownService({
|
| 157 |
-
codeBlockStyle: 'fenced',
|
| 158 |
-
preformattedCode: true,
|
| 159 |
-
} as any);
|
| 160 |
-
if (!options?.noRules) {
|
| 161 |
-
turnDownService.addRule('remove-irrelevant', {
|
| 162 |
-
filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'],
|
| 163 |
-
replacement: () => ''
|
| 164 |
-
});
|
| 165 |
-
turnDownService.addRule('truncate-svg', {
|
| 166 |
-
filter: 'svg' as any,
|
| 167 |
-
replacement: () => ''
|
| 168 |
-
});
|
| 169 |
-
turnDownService.addRule('title-as-h1', {
|
| 170 |
-
filter: ['title'],
|
| 171 |
-
replacement: (innerText) => `${innerText}\n===============\n`
|
| 172 |
-
});
|
| 173 |
-
}
|
| 174 |
-
|
| 175 |
-
if (options?.imgDataUrlToObjectUrl) {
|
| 176 |
-
turnDownService.addRule('data-url-to-pseudo-object-url', {
|
| 177 |
-
filter: (node) => Boolean(node.tagName === 'IMG' && node.getAttribute('src')?.startsWith('data:')),
|
| 178 |
-
replacement: (_content, node: any) => {
|
| 179 |
-
const src = (node.getAttribute('src') || '').trim();
|
| 180 |
-
const alt = cleanAttribute(node.getAttribute('alt')) || '';
|
| 181 |
-
|
| 182 |
-
if (options.url) {
|
| 183 |
-
const refUrl = new URL(options.url);
|
| 184 |
-
const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(src)}`);
|
| 185 |
-
|
| 186 |
-
return `![${alt}](${mappedUrl})`;
|
| 187 |
-
}
|
| 188 |
-
|
| 189 |
-
return `![${alt}](blob:${md5Hasher.hash(src)})`;
|
| 190 |
-
}
|
| 191 |
-
});
|
| 192 |
-
}
|
| 193 |
-
|
| 194 |
-
turnDownService.addRule('improved-paragraph', {
|
| 195 |
-
filter: 'p',
|
| 196 |
-
replacement: (innerText) => {
|
| 197 |
-
const trimmed = innerText.trim();
|
| 198 |
-
if (!trimmed) {
|
| 199 |
-
return '';
|
| 200 |
-
}
|
| 201 |
-
|
| 202 |
-
return `${trimmed.replace(/\n{3,}/g, '\n\n')}\n\n`;
|
| 203 |
-
}
|
| 204 |
-
});
|
| 205 |
-
turnDownService.addRule('improved-inline-link', {
|
| 206 |
-
filter: function (node, options) {
|
| 207 |
-
return Boolean(
|
| 208 |
-
options.linkStyle === 'inlined' &&
|
| 209 |
-
node.nodeName === 'A' &&
|
| 210 |
-
node.getAttribute('href')
|
| 211 |
-
);
|
| 212 |
-
},
|
| 213 |
-
|
| 214 |
-
replacement: function (content, node: any) {
|
| 215 |
-
let href = node.getAttribute('href');
|
| 216 |
-
if (href) href = href.replace(/([()])/g, '\\$1');
|
| 217 |
-
let title = cleanAttribute(node.getAttribute('title'));
|
| 218 |
-
if (title) title = ' "' + title.replace(/"/g, '\\"') + '"';
|
| 219 |
-
|
| 220 |
-
const fixedContent = content.replace(/\s+/g, ' ').trim();
|
| 221 |
-
let fixedHref = href.replace(/\s+/g, '').trim();
|
| 222 |
-
if (options?.url) {
|
| 223 |
-
try {
|
| 224 |
-
fixedHref = new URL(fixedHref, options.url).toString();
|
| 225 |
-
} catch (_err) {
|
| 226 |
-
void 0;
|
| 227 |
-
}
|
| 228 |
-
}
|
| 229 |
-
|
| 230 |
-
return `[${fixedContent}](${fixedHref}${title || ''})`;
|
| 231 |
-
}
|
| 232 |
-
});
|
| 233 |
-
turnDownService.addRule('improved-code', {
|
| 234 |
-
filter: function (node: any) {
|
| 235 |
-
let hasSiblings = node.previousSibling || node.nextSibling;
|
| 236 |
-
let isCodeBlock = node.parentNode.nodeName === 'PRE' && !hasSiblings;
|
| 237 |
-
|
| 238 |
-
return node.nodeName === 'CODE' && !isCodeBlock;
|
| 239 |
-
},
|
| 240 |
-
|
| 241 |
-
replacement: function (inputContent: any) {
|
| 242 |
-
if (!inputContent) return '';
|
| 243 |
-
let content = inputContent;
|
| 244 |
-
|
| 245 |
-
let delimiter = '`';
|
| 246 |
-
let matches = content.match(/`+/gm) || [];
|
| 247 |
-
while (matches.indexOf(delimiter) !== -1) delimiter = delimiter + '`';
|
| 248 |
-
if (content.includes('\n')) {
|
| 249 |
-
delimiter = '```';
|
| 250 |
-
}
|
| 251 |
-
|
| 252 |
-
let extraSpace = delimiter === '```' ? '\n' : /^`|^ .*?[^ ].* $|`$/.test(content) ? ' ' : '';
|
| 253 |
-
|
| 254 |
-
return delimiter + extraSpace + content + (delimiter === '```' && !content.endsWith(extraSpace) ? extraSpace : '') + delimiter;
|
| 255 |
-
}
|
| 256 |
-
});
|
| 257 |
-
|
| 258 |
-
return turnDownService;
|
| 259 |
-
}
|
| 260 |
-
|
| 261 |
-
getGeneralSnapshotMixins(snapshot: PageSnapshot) {
|
| 262 |
-
let inferred;
|
| 263 |
-
const mixin: any = {};
|
| 264 |
-
if (this.threadLocal.get('withImagesSummary')) {
|
| 265 |
-
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
| 266 |
-
const imageSummary = {} as { [k: string]: string; };
|
| 267 |
-
const imageIdxTrack = new Map<string, number[]>();
|
| 268 |
-
|
| 269 |
-
let imgIdx = 0;
|
| 270 |
-
|
| 271 |
-
for (const img of inferred.imgs) {
|
| 272 |
-
const imgSerial = ++imgIdx;
|
| 273 |
-
const idxArr = imageIdxTrack.has(img.src) ? imageIdxTrack.get(img.src)! : [];
|
| 274 |
-
idxArr.push(imgSerial);
|
| 275 |
-
imageIdxTrack.set(img.src, idxArr);
|
| 276 |
-
imageSummary[img.src] = img.alt || '';
|
| 277 |
-
}
|
| 278 |
-
|
| 279 |
-
mixin.images =
|
| 280 |
-
_(imageSummary)
|
| 281 |
-
.toPairs()
|
| 282 |
-
.map(
|
| 283 |
-
([url, alt], i) => {
|
| 284 |
-
return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
|
| 285 |
-
}
|
| 286 |
-
).fromPairs()
|
| 287 |
-
.value();
|
| 288 |
-
}
|
| 289 |
-
if (this.threadLocal.get('withLinksSummary')) {
|
| 290 |
-
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
| 291 |
-
mixin.links = _.invert(inferred.links || {});
|
| 292 |
-
}
|
| 293 |
-
|
| 294 |
-
return mixin;
|
| 295 |
-
}
|
| 296 |
-
|
| 297 |
-
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & {
|
| 298 |
-
screenshotUrl?: string;
|
| 299 |
-
pageshotUrl?: string;
|
| 300 |
-
}, nominalUrl?: URL) {
|
| 301 |
-
if (mode === 'screenshot') {
|
| 302 |
-
if (snapshot.screenshot && !snapshot.screenshotUrl) {
|
| 303 |
-
const fid = `instant-screenshots/${randomUUID()}`;
|
| 304 |
-
await this.firebaseObjectStorage.saveFile(fid, snapshot.screenshot, {
|
| 305 |
-
metadata: {
|
| 306 |
-
contentType: 'image/png',
|
| 307 |
-
}
|
| 308 |
-
});
|
| 309 |
-
snapshot.screenshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
|
| 310 |
-
}
|
| 311 |
-
|
| 312 |
-
return {
|
| 313 |
-
...this.getGeneralSnapshotMixins(snapshot),
|
| 314 |
-
// html: snapshot.html,
|
| 315 |
-
screenshotUrl: snapshot.screenshotUrl,
|
| 316 |
-
toString() {
|
| 317 |
-
return this.screenshotUrl;
|
| 318 |
-
}
|
| 319 |
-
} as FormattedPage;
|
| 320 |
-
}
|
| 321 |
-
if (mode === 'pageshot') {
|
| 322 |
-
if (snapshot.pageshot && !snapshot.pageshotUrl) {
|
| 323 |
-
const fid = `instant-screenshots/${randomUUID()}`;
|
| 324 |
-
await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, {
|
| 325 |
-
metadata: {
|
| 326 |
-
contentType: 'image/png',
|
| 327 |
-
}
|
| 328 |
-
});
|
| 329 |
-
snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
|
| 330 |
-
}
|
| 331 |
-
|
| 332 |
-
return {
|
| 333 |
-
...this.getGeneralSnapshotMixins(snapshot),
|
| 334 |
-
html: snapshot.html,
|
| 335 |
-
pageshotUrl: snapshot.pageshotUrl,
|
| 336 |
-
toString() {
|
| 337 |
-
return this.pageshotUrl;
|
| 338 |
-
}
|
| 339 |
-
} as FormattedPage;
|
| 340 |
-
}
|
| 341 |
-
if (mode === 'html') {
|
| 342 |
-
return {
|
| 343 |
-
...this.getGeneralSnapshotMixins(snapshot),
|
| 344 |
-
html: snapshot.html,
|
| 345 |
-
toString() {
|
| 346 |
-
return this.html;
|
| 347 |
-
}
|
| 348 |
-
} as FormattedPage;
|
| 349 |
-
}
|
| 350 |
-
|
| 351 |
-
let pdfMode = false;
|
| 352 |
-
if (snapshot.pdfs?.length && !snapshot.title) {
|
| 353 |
-
const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0],
|
| 354 |
-
this.threadLocal.get('cacheTolerance')
|
| 355 |
-
);
|
| 356 |
-
if (pdf) {
|
| 357 |
-
pdfMode = true;
|
| 358 |
-
snapshot.title = pdf.meta?.Title;
|
| 359 |
-
snapshot.text = pdf.text || snapshot.text;
|
| 360 |
-
snapshot.parsed = {
|
| 361 |
-
content: pdf.content,
|
| 362 |
-
textContent: pdf.content,
|
| 363 |
-
length: pdf.content?.length,
|
| 364 |
-
byline: pdf.meta?.Author,
|
| 365 |
-
lang: pdf.meta?.Language || undefined,
|
| 366 |
-
title: pdf.meta?.Title,
|
| 367 |
-
publishedTime: this.pdfExtractor.parsePdfDate(pdf.meta?.ModDate || pdf.meta?.CreationDate)?.toISOString(),
|
| 368 |
-
};
|
| 369 |
-
}
|
| 370 |
-
}
|
| 371 |
-
|
| 372 |
-
if (mode === 'text') {
|
| 373 |
-
return {
|
| 374 |
-
...this.getGeneralSnapshotMixins(snapshot),
|
| 375 |
-
text: snapshot.text,
|
| 376 |
-
toString() {
|
| 377 |
-
return this.text;
|
| 378 |
-
}
|
| 379 |
-
} as FormattedPage;
|
| 380 |
-
}
|
| 381 |
-
const imgDataUrlToObjectUrl = !Boolean(this.threadLocal.get('keepImgDataUrl'));
|
| 382 |
-
|
| 383 |
-
let contentText = '';
|
| 384 |
-
const imageSummary = {} as { [k: string]: string; };
|
| 385 |
-
const imageIdxTrack = new Map<string, number[]>();
|
| 386 |
-
const uid = this.threadLocal.get('uid');
|
| 387 |
-
do {
|
| 388 |
-
if (pdfMode) {
|
| 389 |
-
contentText = snapshot.parsed?.content || snapshot.text;
|
| 390 |
-
break;
|
| 391 |
-
}
|
| 392 |
-
|
| 393 |
-
if (
|
| 394 |
-
snapshot.maxElemDepth! > 256 ||
|
| 395 |
-
(!uid && snapshot.elemCount! > 10_000) ||
|
| 396 |
-
snapshot.elemCount! > 70_000
|
| 397 |
-
) {
|
| 398 |
-
this.logger.warn('Degrading to text to protect the server', { url: snapshot.href });
|
| 399 |
-
contentText = snapshot.text;
|
| 400 |
-
break;
|
| 401 |
-
}
|
| 402 |
-
|
| 403 |
-
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
|
| 404 |
-
let toBeTurnedToMd = jsDomElementOfHTML;
|
| 405 |
-
let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
| 406 |
-
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
| 407 |
-
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
| 408 |
-
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
| 409 |
-
const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
|
| 410 |
-
|
| 411 |
-
// If Readability did its job
|
| 412 |
-
if (par2.length >= 0.3 * par1.length) {
|
| 413 |
-
turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
| 414 |
-
if (snapshot.parsed.content) {
|
| 415 |
-
toBeTurnedToMd = jsDomElementOfParsed;
|
| 416 |
-
}
|
| 417 |
-
}
|
| 418 |
-
}
|
| 419 |
-
|
| 420 |
-
for (const plugin of this.turnDownPlugins) {
|
| 421 |
-
turnDownService = turnDownService.use(plugin);
|
| 422 |
-
}
|
| 423 |
-
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
| 424 |
-
if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
|
| 425 |
-
const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
|
| 426 |
-
const r = await this.altTextService.getAltText(x).catch((err: any) => {
|
| 427 |
-
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
| 428 |
-
return undefined;
|
| 429 |
-
});
|
| 430 |
-
if (r && x.src) {
|
| 431 |
-
urlToAltMap[x.src.trim()] = r;
|
| 432 |
-
}
|
| 433 |
-
});
|
| 434 |
-
|
| 435 |
-
await Promise.all(tasks);
|
| 436 |
-
}
|
| 437 |
-
let imgIdx = 0;
|
| 438 |
-
turnDownService.addRule('img-generated-alt', {
|
| 439 |
-
filter: 'img',
|
| 440 |
-
replacement: (_content, node: any) => {
|
| 441 |
-
let linkPreferredSrc = (node.getAttribute('src') || '').trim();
|
| 442 |
-
if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
|
| 443 |
-
const dataSrc = (node.getAttribute('data-src') || '').trim();
|
| 444 |
-
if (dataSrc && !dataSrc.startsWith('data:')) {
|
| 445 |
-
linkPreferredSrc = dataSrc;
|
| 446 |
-
}
|
| 447 |
-
}
|
| 448 |
-
|
| 449 |
-
let src;
|
| 450 |
-
try {
|
| 451 |
-
src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString();
|
| 452 |
-
} catch (_err) {
|
| 453 |
-
void 0;
|
| 454 |
-
}
|
| 455 |
-
const alt = cleanAttribute(node.getAttribute('alt'));
|
| 456 |
-
if (!src) {
|
| 457 |
-
return '';
|
| 458 |
-
}
|
| 459 |
-
const mapped = urlToAltMap[src];
|
| 460 |
-
const imgSerial = ++imgIdx;
|
| 461 |
-
const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
|
| 462 |
-
idxArr.push(imgSerial);
|
| 463 |
-
imageIdxTrack.set(src, idxArr);
|
| 464 |
-
|
| 465 |
-
if (mapped) {
|
| 466 |
-
imageSummary[src] = mapped || alt;
|
| 467 |
-
|
| 468 |
-
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
|
| 469 |
-
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
|
| 470 |
-
mappedUrl.protocol = 'blob:';
|
| 471 |
-
|
| 472 |
-
return `![Image ${imgIdx}: ${mapped || alt}](${mappedUrl})`;
|
| 473 |
-
}
|
| 474 |
-
|
| 475 |
-
return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
|
| 476 |
-
}
|
| 477 |
-
|
| 478 |
-
imageSummary[src] = alt || '';
|
| 479 |
-
|
| 480 |
-
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
|
| 481 |
-
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
|
| 482 |
-
mappedUrl.protocol = 'blob:';
|
| 483 |
-
|
| 484 |
-
return alt ? `![Image ${imgIdx}: ${alt}](${mappedUrl})` : `![Image ${imgIdx}](${mappedUrl})`;
|
| 485 |
-
}
|
| 486 |
-
|
| 487 |
-
return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`;
|
| 488 |
-
}
|
| 489 |
-
});
|
| 490 |
-
|
| 491 |
-
if (toBeTurnedToMd) {
|
| 492 |
-
try {
|
| 493 |
-
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
|
| 494 |
-
} catch (err) {
|
| 495 |
-
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 496 |
-
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
| 497 |
-
try {
|
| 498 |
-
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
|
| 499 |
-
} catch (err2) {
|
| 500 |
-
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
| 501 |
-
}
|
| 502 |
-
}
|
| 503 |
-
}
|
| 504 |
-
|
| 505 |
-
if (
|
| 506 |
-
!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
|
| 507 |
-
&& toBeTurnedToMd !== jsDomElementOfHTML
|
| 508 |
-
) {
|
| 509 |
-
try {
|
| 510 |
-
contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
|
| 511 |
-
} catch (err) {
|
| 512 |
-
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 513 |
-
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
| 514 |
-
try {
|
| 515 |
-
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
|
| 516 |
-
} catch (err2) {
|
| 517 |
-
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
| 518 |
-
}
|
| 519 |
-
}
|
| 520 |
-
}
|
| 521 |
-
if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) {
|
| 522 |
-
contentText = snapshot.text;
|
| 523 |
-
}
|
| 524 |
-
} while (false);
|
| 525 |
-
|
| 526 |
-
const cleanText = (contentText || '').trim();
|
| 527 |
-
|
| 528 |
-
const formatted: FormattedPage = {
|
| 529 |
-
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
| 530 |
-
url: nominalUrl?.toString() || snapshot.href?.trim(),
|
| 531 |
-
content: cleanText,
|
| 532 |
-
publishedTime: snapshot.parsed?.publishedTime || undefined,
|
| 533 |
-
|
| 534 |
-
toString() {
|
| 535 |
-
if (mode === 'markdown') {
|
| 536 |
-
return this.content as string;
|
| 537 |
-
}
|
| 538 |
-
|
| 539 |
-
const mixins = [];
|
| 540 |
-
if (this.publishedTime) {
|
| 541 |
-
mixins.push(`Published Time: ${this.publishedTime}`);
|
| 542 |
-
}
|
| 543 |
-
const suffixMixins = [];
|
| 544 |
-
if (this.images) {
|
| 545 |
-
const imageSummaryChunks = ['Images:'];
|
| 546 |
-
for (const [k, v] of Object.entries(this.images)) {
|
| 547 |
-
imageSummaryChunks.push(`- ![${k}](${v})`);
|
| 548 |
-
}
|
| 549 |
-
if (imageSummaryChunks.length === 1) {
|
| 550 |
-
imageSummaryChunks.push('This page does not seem to contain any images.');
|
| 551 |
-
}
|
| 552 |
-
suffixMixins.push(imageSummaryChunks.join('\n'));
|
| 553 |
-
}
|
| 554 |
-
if (this.links) {
|
| 555 |
-
const linkSummaryChunks = ['Links/Buttons:'];
|
| 556 |
-
for (const [k, v] of Object.entries(this.links)) {
|
| 557 |
-
linkSummaryChunks.push(`- [${k}](${v})`);
|
| 558 |
-
}
|
| 559 |
-
if (linkSummaryChunks.length === 1) {
|
| 560 |
-
linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
|
| 561 |
-
}
|
| 562 |
-
suffixMixins.push(linkSummaryChunks.join('\n'));
|
| 563 |
-
}
|
| 564 |
-
|
| 565 |
-
return `Title: ${this.title}
|
| 566 |
-
|
| 567 |
-
URL Source: ${this.url}
|
| 568 |
-
${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''}
|
| 569 |
-
Markdown Content:
|
| 570 |
-
${this.content}
|
| 571 |
-
${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
| 572 |
-
}
|
| 573 |
-
};
|
| 574 |
-
|
| 575 |
-
if (this.threadLocal.get('withImagesSummary')) {
|
| 576 |
-
formatted.images =
|
| 577 |
-
_(imageSummary)
|
| 578 |
-
.toPairs()
|
| 579 |
-
.map(
|
| 580 |
-
([url, alt], i) => {
|
| 581 |
-
return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
|
| 582 |
-
}
|
| 583 |
-
).fromPairs()
|
| 584 |
-
.value();
|
| 585 |
-
}
|
| 586 |
-
if (this.threadLocal.get('withLinksSummary')) {
|
| 587 |
-
formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {});
|
| 588 |
-
}
|
| 589 |
-
|
| 590 |
-
return formatted as FormattedPage;
|
| 591 |
-
}
|
| 592 |
-
|
| 593 |
@CloudHTTPv2({
|
| 594 |
name: 'crawl2',
|
| 595 |
runtime: {
|
|
@@ -604,7 +130,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 604 |
})
|
| 605 |
@CloudHTTPv2({
|
| 606 |
runtime: {
|
| 607 |
-
memory: '
|
| 608 |
cpu: 4,
|
| 609 |
timeoutSeconds: 300,
|
| 610 |
concurrency: 22,
|
|
@@ -723,7 +249,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 723 |
continue;
|
| 724 |
}
|
| 725 |
|
| 726 |
-
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
|
| 727 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 728 |
sseStream.write({
|
| 729 |
event: 'data',
|
|
@@ -754,7 +280,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 754 |
continue;
|
| 755 |
}
|
| 756 |
|
| 757 |
-
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
|
| 758 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 759 |
|
| 760 |
if (crawlerOptions.timeout === undefined) {
|
|
@@ -770,7 +296,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 770 |
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
| 771 |
}
|
| 772 |
|
| 773 |
-
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl);
|
| 774 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 775 |
|
| 776 |
return formatted;
|
|
@@ -782,24 +308,24 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 782 |
continue;
|
| 783 |
}
|
| 784 |
|
| 785 |
-
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
|
| 786 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 787 |
|
| 788 |
if (crawlerOptions.timeout === undefined) {
|
| 789 |
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 790 |
|
| 791 |
-
return assignTransferProtocolMeta(`${formatted}`,
|
| 792 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
| 793 |
);
|
| 794 |
}
|
| 795 |
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
| 796 |
|
| 797 |
-
return assignTransferProtocolMeta(`${formatted}`,
|
| 798 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
| 799 |
);
|
| 800 |
}
|
| 801 |
|
| 802 |
-
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
| 803 |
}
|
| 804 |
}
|
| 805 |
|
|
@@ -807,22 +333,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 807 |
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
| 808 |
}
|
| 809 |
|
| 810 |
-
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl);
|
| 811 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 812 |
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 813 |
|
| 814 |
-
return assignTransferProtocolMeta(`${formatted}`,
|
| 815 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
| 816 |
);
|
| 817 |
}
|
| 818 |
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
| 819 |
|
| 820 |
-
return assignTransferProtocolMeta(`${formatted}`,
|
| 821 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
| 822 |
);
|
| 823 |
}
|
| 824 |
|
| 825 |
-
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
| 826 |
}
|
| 827 |
|
| 828 |
async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
|
|
@@ -1181,7 +707,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 1181 |
|
| 1182 |
} catch (err) {
|
| 1183 |
if (lastSnapshot) {
|
| 1184 |
-
return this.formatSnapshot(mode, lastSnapshot, url);
|
| 1185 |
}
|
| 1186 |
|
| 1187 |
throw err;
|
|
@@ -1191,6 +717,6 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 1191 |
throw new AssertionFailureError(`No content available`);
|
| 1192 |
}
|
| 1193 |
|
| 1194 |
-
return this.formatSnapshot(mode, lastSnapshot, url);
|
| 1195 |
}
|
| 1196 |
}
|
|
|
|
| 1 |
import {
|
| 2 |
assignTransferProtocolMeta, marshalErrorLike,
|
| 3 |
RPCHost, RPCReflection,
|
|
|
|
| 4 |
AssertionFailureError, ParamValidationError, Defer,
|
| 5 |
} from 'civkit';
|
| 6 |
import { singleton } from 'tsyringe';
|
|
|
|
| 10 |
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
| 11 |
import { Request, Response } from 'express';
|
| 12 |
const pNormalizeUrl = import("@esm2cjs/normalize-url");
|
|
|
|
|
|
|
| 13 |
import { Crawled } from '../db/crawled';
|
|
|
|
| 14 |
import { randomUUID } from 'crypto';
|
| 15 |
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
| 16 |
|
| 17 |
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
| 18 |
import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
|
| 19 |
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
|
|
|
| 20 |
import { DomainBlockade } from '../db/domain-blockade';
|
| 21 |
import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
|
| 22 |
import { JSDomControl } from '../services/jsdom';
|
| 23 |
+
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
|
|
|
|
| 24 |
|
| 25 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 26 |
withIframe?: boolean;
|
|
|
|
| 29 |
keepImgDataUrl?: boolean;
|
| 30 |
}
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
const indexProto = {
|
| 33 |
toString: function (): string {
|
| 34 |
return _(this)
|
|
|
|
| 43 |
export class CrawlerHost extends RPCHost {
|
| 44 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 45 |
|
|
|
|
|
|
|
| 46 |
cacheRetentionMs = 1000 * 3600 * 24 * 7;
|
| 47 |
cacheValidMs = 1000 * 3600;
|
| 48 |
urlValidMs = 1000 * 3600 * 4;
|
|
|
|
| 52 |
protected globalLogger: Logger,
|
| 53 |
protected puppeteerControl: PuppeteerControl,
|
| 54 |
protected jsdomControl: JSDomControl,
|
| 55 |
+
protected snapshotFormatter: SnapshotFormatter,
|
|
|
|
| 56 |
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
| 57 |
protected rateLimitControl: RateLimitControl,
|
| 58 |
protected threadLocal: AsyncContext,
|
|
|
|
| 116 |
return indexObject;
|
| 117 |
}
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
@CloudHTTPv2({
|
| 120 |
name: 'crawl2',
|
| 121 |
runtime: {
|
|
|
|
| 130 |
})
|
| 131 |
@CloudHTTPv2({
|
| 132 |
runtime: {
|
| 133 |
+
memory: '8GiB',
|
| 134 |
cpu: 4,
|
| 135 |
timeoutSeconds: 300,
|
| 136 |
concurrency: 22,
|
|
|
|
| 249 |
continue;
|
| 250 |
}
|
| 251 |
|
| 252 |
+
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
| 253 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 254 |
sseStream.write({
|
| 255 |
event: 'data',
|
|
|
|
| 280 |
continue;
|
| 281 |
}
|
| 282 |
|
| 283 |
+
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
| 284 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 285 |
|
| 286 |
if (crawlerOptions.timeout === undefined) {
|
|
|
|
| 296 |
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
| 297 |
}
|
| 298 |
|
| 299 |
+
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
| 300 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 301 |
|
| 302 |
return formatted;
|
|
|
|
| 308 |
continue;
|
| 309 |
}
|
| 310 |
|
| 311 |
+
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
| 312 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 313 |
|
| 314 |
if (crawlerOptions.timeout === undefined) {
|
| 315 |
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 316 |
|
| 317 |
+
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
| 318 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
| 319 |
);
|
| 320 |
}
|
| 321 |
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
| 322 |
|
| 323 |
+
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
| 324 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
| 325 |
);
|
| 326 |
}
|
| 327 |
|
| 328 |
+
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
|
| 329 |
}
|
| 330 |
}
|
| 331 |
|
|
|
|
| 333 |
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
| 334 |
}
|
| 335 |
|
| 336 |
+
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
| 337 |
chargeAmount = this.assignChargeAmount(formatted);
|
| 338 |
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 339 |
|
| 340 |
+
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
| 341 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
| 342 |
);
|
| 343 |
}
|
| 344 |
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
| 345 |
|
| 346 |
+
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
| 347 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
| 348 |
);
|
| 349 |
}
|
| 350 |
|
| 351 |
+
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
|
| 352 |
}
|
| 353 |
|
| 354 |
async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
|
|
|
|
| 707 |
|
| 708 |
} catch (err) {
|
| 709 |
if (lastSnapshot) {
|
| 710 |
+
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
|
| 711 |
}
|
| 712 |
|
| 713 |
throw err;
|
|
|
|
| 717 |
throw new AssertionFailureError(`No content available`);
|
| 718 |
}
|
| 719 |
|
| 720 |
+
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
|
| 721 |
}
|
| 722 |
}
|
backend/functions/src/cloud-functions/data-crunching.ts
CHANGED
|
@@ -18,6 +18,7 @@ import { appendFile } from 'fs/promises';
|
|
| 18 |
import { createGzip } from 'zlib';
|
| 19 |
import { getFunctions } from 'firebase-admin/functions';
|
| 20 |
import { GoogleAuth } from 'google-auth-library';
|
|
|
|
| 21 |
|
| 22 |
dayjs.extend(require('dayjs/plugin/utc'));
|
| 23 |
|
|
@@ -57,6 +58,7 @@ export class DataCrunchingHost extends RPCHost {
|
|
| 57 |
protected globalLogger: Logger,
|
| 58 |
|
| 59 |
protected crawler: CrawlerHost,
|
|
|
|
| 60 |
protected tempFileManager: TempFileManager,
|
| 61 |
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
| 62 |
) {
|
|
@@ -265,9 +267,9 @@ export class DataCrunchingHost extends RPCHost {
|
|
| 265 |
try {
|
| 266 |
const snapshot = JSON.parse(snapshotTxt.toString('utf-8'));
|
| 267 |
|
| 268 |
-
let formatted = await this.
|
| 269 |
if (!formatted.content) {
|
| 270 |
-
formatted = await this.
|
| 271 |
}
|
| 272 |
|
| 273 |
await nextDrainDeferred.promise;
|
|
|
|
| 18 |
import { createGzip } from 'zlib';
|
| 19 |
import { getFunctions } from 'firebase-admin/functions';
|
| 20 |
import { GoogleAuth } from 'google-auth-library';
|
| 21 |
+
import { SnapshotFormatter } from '../services/snapshot-formatter';
|
| 22 |
|
| 23 |
dayjs.extend(require('dayjs/plugin/utc'));
|
| 24 |
|
|
|
|
| 58 |
protected globalLogger: Logger,
|
| 59 |
|
| 60 |
protected crawler: CrawlerHost,
|
| 61 |
+
protected snapshotFormatter: SnapshotFormatter,
|
| 62 |
protected tempFileManager: TempFileManager,
|
| 63 |
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
| 64 |
) {
|
|
|
|
| 267 |
try {
|
| 268 |
const snapshot = JSON.parse(snapshotTxt.toString('utf-8'));
|
| 269 |
|
| 270 |
+
let formatted = await this.snapshotFormatter.formatSnapshot('default', snapshot);
|
| 271 |
if (!formatted.content) {
|
| 272 |
+
formatted = await this.snapshotFormatter.formatSnapshot('markdown', snapshot);
|
| 273 |
}
|
| 274 |
|
| 275 |
await nextDrainDeferred.promise;
|
backend/functions/src/cloud-functions/searcher.ts
CHANGED
|
@@ -11,11 +11,12 @@ import _ from 'lodash';
|
|
| 11 |
import { Request, Response } from 'express';
|
| 12 |
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
| 13 |
import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search';
|
| 14 |
-
import { CrawlerHost, ExtraScrappingOptions
|
| 15 |
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
| 16 |
import { SearchResult } from '../db/searched';
|
| 17 |
import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
|
| 18 |
import { CrawlerOptions } from '../dto/scrapping-options';
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
@singleton()
|
|
@@ -36,6 +37,7 @@ export class SearcherHost extends RPCHost {
|
|
| 36 |
protected threadLocal: AsyncContext,
|
| 37 |
protected braveSearchService: BraveSearchService,
|
| 38 |
protected crawler: CrawlerHost,
|
|
|
|
| 39 |
) {
|
| 40 |
super(...arguments);
|
| 41 |
}
|
|
@@ -324,7 +326,7 @@ export class SearcherHost extends RPCHost {
|
|
| 324 |
if (snapshotMap.has(x)) {
|
| 325 |
return snapshotMap.get(x);
|
| 326 |
}
|
| 327 |
-
return this.
|
| 328 |
r.title ??= upstreamSearchResult.title;
|
| 329 |
r.description = upstreamSearchResult.description;
|
| 330 |
snapshotMap.set(x, r);
|
|
|
|
| 11 |
import { Request, Response } from 'express';
|
| 12 |
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
| 13 |
import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search';
|
| 14 |
+
import { CrawlerHost, ExtraScrappingOptions } from './crawler';
|
| 15 |
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
| 16 |
import { SearchResult } from '../db/searched';
|
| 17 |
import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
|
| 18 |
import { CrawlerOptions } from '../dto/scrapping-options';
|
| 19 |
+
import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
|
| 20 |
|
| 21 |
|
| 22 |
@singleton()
|
|
|
|
| 37 |
protected threadLocal: AsyncContext,
|
| 38 |
protected braveSearchService: BraveSearchService,
|
| 39 |
protected crawler: CrawlerHost,
|
| 40 |
+
protected snapshotFormatter: SnapshotFormatter,
|
| 41 |
) {
|
| 42 |
super(...arguments);
|
| 43 |
}
|
|
|
|
| 326 |
if (snapshotMap.has(x)) {
|
| 327 |
return snapshotMap.get(x);
|
| 328 |
}
|
| 329 |
+
return this.snapshotFormatter.formatSnapshot(mode, x, urls[i]).then((r) => {
|
| 330 |
r.title ??= upstreamSearchResult.title;
|
| 331 |
r.description = upstreamSearchResult.description;
|
| 332 |
snapshotMap.set(x, r);
|
backend/functions/src/services/jsdom.ts
CHANGED
|
@@ -2,18 +2,19 @@ import { container, singleton } from 'tsyringe';
|
|
| 2 |
import { AsyncService, marshalErrorLike } from 'civkit';
|
| 3 |
import { Logger } from '../shared/services/logger';
|
| 4 |
import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
|
| 5 |
-
import { JSDOM, VirtualConsole } from 'jsdom';
|
| 6 |
import { Readability } from '@mozilla/readability';
|
| 7 |
import TurndownService from 'turndown';
|
|
|
|
| 8 |
|
| 9 |
-
const
|
| 10 |
-
virtualConsole.on('error', () => void 0);
|
| 11 |
|
| 12 |
@singleton()
|
| 13 |
export class JSDomControl extends AsyncService {
|
| 14 |
|
| 15 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 16 |
|
|
|
|
|
|
|
| 17 |
constructor(
|
| 18 |
protected globalLogger: Logger,
|
| 19 |
) {
|
|
@@ -22,22 +23,34 @@ export class JSDomControl extends AsyncService {
|
|
| 22 |
|
| 23 |
override async init() {
|
| 24 |
await this.dependencyReady();
|
|
|
|
| 25 |
this.emit('ready');
|
| 26 |
}
|
| 27 |
|
| 28 |
-
narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
|
| 29 |
targetSelector?: string | string[];
|
| 30 |
removeSelector?: string | string[];
|
| 31 |
withIframe?: boolean;
|
| 32 |
-
})
|
| 33 |
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe) {
|
| 34 |
return snapshot;
|
| 35 |
}
|
| 36 |
if (!snapshot?.html) {
|
| 37 |
return snapshot;
|
| 38 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
const t0 = Date.now();
|
| 40 |
-
const jsdom =
|
| 41 |
const allNodes: Node[] = [];
|
| 42 |
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
|
| 43 |
if (options?.withIframe) {
|
|
@@ -90,16 +103,16 @@ export class JSDomControl extends AsyncService {
|
|
| 90 |
let rootDoc: Document;
|
| 91 |
if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
|
| 92 |
rootDoc = allNodes[0] as any;
|
| 93 |
-
if (rootDoc.body.
|
| 94 |
-
textChunks.push(rootDoc.body.
|
| 95 |
}
|
| 96 |
} else {
|
| 97 |
-
rootDoc =
|
| 98 |
for (const n of allNodes) {
|
| 99 |
rootDoc.body.appendChild(n);
|
| 100 |
rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
|
| 101 |
-
if (n.
|
| 102 |
-
textChunks.push(n.
|
| 103 |
}
|
| 104 |
}
|
| 105 |
}
|
|
@@ -111,11 +124,6 @@ export class JSDomControl extends AsyncService {
|
|
| 111 |
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
|
| 112 |
}
|
| 113 |
|
| 114 |
-
// No innerText in jsdom
|
| 115 |
-
// https://github.com/jsdom/jsdom/issues/1245
|
| 116 |
-
const textContent = textChunks.join('\n\n');
|
| 117 |
-
const cleanedText = textContent?.split('\n').map((x: any) => x.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n');
|
| 118 |
-
|
| 119 |
const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
|
| 120 |
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
|
| 121 |
.flat()
|
|
@@ -135,7 +143,7 @@ export class JSDomControl extends AsyncService {
|
|
| 135 |
title: snapshot.title || jsdom.window.document.title,
|
| 136 |
parsed,
|
| 137 |
html: rootDoc.documentElement.outerHTML,
|
| 138 |
-
text:
|
| 139 |
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
|
| 140 |
} as PageSnapshot;
|
| 141 |
|
|
@@ -147,11 +155,13 @@ export class JSDomControl extends AsyncService {
|
|
| 147 |
return r;
|
| 148 |
}
|
| 149 |
|
|
|
|
| 150 |
inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
|
| 151 |
const t0 = Date.now();
|
| 152 |
const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
|
| 153 |
try {
|
| 154 |
-
const jsdom =
|
|
|
|
| 155 |
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
|
| 156 |
const links = Array.from(jsdom.window.document.querySelectorAll('a[href]'))
|
| 157 |
.map((x: any) => [x.getAttribute('href'), x.textContent.replace(/\s+/g, ' ').trim()])
|
|
@@ -207,9 +217,8 @@ export class JSDomControl extends AsyncService {
|
|
| 207 |
|
| 208 |
return extendedSnapshot;
|
| 209 |
}
|
| 210 |
-
|
| 211 |
snippetToElement(snippet?: string, url?: string) {
|
| 212 |
-
const parsed =
|
| 213 |
|
| 214 |
return parsed.window.document.documentElement;
|
| 215 |
}
|
|
|
|
| 2 |
import { AsyncService, marshalErrorLike } from 'civkit';
|
| 3 |
import { Logger } from '../shared/services/logger';
|
| 4 |
import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
|
|
|
|
| 5 |
import { Readability } from '@mozilla/readability';
|
| 6 |
import TurndownService from 'turndown';
|
| 7 |
+
import { Threaded } from '../shared/services/threaded';
|
| 8 |
|
| 9 |
+
const pLinkedom = import('linkedom');
|
|
|
|
| 10 |
|
| 11 |
@singleton()
|
| 12 |
export class JSDomControl extends AsyncService {
|
| 13 |
|
| 14 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 15 |
|
| 16 |
+
linkedom!: Awaited<typeof pLinkedom>;
|
| 17 |
+
|
| 18 |
constructor(
|
| 19 |
protected globalLogger: Logger,
|
| 20 |
) {
|
|
|
|
| 23 |
|
| 24 |
override async init() {
|
| 25 |
await this.dependencyReady();
|
| 26 |
+
this.linkedom = await pLinkedom;
|
| 27 |
this.emit('ready');
|
| 28 |
}
|
| 29 |
|
| 30 |
+
async narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
|
| 31 |
targetSelector?: string | string[];
|
| 32 |
removeSelector?: string | string[];
|
| 33 |
withIframe?: boolean;
|
| 34 |
+
}) {
|
| 35 |
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe) {
|
| 36 |
return snapshot;
|
| 37 |
}
|
| 38 |
if (!snapshot?.html) {
|
| 39 |
return snapshot;
|
| 40 |
}
|
| 41 |
+
|
| 42 |
+
return this.actualNarrowSnapshot(snapshot, options);
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
@Threaded()
|
| 46 |
+
async actualNarrowSnapshot(snapshot: PageSnapshot, options?: {
|
| 47 |
+
targetSelector?: string | string[];
|
| 48 |
+
removeSelector?: string | string[];
|
| 49 |
+
withIframe?: boolean;
|
| 50 |
+
}): Promise<PageSnapshot | undefined> {
|
| 51 |
+
|
| 52 |
const t0 = Date.now();
|
| 53 |
+
const jsdom = this.linkedom.parseHTML(snapshot.html);
|
| 54 |
const allNodes: Node[] = [];
|
| 55 |
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
|
| 56 |
if (options?.withIframe) {
|
|
|
|
| 103 |
let rootDoc: Document;
|
| 104 |
if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
|
| 105 |
rootDoc = allNodes[0] as any;
|
| 106 |
+
if (rootDoc.body.innerText) {
|
| 107 |
+
textChunks.push(rootDoc.body.innerText);
|
| 108 |
}
|
| 109 |
} else {
|
| 110 |
+
rootDoc = this.linkedom.parseHTML('<html><body></body></html>').window.document;
|
| 111 |
for (const n of allNodes) {
|
| 112 |
rootDoc.body.appendChild(n);
|
| 113 |
rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
|
| 114 |
+
if ((n as HTMLElement).innerText) {
|
| 115 |
+
textChunks.push((n as HTMLElement).innerText);
|
| 116 |
}
|
| 117 |
}
|
| 118 |
}
|
|
|
|
| 124 |
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
|
| 125 |
}
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
|
| 128 |
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
|
| 129 |
.flat()
|
|
|
|
| 143 |
title: snapshot.title || jsdom.window.document.title,
|
| 144 |
parsed,
|
| 145 |
html: rootDoc.documentElement.outerHTML,
|
| 146 |
+
text: textChunks.join('\n'),
|
| 147 |
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
|
| 148 |
} as PageSnapshot;
|
| 149 |
|
|
|
|
| 155 |
return r;
|
| 156 |
}
|
| 157 |
|
| 158 |
+
@Threaded()
|
| 159 |
inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
|
| 160 |
const t0 = Date.now();
|
| 161 |
const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
|
| 162 |
try {
|
| 163 |
+
const jsdom = this.linkedom.parseHTML(snapshot.html);
|
| 164 |
+
|
| 165 |
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
|
| 166 |
const links = Array.from(jsdom.window.document.querySelectorAll('a[href]'))
|
| 167 |
.map((x: any) => [x.getAttribute('href'), x.textContent.replace(/\s+/g, ' ').trim()])
|
|
|
|
| 217 |
|
| 218 |
return extendedSnapshot;
|
| 219 |
}
|
|
|
|
| 220 |
snippetToElement(snippet?: string, url?: string) {
|
| 221 |
+
const parsed = this.linkedom.parseHTML(snippet || '<html><body></body></html>');
|
| 222 |
|
| 223 |
return parsed.window.document.documentElement;
|
| 224 |
}
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import os from 'os';
|
| 2 |
import fs from 'fs';
|
| 3 |
import { container, singleton } from 'tsyringe';
|
| 4 |
-
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay,
|
| 5 |
import { Logger } from '../shared/services/logger';
|
| 6 |
|
| 7 |
import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
|
|
@@ -207,7 +207,6 @@ export class PuppeteerControl extends AsyncService {
|
|
| 207 |
browser!: Browser;
|
| 208 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 209 |
|
| 210 |
-
private __healthCheckInterval?: NodeJS.Timeout;
|
| 211 |
private __reqCapInterval?: NodeJS.Timeout;
|
| 212 |
|
| 213 |
__loadedPage: Page[] = [];
|
|
@@ -217,7 +216,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 217 |
livePages = new Set<Page>();
|
| 218 |
lastPageCratedAt: number = 0;
|
| 219 |
|
| 220 |
-
rpsCap: number =
|
| 221 |
lastReqSentAt: number = 0;
|
| 222 |
requestDeferredQueue: Deferred<boolean>[] = [];
|
| 223 |
|
|
@@ -235,15 +234,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 235 |
});
|
| 236 |
}
|
| 237 |
|
| 238 |
-
briefPages() {
|
| 239 |
-
this.logger.info(`Status: ${this.livePages.size} pages alive: ${Array.from(this.livePages).map((x) => this.snMap.get(x)).sort().join(', ')}; ${this.__loadedPage.length} idle pages: ${this.__loadedPage.map((x) => this.snMap.get(x)).sort().join(', ')}`);
|
| 240 |
-
}
|
| 241 |
-
|
| 242 |
override async init() {
|
| 243 |
-
if (this.__healthCheckInterval) {
|
| 244 |
-
clearInterval(this.__healthCheckInterval);
|
| 245 |
-
this.__healthCheckInterval = undefined;
|
| 246 |
-
}
|
| 247 |
if (this.__reqCapInterval) {
|
| 248 |
clearInterval(this.__reqCapInterval);
|
| 249 |
this.__reqCapInterval = undefined;
|
|
@@ -276,40 +267,9 @@ export class PuppeteerControl extends AsyncService {
|
|
| 276 |
|
| 277 |
this.emit('ready');
|
| 278 |
|
| 279 |
-
this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000).unref();
|
| 280 |
this.newPage().then((r) => this.__loadedPage.push(r));
|
| 281 |
}
|
| 282 |
|
| 283 |
-
@maxConcurrency(1)
|
| 284 |
-
async healthCheck() {
|
| 285 |
-
if (Date.now() - this.lastPageCratedAt <= 10_000) {
|
| 286 |
-
this.briefPages();
|
| 287 |
-
return;
|
| 288 |
-
}
|
| 289 |
-
const healthyPage = await this.newPage().catch((err) => {
|
| 290 |
-
this.logger.warn(`Health check failed`, { err: marshalErrorLike(err) });
|
| 291 |
-
return null;
|
| 292 |
-
});
|
| 293 |
-
|
| 294 |
-
if (healthyPage) {
|
| 295 |
-
this.__loadedPage.push(healthyPage);
|
| 296 |
-
|
| 297 |
-
if (this.__loadedPage.length > 3) {
|
| 298 |
-
this.ditchPage(this.__loadedPage.shift()!);
|
| 299 |
-
}
|
| 300 |
-
|
| 301 |
-
this.briefPages();
|
| 302 |
-
|
| 303 |
-
return;
|
| 304 |
-
}
|
| 305 |
-
|
| 306 |
-
this.logger.warn(`Trying to clean up...`);
|
| 307 |
-
this.browser.process()?.kill('SIGKILL');
|
| 308 |
-
Reflect.deleteProperty(this, 'browser');
|
| 309 |
-
this.emit('crippled');
|
| 310 |
-
this.logger.warn(`Browser killed`);
|
| 311 |
-
}
|
| 312 |
-
|
| 313 |
@perNextTick()
|
| 314 |
reqCapRoutine() {
|
| 315 |
const now = Date.now();
|
|
@@ -620,7 +580,7 @@ document.addEventListener('load', handlePageLoad);
|
|
| 620 |
try {
|
| 621 |
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 622 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 623 |
-
screenshot = await page.screenshot();
|
| 624 |
if (snapshot) {
|
| 625 |
snapshot.childFrames = await pSubFrameSnapshots;
|
| 626 |
}
|
|
@@ -643,8 +603,8 @@ document.addEventListener('load', handlePageLoad);
|
|
| 643 |
if (salvaged) {
|
| 644 |
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 645 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 646 |
-
screenshot = await page.screenshot();
|
| 647 |
-
pageshot = await page.screenshot({ fullPage: true });
|
| 648 |
if (snapshot) {
|
| 649 |
snapshot.childFrames = await pSubFrameSnapshots;
|
| 650 |
}
|
|
@@ -678,8 +638,8 @@ document.addEventListener('load', handlePageLoad);
|
|
| 678 |
.then(async () => {
|
| 679 |
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 680 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 681 |
-
screenshot = await page.screenshot();
|
| 682 |
-
pageshot = await page.screenshot({ fullPage: true });
|
| 683 |
if (snapshot) {
|
| 684 |
snapshot.childFrames = await pSubFrameSnapshots;
|
| 685 |
}
|
|
@@ -716,8 +676,8 @@ document.addEventListener('load', handlePageLoad);
|
|
| 716 |
break;
|
| 717 |
}
|
| 718 |
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
| 719 |
-
screenshot = await page.screenshot();
|
| 720 |
-
pageshot = await page.screenshot({ fullPage: true });
|
| 721 |
lastHTML = snapshot.html;
|
| 722 |
}
|
| 723 |
if (snapshot || screenshot) {
|
|
|
|
| 1 |
import os from 'os';
|
| 2 |
import fs from 'fs';
|
| 3 |
import { container, singleton } from 'tsyringe';
|
| 4 |
+
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick } from 'civkit';
|
| 5 |
import { Logger } from '../shared/services/logger';
|
| 6 |
|
| 7 |
import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
|
|
|
|
| 207 |
browser!: Browser;
|
| 208 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 209 |
|
|
|
|
| 210 |
private __reqCapInterval?: NodeJS.Timeout;
|
| 211 |
|
| 212 |
__loadedPage: Page[] = [];
|
|
|
|
| 216 |
livePages = new Set<Page>();
|
| 217 |
lastPageCratedAt: number = 0;
|
| 218 |
|
| 219 |
+
rpsCap: number = 500;
|
| 220 |
lastReqSentAt: number = 0;
|
| 221 |
requestDeferredQueue: Deferred<boolean>[] = [];
|
| 222 |
|
|
|
|
| 234 |
});
|
| 235 |
}
|
| 236 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
override async init() {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
if (this.__reqCapInterval) {
|
| 239 |
clearInterval(this.__reqCapInterval);
|
| 240 |
this.__reqCapInterval = undefined;
|
|
|
|
| 267 |
|
| 268 |
this.emit('ready');
|
| 269 |
|
|
|
|
| 270 |
this.newPage().then((r) => this.__loadedPage.push(r));
|
| 271 |
}
|
| 272 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
@perNextTick()
|
| 274 |
reqCapRoutine() {
|
| 275 |
const now = Date.now();
|
|
|
|
| 580 |
try {
|
| 581 |
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 582 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 583 |
+
screenshot = Buffer.from(await page.screenshot());
|
| 584 |
if (snapshot) {
|
| 585 |
snapshot.childFrames = await pSubFrameSnapshots;
|
| 586 |
}
|
|
|
|
| 603 |
if (salvaged) {
|
| 604 |
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 605 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 606 |
+
screenshot = Buffer.from(await page.screenshot());
|
| 607 |
+
pageshot = Buffer.from(await page.screenshot({ fullPage: true }));
|
| 608 |
if (snapshot) {
|
| 609 |
snapshot.childFrames = await pSubFrameSnapshots;
|
| 610 |
}
|
|
|
|
| 638 |
.then(async () => {
|
| 639 |
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 640 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 641 |
+
screenshot = Buffer.from(await page.screenshot());
|
| 642 |
+
pageshot = Buffer.from(await page.screenshot({ fullPage: true }));
|
| 643 |
if (snapshot) {
|
| 644 |
snapshot.childFrames = await pSubFrameSnapshots;
|
| 645 |
}
|
|
|
|
| 676 |
break;
|
| 677 |
}
|
| 678 |
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
| 679 |
+
screenshot = Buffer.from(await page.screenshot());
|
| 680 |
+
pageshot = Buffer.from(await page.screenshot({ fullPage: true }));
|
| 681 |
lastHTML = snapshot.html;
|
| 682 |
}
|
| 683 |
if (snapshot || screenshot) {
|
backend/functions/src/services/snapshot-formatter.ts
ADDED
|
@@ -0,0 +1,539 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { randomUUID } from 'crypto';
|
| 2 |
+
import { container, singleton } from 'tsyringe';
|
| 3 |
+
import { AsyncService, HashManager, marshalErrorLike } from 'civkit';
|
| 4 |
+
import TurndownService from 'turndown';
|
| 5 |
+
import { Logger } from '../shared/services/logger';
|
| 6 |
+
import { PageSnapshot } from './puppeteer';
|
| 7 |
+
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
| 8 |
+
import { AsyncContext } from '../shared/services/async-context';
|
| 9 |
+
import { Threaded } from '../shared/services/threaded';
|
| 10 |
+
import { JSDomControl } from './jsdom';
|
| 11 |
+
import { AltTextService } from './alt-text';
|
| 12 |
+
import { PDFExtractor } from './pdf-extract';
|
| 13 |
+
import { cleanAttribute } from '../utils/misc';
|
| 14 |
+
import _ from 'lodash';
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
export interface FormattedPage {
|
| 18 |
+
title?: string;
|
| 19 |
+
description?: string;
|
| 20 |
+
url?: string;
|
| 21 |
+
content?: string;
|
| 22 |
+
publishedTime?: string;
|
| 23 |
+
html?: string;
|
| 24 |
+
text?: string;
|
| 25 |
+
screenshotUrl?: string;
|
| 26 |
+
screenshot?: Buffer;
|
| 27 |
+
pageshotUrl?: string;
|
| 28 |
+
pageshot?: Buffer;
|
| 29 |
+
links?: { [k: string]: string; };
|
| 30 |
+
images?: { [k: string]: string; };
|
| 31 |
+
usage?: {
|
| 32 |
+
total_tokens?: number;
|
| 33 |
+
totalTokens?: number;
|
| 34 |
+
tokens?: number;
|
| 35 |
+
};
|
| 36 |
+
|
| 37 |
+
textRepresentation?: string;
|
| 38 |
+
|
| 39 |
+
[Symbol.dispose]: () => void;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
export const md5Hasher = new HashManager('md5', 'hex');
|
| 43 |
+
|
| 44 |
+
@singleton()
|
| 45 |
+
export class SnapshotFormatter extends AsyncService {
|
| 46 |
+
|
| 47 |
+
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 48 |
+
|
| 49 |
+
turnDownPlugins = [require('turndown-plugin-gfm').tables];
|
| 50 |
+
|
| 51 |
+
constructor(
|
| 52 |
+
protected globalLogger: Logger,
|
| 53 |
+
protected jsdomControl: JSDomControl,
|
| 54 |
+
protected altTextService: AltTextService,
|
| 55 |
+
protected pdfExtractor: PDFExtractor,
|
| 56 |
+
protected threadLocal: AsyncContext,
|
| 57 |
+
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
| 58 |
+
) {
|
| 59 |
+
super(...arguments);
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
override async init() {
|
| 63 |
+
await this.dependencyReady();
|
| 64 |
+
this.emit('ready');
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
/**
 * Turn a page snapshot into the response payload for the requested `mode`.
 *
 * - `screenshot` / `pageshot`: if the snapshot carries the image but no URL yet,
 *   the image is saved as `instant-screenshots/<uuid>` (PNG) and a signed
 *   download URL is stored back on the snapshot; the result exposes that URL
 *   (`pageshot` additionally includes the raw HTML).
 * - `html`: the raw snapshot HTML.
 * - `text`: the snapshot text.
 * - `markdown`: Turndown output of the full page HTML.
 * - any other value: Turndown output of the Readability extract
 *   (`snapshot.parsed.content`) when its markdown is at least 30% as long as
 *   the full-page markdown, otherwise of the full page; the text
 *   representation is wrapped with title / URL / summary sections.
 *
 * For every mode except screenshot/pageshot/html, a snapshot that has PDFs but
 * no title is first run through the PDF extractor, and the extracted
 * title/text/metadata overwrite the corresponding snapshot fields.
 *
 * Every result carries a non-enumerable `textRepresentation` property.
 *
 * @param mode Output mode, see above.
 * @param snapshot Page snapshot. Mutated in place: `screenshotUrl` /
 *   `pageshotUrl` may be filled in, and `title` / `text` / `parsed` are
 *   replaced when PDF content is extracted.
 * @param nominalUrl URL reported as the page URL and used as the base for
 *   resolving image sources when the snapshot has no `rebase`.
 * @param urlValidMs Added to `Date.now()` and passed to `signDownloadUrl`
 *   (presumably the expiry timestamp of the signed URL). Defaults to 4 hours.
 * @returns The formatted page.
 */
@Threaded()
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & {
    screenshotUrl?: string;
    pageshotUrl?: string;
}, nominalUrl?: URL, urlValidMs = 3600 * 1000 * 4) {
    const t0 = Date.now();
    // Every return path reports how long formatting took.
    const logElapsed = () => {
        const dt = Date.now() - t0;
        this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
    };

    if (mode === 'screenshot') {
        if (snapshot.screenshot && !snapshot.screenshotUrl) {
            const fid = `instant-screenshots/${randomUUID()}`;
            await this.firebaseObjectStorage.saveFile(fid, snapshot.screenshot, {
                metadata: {
                    contentType: 'image/png',
                }
            });
            snapshot.screenshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + urlValidMs);
        }

        const f = {
            ...this.getGeneralSnapshotMixins(snapshot),
            // html: snapshot.html,
            screenshotUrl: snapshot.screenshotUrl,
        };

        Object.defineProperty(f, 'textRepresentation', { value: `${f.screenshotUrl}\n`, enumerable: false });

        logElapsed();

        return f as FormattedPage;
    }
    if (mode === 'pageshot') {
        if (snapshot.pageshot && !snapshot.pageshotUrl) {
            const fid = `instant-screenshots/${randomUUID()}`;
            await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, {
                metadata: {
                    contentType: 'image/png',
                }
            });
            snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + urlValidMs);
        }

        const f = {
            ...this.getGeneralSnapshotMixins(snapshot),
            html: snapshot.html,
            pageshotUrl: snapshot.pageshotUrl,
        } as FormattedPage;

        Object.defineProperty(f, 'textRepresentation', { value: `${f.pageshotUrl}\n`, enumerable: false });

        logElapsed();

        return f;
    }
    if (mode === 'html') {
        const f = {
            ...this.getGeneralSnapshotMixins(snapshot),
            html: snapshot.html,
        } as FormattedPage;

        Object.defineProperty(f, 'textRepresentation', { value: snapshot.html, enumerable: false });

        logElapsed();

        return f;
    }

    // A snapshot with PDFs but no title is treated as a PDF document: the
    // extractor output replaces the snapshot's title, text and parsed fields.
    let pdfMode = false;
    if (snapshot.pdfs?.length && !snapshot.title) {
        const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0],
            this.threadLocal.get('cacheTolerance')
        );
        if (pdf) {
            pdfMode = true;
            snapshot.title = pdf.meta?.Title;
            snapshot.text = pdf.text || snapshot.text;
            snapshot.parsed = {
                content: pdf.content,
                textContent: pdf.content,
                length: pdf.content?.length,
                byline: pdf.meta?.Author,
                lang: pdf.meta?.Language || undefined,
                title: pdf.meta?.Title,
                publishedTime: this.pdfExtractor.parsePdfDate(pdf.meta?.ModDate || pdf.meta?.CreationDate)?.toISOString(),
            };
        }
    }

    if (mode === 'text') {
        const f = {
            ...this.getGeneralSnapshotMixins(snapshot),
            text: snapshot.text,
        } as FormattedPage;

        Object.defineProperty(f, 'textRepresentation', { value: snapshot.text, enumerable: false });

        logElapsed();

        return f;
    }
    const imgDataUrlToObjectUrl = !this.threadLocal.get('keepImgDataUrl');

    let contentText = '';
    const imageSummary = {} as { [k: string]: string; };
    const imageIdxTrack = new Map<string, number[]>();
    const uid = this.threadLocal.get('uid');
    // Single-pass block: `break` leaves as soon as contentText is decided.
    do {
        if (pdfMode) {
            contentText = snapshot.parsed?.content || snapshot.text;
            break;
        }

        // Very deep or very large DOMs skip markdown conversion altogether;
        // the element-count limit is lower when no uid is present.
        if (
            snapshot.maxElemDepth! > 256 ||
            (!uid && snapshot.elemCount! > 10_000) ||
            snapshot.elemCount! > 70_000
        ) {
            this.logger.warn('Degrading to text to protect the server', { url: snapshot.href });
            contentText = snapshot.text;
            break;
        }

        const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
        let toBeTurnedToMd = jsDomElementOfHTML;
        let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
        if (mode !== 'markdown' && snapshot.parsed?.content) {
            const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
            const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
            const par2 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed);

            // If Readability did its job: its markdown keeps at least 30% of
            // the full-page markdown, so convert the extract instead.
            if (par2.length >= 0.3 * par1.length) {
                turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
                toBeTurnedToMd = jsDomElementOfParsed;
            }
        }

        for (const plugin of this.turnDownPlugins) {
            turnDownService = turnDownService.use(plugin);
        }
        const urlToAltMap: { [k: string]: string | undefined; } = {};
        if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
            // One alt-text request per distinct src; a failed request is
            // logged and that image simply gets no generated alt.
            const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
                const r = await this.altTextService.getAltText(x).catch((err: any) => {
                    this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
                    return undefined;
                });
                if (r && x.src) {
                    urlToAltMap[x.src.trim()] = r;
                }
            });

            await Promise.all(tasks);
        }
        let imgIdx = 0;
        turnDownService.addRule('img-generated-alt', {
            filter: 'img',
            replacement: (_content, node: any) => {
                // Prefer `src`; fall back to `data-src` when `src` is empty or
                // an inline data: URL and `data-src` is a regular URL.
                let linkPreferredSrc = (node.getAttribute('src') || '').trim();
                if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
                    const dataSrc = (node.getAttribute('data-src') || '').trim();
                    if (dataSrc && !dataSrc.startsWith('data:')) {
                        linkPreferredSrc = dataSrc;
                    }
                }

                let src: string | undefined;
                try {
                    src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString();
                } catch (_err) {
                    // Unresolvable source: the image is dropped below.
                    void 0;
                }
                const alt = cleanAttribute(node.getAttribute('alt'));
                if (!src) {
                    return '';
                }
                const mapped = urlToAltMap[src];
                const imgSerial = ++imgIdx;
                const idxArr = imageIdxTrack.get(src) ?? [];
                idxArr.push(imgSerial);
                imageIdxTrack.set(src, idxArr);

                // Generated alt text wins over the element's own alt.
                const caption = mapped || alt;
                imageSummary[src] = caption || '';

                // Inline data: URLs are replaced by a short blob: pseudo URL
                // built from the page origin and the MD5 of the data URL.
                let target: string | URL = src;
                if (src.startsWith('data:') && imgDataUrlToObjectUrl) {
                    const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
                    mappedUrl.protocol = 'blob:';
                    target = mappedUrl;
                }

                return caption ? `![Image ${imgSerial}: ${caption}](${target})` : `![Image ${imgSerial}](${target})`;
            }
        });

        if (toBeTurnedToMd) {
            try {
                contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
            } catch (err) {
                this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
                const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
                try {
                    contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
                } catch (err2) {
                    this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
                }
            }
        }

        // Second attempt, this time on the raw HTML string.
        // NOTE(review): the parentheses spell out the precedence the original
        // expression already had (`&&` binds tighter than `||`), so an empty
        // result retries even when the full page was what just got converted.
        // The intent may have been `(empty || looksLikeHtml) && usedExtract`
        // - confirm before changing.
        if (
            !contentText ||
            ((contentText.startsWith('<') && contentText.endsWith('>')) && toBeTurnedToMd !== jsDomElementOfHTML)
        ) {
            try {
                contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
            } catch (err) {
                this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
                const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
                try {
                    contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
                } catch (err2) {
                    this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
                }
            }
        }
        // Last resort: output that is empty, starts with '<' or ends with '>'
        // is replaced by the plain text of the page.
        if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) {
            contentText = snapshot.text;
        }
    } while (false);

    const cleanText = (contentText || '').trim();

    const formatted: FormattedPage = {
        title: (snapshot.parsed?.title || snapshot.title || '').trim(),
        url: nominalUrl?.toString() || snapshot.href?.trim(),
        content: cleanText,
        publishedTime: snapshot.parsed?.publishedTime || undefined,
        [Symbol.dispose]: () => { },
    };

    if (this.threadLocal.get('withImagesSummary')) {
        // Keys look like "Image 1,4: alt text"; values are the image URLs.
        formatted.images =
            _(imageSummary)
                .toPairs()
                .map(
                    ([url, alt], i) => {
                        return [`Image ${(imageIdxTrack.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
                    }
                ).fromPairs()
                .value();
    }
    if (this.threadLocal.get('withLinksSummary')) {
        formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {});
    }

    const buildTextRepresentation = (): string => {
        if (mode === 'markdown') {
            return formatted.content as string;
        }

        const mixins = [];
        if (formatted.publishedTime) {
            mixins.push(`Published Time: ${formatted.publishedTime}`);
        }
        const suffixMixins = [];
        if (formatted.images) {
            const imageSummaryChunks = ['Images:'];
            for (const [k, v] of Object.entries(formatted.images)) {
                imageSummaryChunks.push(`- ![${k}](${v})`);
            }
            if (imageSummaryChunks.length === 1) {
                imageSummaryChunks.push('This page does not seem to contain any images.');
            }
            suffixMixins.push(imageSummaryChunks.join('\n'));
        }
        if (formatted.links) {
            const linkSummaryChunks = ['Links/Buttons:'];
            for (const [k, v] of Object.entries(formatted.links)) {
                linkSummaryChunks.push(`- [${k}](${v})`);
            }
            if (linkSummaryChunks.length === 1) {
                linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
            }
            suffixMixins.push(linkSummaryChunks.join('\n'));
        }

        // Joined with '\n' so the layout does not depend on source indentation.
        return [
            `Title: ${formatted.title}`,
            '',
            `URL Source: ${formatted.url}`,
            mixins.length ? `\n${mixins.join('\n\n')}\n` : '',
            'Markdown Content:',
            `${formatted.content}`,
            suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : '',
        ].join('\n');
    };

    Object.defineProperty(formatted, 'textRepresentation', { value: buildTextRepresentation(), enumerable: false });

    logElapsed();

    return formatted as FormattedPage;
}
|
| 387 |
+
|
| 388 |
+
/**
 * Build the optional summary fields shared by the simple output modes.
 *
 * - With `withImagesSummary` set in the thread-local context, `images` maps a
 *   label such as `Image 1,3: alt` (the 1-based positions at which that src
 *   occurs, plus its alt text when present) to the image src.
 * - With `withLinksSummary` set, `links` is the inverted form of the inferred
 *   snapshot's `links` map.
 *
 * `inferSnapshot` is called at most once, and only if a summary is requested.
 *
 * @param snapshot Page snapshot to summarise.
 * @returns An object holding `images` and/or `links`, or an empty object.
 */
getGeneralSnapshotMixins(snapshot: PageSnapshot) {
    const mixin: any = {};
    let inferred;

    if (this.threadLocal.get('withImagesSummary')) {
        inferred ??= this.jsdomControl.inferSnapshot(snapshot);

        const altBySrc: { [k: string]: string; } = {};
        const serialsBySrc = new Map<string, number[]>();
        let serial = 0;

        for (const img of inferred.imgs) {
            serial += 1;
            const serials = serialsBySrc.get(img.src) ?? [];
            serials.push(serial);
            serialsBySrc.set(img.src, serials);
            // A later occurrence of the same src overwrites the earlier alt.
            altBySrc[img.src] = img.alt || '';
        }

        mixin.images = _.fromPairs(
            _.toPairs(altBySrc).map(([url, alt], i): [string, string] => {
                const positions = (serialsBySrc.get(url) || [i + 1]).join(',');
                const suffix = alt ? `: ${alt}` : '';

                return [`Image ${positions}${suffix}`, url];
            })
        );
    }

    if (this.threadLocal.get('withLinksSummary')) {
        inferred ??= this.jsdomControl.inferSnapshot(snapshot);
        mixin.links = _.invert(inferred.links || {});
    }

    return mixin;
}
|
| 423 |
+
|
| 424 |
+
/**
 * Create a TurndownService (fenced code blocks, preformatted code) with this
 * formatter's custom rules registered.
 *
 * Rules:
 * - unless `noRules` is truthy: drop meta/style/script/noscript/link/textarea/
 *   select and svg elements, and render `<title>` as a setext-style heading;
 * - when `imgDataUrlToObjectUrl` is set: render `<img>` elements whose src is a
 *   `data:` URL as an image pointing at `blob:<origin>/<md5 of src>` (or
 *   `blob:<md5 of src>` when no `url` is given);
 * - always: paragraph, inline-link and inline-code rules.
 *
 * @param options.noRules Skip the element-removal and title rules.
 * @param options.url Base URL for resolving link hrefs and the origin used in
 *   blob: pseudo URLs.
 * @param options.imgDataUrlToObjectUrl Enable the data: URL replacement rule.
 * @returns The configured TurndownService.
 */
getTurndown(options?: {
    noRules?: boolean | string,
    url?: string | URL;
    imgDataUrlToObjectUrl?: boolean;
}) {
    const turnDownService = new TurndownService({
        codeBlockStyle: 'fenced',
        preformattedCode: true,
    } as any);
    if (!options?.noRules) {
        turnDownService.addRule('remove-irrelevant', {
            filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'],
            replacement: () => ''
        });
        turnDownService.addRule('truncate-svg', {
            filter: 'svg' as any,
            replacement: () => ''
        });
        turnDownService.addRule('title-as-h1', {
            filter: ['title'],
            replacement: (innerText) => `${innerText}\n===============\n`
        });
    }

    if (options?.imgDataUrlToObjectUrl) {
        const baseUrl = options.url;
        turnDownService.addRule('data-url-to-pseudo-object-url', {
            filter: (node) => Boolean(node.tagName === 'IMG' && node.getAttribute('src')?.startsWith('data:')),
            replacement: (_content, node: any) => {
                const src = (node.getAttribute('src') || '').trim();
                const alt = cleanAttribute(node.getAttribute('alt')) || '';

                if (baseUrl) {
                    const refUrl = new URL(baseUrl);
                    const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(src)}`);

                    return `![${alt}](${mappedUrl})`;
                }

                return `![${alt}](blob:${md5Hasher.hash(src)})`;
            }
        });
    }

    turnDownService.addRule('improved-paragraph', {
        filter: 'p',
        replacement: (innerText) => {
            const trimmed = innerText.trim();
            if (!trimmed) {
                return '';
            }

            // Collapse runs of three or more newlines into one blank line.
            return `${trimmed.replace(/\n{3,}/g, '\n\n')}\n\n`;
        }
    });
    turnDownService.addRule('improved-inline-link', {
        // The second parameter is Turndown's own options object, not the
        // `options` argument of getTurndown.
        filter: function (node, turndownOptions) {
            return Boolean(
                turndownOptions.linkStyle === 'inlined' &&
                node.nodeName === 'A' &&
                node.getAttribute('href')
            );
        },

        replacement: function (content, node: any) {
            let title = cleanAttribute(node.getAttribute('title'));
            if (title) title = ' "' + title.replace(/"/g, '\\"') + '"';

            const fixedContent = content.replace(/\s+/g, ' ').trim();

            // Whitespace is stripped from the href before anything else.
            const rawHref: string = (node.getAttribute('href') || '').replace(/\s+/g, '');
            let resolvedHref = rawHref;
            if (options?.url) {
                try {
                    resolvedHref = new URL(rawHref, options.url).toString();
                } catch (_err) {
                    // Unresolvable hrefs are emitted as written.
                    void 0;
                }
            }
            // Escape parentheses only after resolution: the URL parser turns a
            // backslash into '/' for http(s)-style URLs, so escaping first would
            // corrupt any href that contains '(' or ')'.
            const fixedHref = resolvedHref.replace(/([()])/g, '\\$1');

            return `[${fixedContent}](${fixedHref}${title || ''})`;
        }
    });
    turnDownService.addRule('improved-code', {
        // Matches every <code> element except one that is the only child of a <pre>.
        filter: function (node: any) {
            const hasSiblings = node.previousSibling || node.nextSibling;
            const isCodeBlock = node.parentNode.nodeName === 'PRE' && !hasSiblings;

            return node.nodeName === 'CODE' && !isCodeBlock;
        },

        replacement: function (inputContent: any) {
            if (!inputContent) return '';
            const content = inputContent;

            // Grow the backtick delimiter until it differs from every backtick
            // run inside the content; multi-line content always gets a fence.
            let delimiter = '`';
            const matches = content.match(/`+/gm) || [];
            while (matches.indexOf(delimiter) !== -1) delimiter = delimiter + '`';
            if (content.includes('\n')) {
                delimiter = '```';
            }

            const extraSpace = delimiter === '```' ? '\n' : /^`|^ .*?[^ ].* $|`$/.test(content) ? ' ' : '';

            return delimiter + extraSpace + content + (delimiter === '```' && !content.endsWith(extraSpace) ? extraSpace : '') + delimiter;
        }
    });

    return turnDownService;
}
|
| 533 |
+
|
| 534 |
+
|
| 535 |
+
}
|
| 536 |
+
|
| 537 |
+
// Instance obtained from the dependency-injection container; it is the module's default export.
const snapshotFormatter = container.resolve(SnapshotFormatter);
|
| 538 |
+
|
| 539 |
+
export default snapshotFormatter;
|