nomagick commited on
Commit
94170db
·
unverified ·
1 Parent(s): 5171e5f

fix: performance issue of jsdom

Browse files
backend/functions/package-lock.json CHANGED
@@ -14,7 +14,7 @@
14
  "archiver": "^6.0.1",
15
  "axios": "^1.3.3",
16
  "bcrypt": "^5.1.0",
17
- "civkit": "^0.6.5-047c0d8",
18
  "core-js": "^3.37.1",
19
  "cors": "^2.8.5",
20
  "dayjs": "^1.11.9",
@@ -23,13 +23,13 @@
23
  "firebase-functions": "^4.9.0",
24
  "htmlparser2": "^9.0.0",
25
  "jose": "^5.1.0",
26
- "jsdom": "^24.0.0",
27
  "langdetect": "^0.2.1",
 
28
  "maxmind": "^4.3.18",
29
  "minio": "^7.1.3",
30
  "openai": "^4.20.0",
31
  "pdfjs-dist": "^4.2.67",
32
- "puppeteer": "^22.7.1",
33
  "puppeteer-extra": "^3.3.6",
34
  "puppeteer-extra-plugin-block-resources": "^2.4.3",
35
  "puppeteer-extra-plugin-page-proxy": "^2.0.0",
@@ -48,7 +48,7 @@
48
  "@types/bcrypt": "^5.0.0",
49
  "@types/cors": "^2.8.17",
50
  "@types/generic-pool": "^3.8.1",
51
- "@types/node": "^18",
52
  "@types/set-cookie-parser": "^2.4.7",
53
  "@typescript-eslint/eslint-plugin": "^5.12.0",
54
  "@typescript-eslint/parser": "^5.12.0",
@@ -57,7 +57,7 @@
57
  "eslint-plugin-import": "^2.25.4",
58
  "firebase-functions-test": "^3.0.0",
59
  "replicate": "^0.16.1",
60
- "typescript": "^5.1.6"
61
  },
62
  "engines": {
63
  "node": "20"
@@ -1564,10 +1564,9 @@
1564
  }
1565
  },
1566
  "node_modules/@mongodb-js/saslprep": {
1567
- "version": "1.1.5",
1568
- "resolved": "https://registry.npmjs.org/@mongodb-js/saslprep/-/saslprep-1.1.5.tgz",
1569
- "integrity": "sha512-XLNOMH66KhJzUJNwT/qlMnS4WsNDWD5ASdyaSH3EtK+F4r/CFGa3jT4GNi4mfOitGvWXtdLgQJkQjxSVrio+jA==",
1570
- "optional": true,
1571
  "peer": true,
1572
  "dependencies": {
1573
  "sparse-bitfield": "^3.0.3"
@@ -1977,18 +1976,18 @@
1977
  "integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw=="
1978
  },
1979
  "node_modules/@puppeteer/browsers": {
1980
- "version": "2.2.3",
1981
- "resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.2.3.tgz",
1982
- "integrity": "sha512-bJ0UBsk0ESOs6RFcLXOt99a3yTDcOKlzfjad+rhFwdaG1Lu/Wzq58GHYCDTlZ9z6mldf4g+NTb+TXEfe0PpnsQ==",
1983
- "dependencies": {
1984
- "debug": "4.3.4",
1985
- "extract-zip": "2.0.1",
1986
- "progress": "2.0.3",
1987
- "proxy-agent": "6.4.0",
1988
- "semver": "7.6.0",
1989
- "tar-fs": "3.0.5",
1990
- "unbzip2-stream": "1.4.3",
1991
- "yargs": "17.7.2"
1992
  },
1993
  "bin": {
1994
  "browsers": "lib/cjs/main-cli.js"
@@ -2299,9 +2298,9 @@
2299
  "integrity": "sha512-nG96G3Wp6acyAgJqGasjODb+acrI7KltPiRxzHPXnP3NgI28bpQDRv53olbqGXbfcgF5aiiHmO3xpwEpS5Ld9g=="
2300
  },
2301
  "node_modules/@types/node": {
2302
- "version": "18.19.31",
2303
- "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.31.tgz",
2304
- "integrity": "sha512-ArgCD39YpyyrtFKIqMDvjz79jto5fcI/SVUs2HwB+f0dAzq68yqOdyaSivLiLugSziTpNXLQrVb7RZFmdZzbhA==",
2305
  "dependencies": {
2306
  "undici-types": "~5.26.4"
2307
  }
@@ -2424,12 +2423,11 @@
2424
  "peer": true
2425
  },
2426
  "node_modules/@types/whatwg-url": {
2427
- "version": "8.2.2",
2428
- "resolved": "https://registry.npmjs.org/@types/whatwg-url/-/whatwg-url-8.2.2.tgz",
2429
- "integrity": "sha512-FtQu10RWgn3D9U4aazdwIE2yzphmTJREDqNdODHrbrZmmMqI0vMheC/6NE/J1Yveaj8H+ela+YwWTjq5PGmuhA==",
2430
  "peer": true,
2431
  "dependencies": {
2432
- "@types/node": "*",
2433
  "@types/webidl-conversions": "*"
2434
  }
2435
  },
@@ -3227,31 +3225,41 @@
3227
  "optional": true
3228
  },
3229
  "node_modules/bare-fs": {
3230
- "version": "2.2.3",
3231
- "resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-2.2.3.tgz",
3232
- "integrity": "sha512-amG72llr9pstfXOBOHve1WjiuKKAMnebcmMbPWDZ7BCevAoJLpugjuAPRsDINEyjT0a6tbaVx3DctkXIRbLuJw==",
3233
  "optional": true,
3234
  "dependencies": {
3235
  "bare-events": "^2.0.0",
3236
  "bare-path": "^2.0.0",
3237
- "streamx": "^2.13.0"
3238
  }
3239
  },
3240
  "node_modules/bare-os": {
3241
- "version": "2.2.1",
3242
- "resolved": "https://registry.npmjs.org/bare-os/-/bare-os-2.2.1.tgz",
3243
- "integrity": "sha512-OwPyHgBBMkhC29Hl3O4/YfxW9n7mdTr2+SsO29XBWKKJsbgj3mnorDB80r5TiCQgQstgE5ga1qNYrpes6NvX2w==",
3244
  "optional": true
3245
  },
3246
  "node_modules/bare-path": {
3247
- "version": "2.1.1",
3248
- "resolved": "https://registry.npmjs.org/bare-path/-/bare-path-2.1.1.tgz",
3249
- "integrity": "sha512-OHM+iwRDRMDBsSW7kl3dO62JyHdBKO3B25FB9vNQBPcGHMo4+eA8Yj41Lfbk3pS/seDY+siNge0LdRTulAau/A==",
3250
  "optional": true,
3251
  "dependencies": {
3252
  "bare-os": "^2.1.0"
3253
  }
3254
  },
 
 
 
 
 
 
 
 
 
 
3255
  "node_modules/base32.js": {
3256
  "version": "0.1.0",
3257
  "resolved": "https://registry.npmjs.org/base32.js/-/base32.js-0.1.0.tgz",
@@ -3374,6 +3382,11 @@
3374
  "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
3375
  "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A=="
3376
  },
 
 
 
 
 
3377
  "node_modules/brace-expansion": {
3378
  "version": "1.1.11",
3379
  "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
@@ -3444,12 +3457,12 @@
3444
  }
3445
  },
3446
  "node_modules/bson": {
3447
- "version": "5.5.1",
3448
- "resolved": "https://registry.npmjs.org/bson/-/bson-5.5.1.tgz",
3449
- "integrity": "sha512-ix0EwukN2EpC0SRWIj/7B5+A6uQMQy6KMREI9qQqvgpkV2frH63T0UDVd1SYedL6dNCmDBYB3QtXi4ISk9YT+g==",
3450
  "peer": true,
3451
  "engines": {
3452
- "node": ">=14.20.1"
3453
  }
3454
  },
3455
  "node_modules/buffer": {
@@ -3659,13 +3672,13 @@
3659
  }
3660
  },
3661
  "node_modules/chromium-bidi": {
3662
- "version": "0.5.19",
3663
- "resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-0.5.19.tgz",
3664
- "integrity": "sha512-UA6zL77b7RYCjJkZBsZ0wlvCTD+jTjllZ8f6wdO4buevXgTZYjV+XLB9CiEa2OuuTGGTLnI7eN9I60YxuALGQg==",
3665
  "dependencies": {
3666
  "mitt": "3.0.1",
3667
  "urlpattern-polyfill": "10.0.0",
3668
- "zod": "3.22.4"
3669
  },
3670
  "peerDependencies": {
3671
  "devtools-protocol": "*"
@@ -3688,9 +3701,9 @@
3688
  }
3689
  },
3690
  "node_modules/civkit": {
3691
- "version": "0.6.5-047c0d8",
3692
- "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.6.5-047c0d8.tgz",
3693
- "integrity": "sha512-4FWHrkJQHbTD3wjNeihxOzm7GSgQa9BUgSvPOLsfKybeEw9Pv+I94uDUP8PczL1TpHO6hIbIE2KJjzSOx6PYqg==",
3694
  "dependencies": {
3695
  "lodash": "^4.17.21",
3696
  "tslib": "^2.5.0"
@@ -3719,11 +3732,13 @@
3719
  "pino": "^8.11.0",
3720
  "reflect-metadata": "^0.1.13",
3721
  "smtp-server": "^3.11.0",
3722
- "tld-extract": "^2.1.0"
 
 
3723
  },
3724
  "peerDependencies": {
3725
- "mongodb": "^5.2.0",
3726
- "tsyringe": "^4.7.0"
3727
  }
3728
  },
3729
  "node_modules/cjs-module-lexer": {
@@ -4049,17 +4064,37 @@
4049
  "node": ">= 8"
4050
  }
4051
  },
4052
- "node_modules/cssstyle": {
4053
- "version": "4.0.1",
4054
- "resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-4.0.1.tgz",
4055
- "integrity": "sha512-8ZYiJ3A/3OkDd093CBT/0UKDWry7ak4BdPTFP2+QEP7cmhouyq/Up709ASSj2cK02BbZiMgk7kYjZNS4QP5qrQ==",
4056
  "dependencies": {
4057
- "rrweb-cssom": "^0.6.0"
 
 
 
 
4058
  },
 
 
 
 
 
 
 
 
4059
  "engines": {
4060
- "node": ">=18"
 
 
 
4061
  }
4062
  },
 
 
 
 
 
4063
  "node_modules/data-uri-to-buffer": {
4064
  "version": "6.0.2",
4065
  "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
@@ -4068,41 +4103,6 @@
4068
  "node": ">= 14"
4069
  }
4070
  },
4071
- "node_modules/data-urls": {
4072
- "version": "5.0.0",
4073
- "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-5.0.0.tgz",
4074
- "integrity": "sha512-ZYP5VBHshaDAiVZxjbRVcFJpc+4xGgT0bK3vzy1HLN8jTO975HEbuYzZJcHoQEY5K1a0z8YayJkyVETa08eNTg==",
4075
- "dependencies": {
4076
- "whatwg-mimetype": "^4.0.0",
4077
- "whatwg-url": "^14.0.0"
4078
- },
4079
- "engines": {
4080
- "node": ">=18"
4081
- }
4082
- },
4083
- "node_modules/data-urls/node_modules/tr46": {
4084
- "version": "5.0.0",
4085
- "resolved": "https://registry.npmjs.org/tr46/-/tr46-5.0.0.tgz",
4086
- "integrity": "sha512-tk2G5R2KRwBd+ZN0zaEXpmzdKyOYksXwywulIX95MBODjSzMIuQnQ3m8JxgbhnL1LeVo7lqQKsYa1O3Htl7K5g==",
4087
- "dependencies": {
4088
- "punycode": "^2.3.1"
4089
- },
4090
- "engines": {
4091
- "node": ">=18"
4092
- }
4093
- },
4094
- "node_modules/data-urls/node_modules/whatwg-url": {
4095
- "version": "14.0.0",
4096
- "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.0.0.tgz",
4097
- "integrity": "sha512-1lfMEm2IEr7RIV+f4lUNPOqfFL+pO+Xw3fJSqmjX9AbXcXcYOkCe1P6+9VBZB6n94af16NfZf+sSk0JCBZC9aw==",
4098
- "dependencies": {
4099
- "tr46": "^5.0.0",
4100
- "webidl-conversions": "^7.0.0"
4101
- },
4102
- "engines": {
4103
- "node": ">=18"
4104
- }
4105
- },
4106
  "node_modules/data-view-buffer": {
4107
  "version": "1.0.1",
4108
  "resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.1.tgz",
@@ -4160,11 +4160,11 @@
4160
  "integrity": "sha512-vjAczensTgRcqDERK0SR2XMwsF/tSvnvlv6VcF2GIhg6Sx4yOIt/irsr1RDJsKiIyBzJDpCoXiWWq28MqH2cnQ=="
4161
  },
4162
  "node_modules/debug": {
4163
- "version": "4.3.4",
4164
- "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz",
4165
- "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==",
4166
  "dependencies": {
4167
- "ms": "2.1.2"
4168
  },
4169
  "engines": {
4170
  "node": ">=6.0"
@@ -4175,11 +4175,6 @@
4175
  }
4176
  }
4177
  },
4178
- "node_modules/decimal.js": {
4179
- "version": "10.4.3",
4180
- "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.4.3.tgz",
4181
- "integrity": "sha512-VBBaLc1MgL5XpzgIP7ny5Z6Nx3UrRkIViUkPUdtl9aya5amy3De1gsUUSB1g3+3sExYNjCAsAznmukyxCb1GRA=="
4182
- },
4183
  "node_modules/decode-uri-component": {
4184
  "version": "0.2.2",
4185
  "resolved": "https://registry.npmjs.org/decode-uri-component/-/decode-uri-component-0.2.2.tgz",
@@ -4359,9 +4354,9 @@
4359
  }
4360
  },
4361
  "node_modules/devtools-protocol": {
4362
- "version": "0.0.1273771",
4363
- "resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1273771.tgz",
4364
- "integrity": "sha512-QDbb27xcTVReQQW/GHJsdQqGKwYBE7re7gxehj467kKP2DKuYBUj6i2k5LRiAC66J1yZG/9gsxooz/s9pcm0Og=="
4365
  },
4366
  "node_modules/diff-sequences": {
4367
  "version": "29.6.3",
@@ -5461,14 +5456,6 @@
5461
  "@google-cloud/storage": "^7.7.0"
5462
  }
5463
  },
5464
- "node_modules/firebase-admin/node_modules/@types/node": {
5465
- "version": "20.12.7",
5466
- "resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.7.tgz",
5467
- "integrity": "sha512-wq0cICSkRLVaf3UGLMGItu/PtdY7oaXaI/RVU+xliKVOtRna3PRY57ZDfztpDL0n11vfymMUnXv8QwYCO7L1wg==",
5468
- "dependencies": {
5469
- "undici-types": "~5.26.4"
5470
- }
5471
- },
5472
  "node_modules/firebase-functions": {
5473
  "version": "4.9.0",
5474
  "resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-4.9.0.tgz",
@@ -5793,15 +5780,33 @@
5793
  }
5794
  },
5795
  "node_modules/gcp-metadata": {
5796
- "version": "6.1.0",
5797
- "resolved": "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-6.1.0.tgz",
5798
- "integrity": "sha512-Jh/AIwwgaxan+7ZUUmRLCjtchyDiqh4KjBJ5tW3plBZb5iL/BPcso8A5DlzeD9qlw0duCamnNdpFjxwaT0KyKg==",
 
 
5799
  "dependencies": {
5800
- "gaxios": "^6.0.0",
5801
  "json-bigint": "^1.0.0"
5802
  },
5803
  "engines": {
5804
- "node": ">=14"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5805
  }
5806
  },
5807
  "node_modules/generic-pool": {
@@ -6023,6 +6028,18 @@
6023
  "node": ">=14"
6024
  }
6025
  },
 
 
 
 
 
 
 
 
 
 
 
 
6026
  "node_modules/google-gax": {
6027
  "version": "4.3.2",
6028
  "resolved": "https://registry.npmjs.org/google-gax/-/google-gax-4.3.2.tgz",
@@ -6184,17 +6201,6 @@
6184
  "node": ">= 0.4"
6185
  }
6186
  },
6187
- "node_modules/html-encoding-sniffer": {
6188
- "version": "4.0.0",
6189
- "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-4.0.0.tgz",
6190
- "integrity": "sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==",
6191
- "dependencies": {
6192
- "whatwg-encoding": "^3.1.1"
6193
- },
6194
- "engines": {
6195
- "node": ">=18"
6196
- }
6197
- },
6198
  "node_modules/html-escaper": {
6199
  "version": "2.0.2",
6200
  "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz",
@@ -6383,6 +6389,7 @@
6383
  "version": "0.6.3",
6384
  "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
6385
  "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
 
6386
  "dependencies": {
6387
  "safer-buffer": ">= 2.1.2 < 3.0.0"
6388
  },
@@ -6780,11 +6787,6 @@
6780
  "node": ">=0.10.0"
6781
  }
6782
  },
6783
- "node_modules/is-potential-custom-element-name": {
6784
- "version": "1.0.1",
6785
- "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz",
6786
- "integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ=="
6787
- },
6788
  "node_modules/is-regex": {
6789
  "version": "1.1.4",
6790
  "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz",
@@ -7653,91 +7655,6 @@
7653
  "node": ">=0.1.90"
7654
  }
7655
  },
7656
- "node_modules/jsdom": {
7657
- "version": "24.0.0",
7658
- "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-24.0.0.tgz",
7659
- "integrity": "sha512-UDS2NayCvmXSXVP6mpTj+73JnNQadZlr9N68189xib2tx5Mls7swlTNao26IoHv46BZJFvXygyRtyXd1feAk1A==",
7660
- "dependencies": {
7661
- "cssstyle": "^4.0.1",
7662
- "data-urls": "^5.0.0",
7663
- "decimal.js": "^10.4.3",
7664
- "form-data": "^4.0.0",
7665
- "html-encoding-sniffer": "^4.0.0",
7666
- "http-proxy-agent": "^7.0.0",
7667
- "https-proxy-agent": "^7.0.2",
7668
- "is-potential-custom-element-name": "^1.0.1",
7669
- "nwsapi": "^2.2.7",
7670
- "parse5": "^7.1.2",
7671
- "rrweb-cssom": "^0.6.0",
7672
- "saxes": "^6.0.0",
7673
- "symbol-tree": "^3.2.4",
7674
- "tough-cookie": "^4.1.3",
7675
- "w3c-xmlserializer": "^5.0.0",
7676
- "webidl-conversions": "^7.0.0",
7677
- "whatwg-encoding": "^3.1.1",
7678
- "whatwg-mimetype": "^4.0.0",
7679
- "whatwg-url": "^14.0.0",
7680
- "ws": "^8.16.0",
7681
- "xml-name-validator": "^5.0.0"
7682
- },
7683
- "engines": {
7684
- "node": ">=18"
7685
- },
7686
- "peerDependencies": {
7687
- "canvas": "^2.11.2"
7688
- },
7689
- "peerDependenciesMeta": {
7690
- "canvas": {
7691
- "optional": true
7692
- }
7693
- }
7694
- },
7695
- "node_modules/jsdom/node_modules/agent-base": {
7696
- "version": "7.1.1",
7697
- "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.1.tgz",
7698
- "integrity": "sha512-H0TSyFNDMomMNJQBn8wFV5YC/2eJ+VXECwOadZJT554xP6cODZHPX3H9QMQECxvrgiSOP1pHjy1sMWQVYJOUOA==",
7699
- "dependencies": {
7700
- "debug": "^4.3.4"
7701
- },
7702
- "engines": {
7703
- "node": ">= 14"
7704
- }
7705
- },
7706
- "node_modules/jsdom/node_modules/https-proxy-agent": {
7707
- "version": "7.0.4",
7708
- "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.4.tgz",
7709
- "integrity": "sha512-wlwpilI7YdjSkWaQ/7omYBMTliDcmCN8OLihO6I9B86g06lMyAoqgoDpV0XqoaPOKj+0DIdAvnsWfyAAhmimcg==",
7710
- "dependencies": {
7711
- "agent-base": "^7.0.2",
7712
- "debug": "4"
7713
- },
7714
- "engines": {
7715
- "node": ">= 14"
7716
- }
7717
- },
7718
- "node_modules/jsdom/node_modules/tr46": {
7719
- "version": "5.0.0",
7720
- "resolved": "https://registry.npmjs.org/tr46/-/tr46-5.0.0.tgz",
7721
- "integrity": "sha512-tk2G5R2KRwBd+ZN0zaEXpmzdKyOYksXwywulIX95MBODjSzMIuQnQ3m8JxgbhnL1LeVo7lqQKsYa1O3Htl7K5g==",
7722
- "dependencies": {
7723
- "punycode": "^2.3.1"
7724
- },
7725
- "engines": {
7726
- "node": ">=18"
7727
- }
7728
- },
7729
- "node_modules/jsdom/node_modules/whatwg-url": {
7730
- "version": "14.0.0",
7731
- "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.0.0.tgz",
7732
- "integrity": "sha512-1lfMEm2IEr7RIV+f4lUNPOqfFL+pO+Xw3fJSqmjX9AbXcXcYOkCe1P6+9VBZB6n94af16NfZf+sSk0JCBZC9aw==",
7733
- "dependencies": {
7734
- "tr46": "^5.0.0",
7735
- "webidl-conversions": "^7.0.0"
7736
- },
7737
- "engines": {
7738
- "node": ">=18"
7739
- }
7740
- },
7741
  "node_modules/jsesc": {
7742
  "version": "2.5.2",
7743
  "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-2.5.2.tgz",
@@ -8156,6 +8073,23 @@
8156
  "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz",
8157
  "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg=="
8158
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8159
  "node_modules/locate-path": {
8160
  "version": "6.0.0",
8161
  "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz",
@@ -8335,7 +8269,6 @@
8335
  "version": "1.5.0",
8336
  "resolved": "https://registry.npmjs.org/memory-pager/-/memory-pager-1.5.0.tgz",
8337
  "integrity": "sha512-ZS4Bp4r/Zoeq6+NLJpP+0Zzm0pR8whtGPf1XExKLJBAczGMnSi3It14OiNCStjQjM6NU1okjQGSxgEZN8eBYKg==",
8338
- "optional": true,
8339
  "peer": true
8340
  },
8341
  "node_modules/merge-deep": {
@@ -8564,27 +8497,26 @@
8564
  }
8565
  },
8566
  "node_modules/mongodb": {
8567
- "version": "5.9.2",
8568
- "resolved": "https://registry.npmjs.org/mongodb/-/mongodb-5.9.2.tgz",
8569
- "integrity": "sha512-H60HecKO4Bc+7dhOv4sJlgvenK4fQNqqUIlXxZYQNbfEWSALGAwGoyJd/0Qwk4TttFXUOHJ2ZJQe/52ScaUwtQ==",
8570
  "peer": true,
8571
  "dependencies": {
8572
- "bson": "^5.5.0",
8573
- "mongodb-connection-string-url": "^2.6.0",
8574
- "socks": "^2.7.1"
8575
  },
8576
  "engines": {
8577
- "node": ">=14.20.1"
8578
- },
8579
- "optionalDependencies": {
8580
- "@mongodb-js/saslprep": "^1.1.0"
8581
  },
8582
  "peerDependencies": {
8583
  "@aws-sdk/credential-providers": "^3.188.0",
8584
- "@mongodb-js/zstd": "^1.0.0",
8585
- "kerberos": "^1.0.0 || ^2.0.0",
8586
- "mongodb-client-encryption": ">=2.3.0 <3",
8587
- "snappy": "^7.2.2"
 
 
8588
  },
8589
  "peerDependenciesMeta": {
8590
  "@aws-sdk/credential-providers": {
@@ -8593,6 +8525,9 @@
8593
  "@mongodb-js/zstd": {
8594
  "optional": true
8595
  },
 
 
 
8596
  "kerberos": {
8597
  "optional": true
8598
  },
@@ -8601,23 +8536,26 @@
8601
  },
8602
  "snappy": {
8603
  "optional": true
 
 
 
8604
  }
8605
  }
8606
  },
8607
  "node_modules/mongodb-connection-string-url": {
8608
- "version": "2.6.0",
8609
- "resolved": "https://registry.npmjs.org/mongodb-connection-string-url/-/mongodb-connection-string-url-2.6.0.tgz",
8610
- "integrity": "sha512-WvTZlI9ab0QYtTYnuMLgobULWhokRjtC7db9LtcVfJ+Hsnyr5eo6ZtNAt3Ly24XZScGMelOcGtm7lSn0332tPQ==",
8611
  "peer": true,
8612
  "dependencies": {
8613
- "@types/whatwg-url": "^8.2.1",
8614
- "whatwg-url": "^11.0.0"
8615
  }
8616
  },
8617
  "node_modules/ms": {
8618
- "version": "2.1.2",
8619
- "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz",
8620
- "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w=="
8621
  },
8622
  "node_modules/napi-build-utils": {
8623
  "version": "1.0.2",
@@ -8836,10 +8774,16 @@
8836
  "set-blocking": "^2.0.0"
8837
  }
8838
  },
8839
- "node_modules/nwsapi": {
8840
- "version": "2.2.10",
8841
- "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.10.tgz",
8842
- "integrity": "sha512-QK0sRs7MKv0tKe1+5uZIQk/C8XGza4DAnztJG8iD+TpJIORARrCxczA738awHrZoHeTjSSoHqao2teO0dC/gFQ=="
 
 
 
 
 
 
8843
  },
8844
  "node_modules/object-assign": {
8845
  "version": "4.1.1",
@@ -9009,6 +8953,14 @@
9009
  "openai": "bin/cli"
9010
  }
9011
  },
 
 
 
 
 
 
 
 
9012
  "node_modules/optionator": {
9013
  "version": "0.9.3",
9014
  "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.3.tgz",
@@ -9155,17 +9107,6 @@
9155
  "url": "https://github.com/sponsors/sindresorhus"
9156
  }
9157
  },
9158
- "node_modules/parse5": {
9159
- "version": "7.1.2",
9160
- "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz",
9161
- "integrity": "sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==",
9162
- "dependencies": {
9163
- "entities": "^4.4.0"
9164
- },
9165
- "funding": {
9166
- "url": "https://github.com/inikulin/parse5?sponsor=1"
9167
- }
9168
- },
9169
  "node_modules/parseurl": {
9170
  "version": "1.3.3",
9171
  "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
@@ -9691,33 +9632,36 @@
9691
  }
9692
  },
9693
  "node_modules/puppeteer": {
9694
- "version": "22.7.1",
9695
- "resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-22.7.1.tgz",
9696
- "integrity": "sha512-JBCBCwQ9+dyPp5haqeecgv0N0vgWFx44woUeKJaPeJT8CU3RXrd8F/tqJQbuAmcWlbMhYJSlTJkIFrwVAs6BNA==",
9697
  "hasInstallScript": true,
9698
  "dependencies": {
9699
- "@puppeteer/browsers": "2.2.3",
9700
- "cosmiconfig": "9.0.0",
9701
- "devtools-protocol": "0.0.1273771",
9702
- "puppeteer-core": "22.7.1"
 
 
9703
  },
9704
  "bin": {
9705
- "puppeteer": "lib/esm/puppeteer/node/cli.js"
9706
  },
9707
  "engines": {
9708
  "node": ">=18"
9709
  }
9710
  },
9711
  "node_modules/puppeteer-core": {
9712
- "version": "22.7.1",
9713
- "resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-22.7.1.tgz",
9714
- "integrity": "sha512-jD7T7yN7PWGuJmNT0TAEboA26s0VVnvbgCxqgQIF+eNQW2u71ENaV2JwzSJiCHO+e72H4Ue6AgKD9USQ8xAcOQ==",
9715
  "dependencies": {
9716
- "@puppeteer/browsers": "2.2.3",
9717
- "chromium-bidi": "0.5.19",
9718
- "debug": "4.3.4",
9719
- "devtools-protocol": "0.0.1273771",
9720
- "ws": "8.16.0"
 
9721
  },
9722
  "engines": {
9723
  "node": ">=18"
@@ -10378,11 +10322,6 @@
10378
  "url": "https://github.com/sponsors/isaacs"
10379
  }
10380
  },
10381
- "node_modules/rrweb-cssom": {
10382
- "version": "0.6.0",
10383
- "resolved": "https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.6.0.tgz",
10384
- "integrity": "sha512-APM0Gt1KoXBz0iIkkdB/kfvGOwC4UuJFeG/c+yV7wSc7q96cG/kJ0HiYCnzivD9SB53cLV1MlHFNfOuPaadYSw=="
10385
- },
10386
  "node_modules/run-parallel": {
10387
  "version": "1.2.0",
10388
  "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz",
@@ -10479,24 +10418,10 @@
10479
  "resolved": "https://registry.npmjs.org/sax/-/sax-1.3.0.tgz",
10480
  "integrity": "sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA=="
10481
  },
10482
- "node_modules/saxes": {
10483
- "version": "6.0.0",
10484
- "resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz",
10485
- "integrity": "sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==",
10486
- "dependencies": {
10487
- "xmlchars": "^2.2.0"
10488
- },
10489
- "engines": {
10490
- "node": ">=v12.22.7"
10491
- }
10492
- },
10493
  "node_modules/semver": {
10494
- "version": "7.6.0",
10495
- "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.0.tgz",
10496
- "integrity": "sha512-EnwXhrlwXMk9gKu5/flx5sv/an57AkRplG3hTK68W7FRDN+k+OWBj65M7719OkA82XLBxrcX0KSHj+X5COhOVg==",
10497
- "dependencies": {
10498
- "lru-cache": "^6.0.0"
10499
- },
10500
  "bin": {
10501
  "semver": "bin/semver.js"
10502
  },
@@ -10504,22 +10429,6 @@
10504
  "node": ">=10"
10505
  }
10506
  },
10507
- "node_modules/semver/node_modules/lru-cache": {
10508
- "version": "6.0.0",
10509
- "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz",
10510
- "integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==",
10511
- "dependencies": {
10512
- "yallist": "^4.0.0"
10513
- },
10514
- "engines": {
10515
- "node": ">=10"
10516
- }
10517
- },
10518
- "node_modules/semver/node_modules/yallist": {
10519
- "version": "4.0.0",
10520
- "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz",
10521
- "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A=="
10522
- },
10523
  "node_modules/send": {
10524
  "version": "0.18.0",
10525
  "resolved": "https://registry.npmjs.org/send/-/send-0.18.0.tgz",
@@ -10567,11 +10476,6 @@
10567
  "node": ">=4"
10568
  }
10569
  },
10570
- "node_modules/send/node_modules/ms": {
10571
- "version": "2.1.3",
10572
- "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
10573
- "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="
10574
- },
10575
  "node_modules/serve-static": {
10576
  "version": "1.15.0",
10577
  "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.15.0.tgz",
@@ -10876,7 +10780,6 @@
10876
  "version": "3.0.3",
10877
  "resolved": "https://registry.npmjs.org/sparse-bitfield/-/sparse-bitfield-3.0.3.tgz",
10878
  "integrity": "sha512-kvzhi7vqKTfkh0PZU+2D2PIllw2ymqJKujUcyPMd9Y75Nv4nPbGJZXNhxsgdQab2BmlDct1YnfQCguEvHr7VsQ==",
10879
- "optional": true,
10880
  "peer": true,
10881
  "dependencies": {
10882
  "memory-pager": "^1.0.2"
@@ -10958,12 +10861,13 @@
10958
  }
10959
  },
10960
  "node_modules/streamx": {
10961
- "version": "2.16.1",
10962
- "resolved": "https://registry.npmjs.org/streamx/-/streamx-2.16.1.tgz",
10963
- "integrity": "sha512-m9QYj6WygWyWa3H1YY69amr4nVgy61xfjys7xO7kviL5rfIEc2naf+ewFiOA+aEJD7y0JO3h2GoiUv4TDwEGzQ==",
10964
  "dependencies": {
10965
- "fast-fifo": "^1.1.0",
10966
- "queue-tick": "^1.0.1"
 
10967
  },
10968
  "optionalDependencies": {
10969
  "bare-events": "^2.2.0"
@@ -11150,11 +11054,6 @@
11150
  "url": "https://github.com/sponsors/ljharb"
11151
  }
11152
  },
11153
- "node_modules/symbol-tree": {
11154
- "version": "3.2.4",
11155
- "resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz",
11156
- "integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw=="
11157
- },
11158
  "node_modules/tar": {
11159
  "version": "6.2.1",
11160
  "resolved": "https://registry.npmjs.org/tar/-/tar-6.2.1.tgz",
@@ -11172,9 +11071,9 @@
11172
  }
11173
  },
11174
  "node_modules/tar-fs": {
11175
- "version": "3.0.5",
11176
- "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.0.5.tgz",
11177
- "integrity": "sha512-JOgGAmZyMgbqpLwct7ZV8VzkEB6pxXFBVErLtb+XCOqzc6w1xiWKI9GVd6bwk68EX7eJ4DWmfXVmq8K2ziZTGg==",
11178
  "dependencies": {
11179
  "pump": "^3.0.0",
11180
  "tar-stream": "^3.1.5"
@@ -11263,6 +11162,14 @@
11263
  "url": "https://github.com/sponsors/isaacs"
11264
  }
11265
  },
 
 
 
 
 
 
 
 
11266
  "node_modules/text-table": {
11267
  "version": "0.2.0",
11268
  "resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz",
@@ -11369,15 +11276,15 @@
11369
  }
11370
  },
11371
  "node_modules/tr46": {
11372
- "version": "3.0.0",
11373
- "resolved": "https://registry.npmjs.org/tr46/-/tr46-3.0.0.tgz",
11374
- "integrity": "sha512-l7FvfAHlcmulp8kr+flpQZmVwtu7nfRV7NZujtN0OqES8EL4O4e0qqzL0DC5gAvx/ZC/9lk6rhcUwYvkBnBnYA==",
11375
  "peer": true,
11376
  "dependencies": {
11377
- "punycode": "^2.1.1"
11378
  },
11379
  "engines": {
11380
- "node": ">=12"
11381
  }
11382
  },
11383
  "node_modules/ts-deepmerge": {
@@ -11613,10 +11520,15 @@
11613
  "url": "https://github.com/sponsors/ljharb"
11614
  }
11615
  },
 
 
 
 
 
11616
  "node_modules/typescript": {
11617
- "version": "5.4.5",
11618
- "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.4.5.tgz",
11619
- "integrity": "sha512-vcI4UpRgg81oIRUFwR0WSIHKt11nJ7SAVlYNIu+QpqeyXP+gpQJy/Z4+F0aGxSE4MqwjyXvW/TzgkLAx2AGHwQ==",
11620
  "devOptional": true,
11621
  "bin": {
11622
  "tsc": "bin/tsc",
@@ -11626,6 +11538,11 @@
11626
  "node": ">=14.17"
11627
  }
11628
  },
 
 
 
 
 
11629
  "node_modules/unbox-primitive": {
11630
  "version": "1.0.2",
11631
  "resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.0.2.tgz",
@@ -11802,17 +11719,6 @@
11802
  "node": ">= 0.8"
11803
  }
11804
  },
11805
- "node_modules/w3c-xmlserializer": {
11806
- "version": "5.0.0",
11807
- "resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-5.0.0.tgz",
11808
- "integrity": "sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA==",
11809
- "dependencies": {
11810
- "xml-name-validator": "^5.0.0"
11811
- },
11812
- "engines": {
11813
- "node": ">=18"
11814
- }
11815
- },
11816
  "node_modules/walker": {
11817
  "version": "1.0.8",
11818
  "resolved": "https://registry.npmjs.org/walker/-/walker-1.0.8.tgz",
@@ -11846,6 +11752,7 @@
11846
  "version": "7.0.0",
11847
  "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz",
11848
  "integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==",
 
11849
  "engines": {
11850
  "node": ">=12"
11851
  }
@@ -11871,36 +11778,17 @@
11871
  "node": ">=0.8.0"
11872
  }
11873
  },
11874
- "node_modules/whatwg-encoding": {
11875
- "version": "3.1.1",
11876
- "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz",
11877
- "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==",
11878
- "dependencies": {
11879
- "iconv-lite": "0.6.3"
11880
- },
11881
- "engines": {
11882
- "node": ">=18"
11883
- }
11884
- },
11885
- "node_modules/whatwg-mimetype": {
11886
- "version": "4.0.0",
11887
- "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz",
11888
- "integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==",
11889
- "engines": {
11890
- "node": ">=18"
11891
- }
11892
- },
11893
  "node_modules/whatwg-url": {
11894
- "version": "11.0.0",
11895
- "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-11.0.0.tgz",
11896
- "integrity": "sha512-RKT8HExMpoYx4igMiVMY83lN6UeITKJlBQ+vR/8ZJ8OCdSiN3RwCq+9gH0+Xzj0+5IrM6i4j/6LuvzbZIQgEcQ==",
11897
  "peer": true,
11898
  "dependencies": {
11899
- "tr46": "^3.0.0",
11900
  "webidl-conversions": "^7.0.0"
11901
  },
11902
  "engines": {
11903
- "node": ">=12"
11904
  }
11905
  },
11906
  "node_modules/which": {
@@ -11996,9 +11884,9 @@
11996
  }
11997
  },
11998
  "node_modules/ws": {
11999
- "version": "8.16.0",
12000
- "resolved": "https://registry.npmjs.org/ws/-/ws-8.16.0.tgz",
12001
- "integrity": "sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==",
12002
  "engines": {
12003
  "node": ">=10.0.0"
12004
  },
@@ -12020,14 +11908,6 @@
12020
  "resolved": "https://registry.npmjs.org/xml/-/xml-1.0.1.tgz",
12021
  "integrity": "sha512-huCv9IH9Tcf95zuYCsQraZtWnJvBtLVE0QHMOs8bWyZAFZNDcYjsPq1nEx8jKA9y+Beo9v+7OBPRisQTjinQMw=="
12022
  },
12023
- "node_modules/xml-name-validator": {
12024
- "version": "5.0.0",
12025
- "resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-5.0.0.tgz",
12026
- "integrity": "sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==",
12027
- "engines": {
12028
- "node": ">=18"
12029
- }
12030
- },
12031
  "node_modules/xml2js": {
12032
  "version": "0.5.0",
12033
  "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz",
@@ -12048,11 +11928,6 @@
12048
  "node": ">=4.0"
12049
  }
12050
  },
12051
- "node_modules/xmlchars": {
12052
- "version": "2.2.0",
12053
- "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz",
12054
- "integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw=="
12055
- },
12056
  "node_modules/y18n": {
12057
  "version": "5.0.8",
12058
  "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
@@ -12137,9 +12012,9 @@
12137
  }
12138
  },
12139
  "node_modules/zod": {
12140
- "version": "3.22.4",
12141
- "resolved": "https://registry.npmjs.org/zod/-/zod-3.22.4.tgz",
12142
- "integrity": "sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==",
12143
  "funding": {
12144
  "url": "https://github.com/sponsors/colinhacks"
12145
  }
 
14
  "archiver": "^6.0.1",
15
  "axios": "^1.3.3",
16
  "bcrypt": "^5.1.0",
17
+ "civkit": "^0.7.0-0f8889a",
18
  "core-js": "^3.37.1",
19
  "cors": "^2.8.5",
20
  "dayjs": "^1.11.9",
 
23
  "firebase-functions": "^4.9.0",
24
  "htmlparser2": "^9.0.0",
25
  "jose": "^5.1.0",
 
26
  "langdetect": "^0.2.1",
27
+ "linkedom": "^0.18.4",
28
  "maxmind": "^4.3.18",
29
  "minio": "^7.1.3",
30
  "openai": "^4.20.0",
31
  "pdfjs-dist": "^4.2.67",
32
+ "puppeteer": "^23.3.0",
33
  "puppeteer-extra": "^3.3.6",
34
  "puppeteer-extra-plugin-block-resources": "^2.4.3",
35
  "puppeteer-extra-plugin-page-proxy": "^2.0.0",
 
48
  "@types/bcrypt": "^5.0.0",
49
  "@types/cors": "^2.8.17",
50
  "@types/generic-pool": "^3.8.1",
51
+ "@types/node": "^20.14.13",
52
  "@types/set-cookie-parser": "^2.4.7",
53
  "@typescript-eslint/eslint-plugin": "^5.12.0",
54
  "@typescript-eslint/parser": "^5.12.0",
 
57
  "eslint-plugin-import": "^2.25.4",
58
  "firebase-functions-test": "^3.0.0",
59
  "replicate": "^0.16.1",
60
+ "typescript": "^5.5.4"
61
  },
62
  "engines": {
63
  "node": "20"
 
1564
  }
1565
  },
1566
  "node_modules/@mongodb-js/saslprep": {
1567
+ "version": "1.1.9",
1568
+ "resolved": "https://registry.npmjs.org/@mongodb-js/saslprep/-/saslprep-1.1.9.tgz",
1569
+ "integrity": "sha512-tVkljjeEaAhCqTzajSdgbQ6gE6f3oneVwa3iXR6csiEwXXOFsiC6Uh9iAjAhXPtqa/XMDHWjjeNH/77m/Yq2dw==",
 
1570
  "peer": true,
1571
  "dependencies": {
1572
  "sparse-bitfield": "^3.0.3"
 
1976
  "integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw=="
1977
  },
1978
  "node_modules/@puppeteer/browsers": {
1979
+ "version": "2.4.0",
1980
+ "resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.4.0.tgz",
1981
+ "integrity": "sha512-x8J1csfIygOwf6D6qUAZ0ASk3z63zPb7wkNeHRerCMh82qWKUrOgkuP005AJC8lDL6/evtXETGEJVcwykKT4/g==",
1982
+ "dependencies": {
1983
+ "debug": "^4.3.6",
1984
+ "extract-zip": "^2.0.1",
1985
+ "progress": "^2.0.3",
1986
+ "proxy-agent": "^6.4.0",
1987
+ "semver": "^7.6.3",
1988
+ "tar-fs": "^3.0.6",
1989
+ "unbzip2-stream": "^1.4.3",
1990
+ "yargs": "^17.7.2"
1991
  },
1992
  "bin": {
1993
  "browsers": "lib/cjs/main-cli.js"
 
2298
  "integrity": "sha512-nG96G3Wp6acyAgJqGasjODb+acrI7KltPiRxzHPXnP3NgI28bpQDRv53olbqGXbfcgF5aiiHmO3xpwEpS5Ld9g=="
2299
  },
2300
  "node_modules/@types/node": {
2301
+ "version": "20.14.13",
2302
+ "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.13.tgz",
2303
+ "integrity": "sha512-+bHoGiZb8UiQ0+WEtmph2IWQCjIqg8MDZMAV+ppRRhUZnquF5mQkP/9vpSwJClEiSM/C7fZZExPzfU0vJTyp8w==",
2304
  "dependencies": {
2305
  "undici-types": "~5.26.4"
2306
  }
 
2423
  "peer": true
2424
  },
2425
  "node_modules/@types/whatwg-url": {
2426
+ "version": "11.0.5",
2427
+ "resolved": "https://registry.npmjs.org/@types/whatwg-url/-/whatwg-url-11.0.5.tgz",
2428
+ "integrity": "sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==",
2429
  "peer": true,
2430
  "dependencies": {
 
2431
  "@types/webidl-conversions": "*"
2432
  }
2433
  },
 
3225
  "optional": true
3226
  },
3227
  "node_modules/bare-fs": {
3228
+ "version": "2.3.3",
3229
+ "resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-2.3.3.tgz",
3230
+ "integrity": "sha512-7RYKL+vZVCyAsMLi5SPu7QGauGGT8avnP/HO571ndEuV4MYdGXvLhtW67FuLPeEI8EiIY7zbbRR9x7x7HU0kgw==",
3231
  "optional": true,
3232
  "dependencies": {
3233
  "bare-events": "^2.0.0",
3234
  "bare-path": "^2.0.0",
3235
+ "bare-stream": "^2.0.0"
3236
  }
3237
  },
3238
  "node_modules/bare-os": {
3239
+ "version": "2.4.2",
3240
+ "resolved": "https://registry.npmjs.org/bare-os/-/bare-os-2.4.2.tgz",
3241
+ "integrity": "sha512-HZoJwzC+rZ9lqEemTMiO0luOePoGYNBgsLLgegKR/cljiJvcDNhDZQkzC+NC5Oh0aHbdBNSOHpghwMuB5tqhjg==",
3242
  "optional": true
3243
  },
3244
  "node_modules/bare-path": {
3245
+ "version": "2.1.3",
3246
+ "resolved": "https://registry.npmjs.org/bare-path/-/bare-path-2.1.3.tgz",
3247
+ "integrity": "sha512-lh/eITfU8hrj9Ru5quUp0Io1kJWIk1bTjzo7JH1P5dWmQ2EL4hFUlfI8FonAhSlgIfhn63p84CDY/x+PisgcXA==",
3248
  "optional": true,
3249
  "dependencies": {
3250
  "bare-os": "^2.1.0"
3251
  }
3252
  },
3253
+ "node_modules/bare-stream": {
3254
+ "version": "2.2.1",
3255
+ "resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.2.1.tgz",
3256
+ "integrity": "sha512-YTB47kHwBW9zSG8LD77MIBAAQXjU2WjAkMHeeb7hUplVs6+IoM5I7uEVQNPMB7lj9r8I76UMdoMkGnCodHOLqg==",
3257
+ "optional": true,
3258
+ "dependencies": {
3259
+ "b4a": "^1.6.6",
3260
+ "streamx": "^2.18.0"
3261
+ }
3262
+ },
3263
  "node_modules/base32.js": {
3264
  "version": "0.1.0",
3265
  "resolved": "https://registry.npmjs.org/base32.js/-/base32.js-0.1.0.tgz",
 
3382
  "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
3383
  "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A=="
3384
  },
3385
+ "node_modules/boolbase": {
3386
+ "version": "1.0.0",
3387
+ "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
3388
+ "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="
3389
+ },
3390
  "node_modules/brace-expansion": {
3391
  "version": "1.1.11",
3392
  "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
 
3457
  }
3458
  },
3459
  "node_modules/bson": {
3460
+ "version": "6.8.0",
3461
+ "resolved": "https://registry.npmjs.org/bson/-/bson-6.8.0.tgz",
3462
+ "integrity": "sha512-iOJg8pr7wq2tg/zSlCCHMi3hMm5JTOxLTagf3zxhcenHsFp+c6uOs6K7W5UE7A4QIJGtqh/ZovFNMP4mOPJynQ==",
3463
  "peer": true,
3464
  "engines": {
3465
+ "node": ">=16.20.1"
3466
  }
3467
  },
3468
  "node_modules/buffer": {
 
3672
  }
3673
  },
3674
  "node_modules/chromium-bidi": {
3675
+ "version": "0.6.5",
3676
+ "resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-0.6.5.tgz",
3677
+ "integrity": "sha512-RuLrmzYrxSb0s9SgpB+QN5jJucPduZQ/9SIe76MDxYJuecPW5mxMdacJ1f4EtgiV+R0p3sCkznTMvH0MPGFqjA==",
3678
  "dependencies": {
3679
  "mitt": "3.0.1",
3680
  "urlpattern-polyfill": "10.0.0",
3681
+ "zod": "3.23.8"
3682
  },
3683
  "peerDependencies": {
3684
  "devtools-protocol": "*"
 
3701
  }
3702
  },
3703
  "node_modules/civkit": {
3704
+ "version": "0.7.0-0f8889a",
3705
+ "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.7.0-0f8889a.tgz",
3706
+ "integrity": "sha512-T14Jk3loghFqluWUUQmuEn0hO2pwWw+tsfdG4++NPqvS2W/lclZoA1EyBIZ8Uk0MYqEp02O6BwwbAoq+g++hMw==",
3707
  "dependencies": {
3708
  "lodash": "^4.17.21",
3709
  "tslib": "^2.5.0"
 
3732
  "pino": "^8.11.0",
3733
  "reflect-metadata": "^0.1.13",
3734
  "smtp-server": "^3.11.0",
3735
+ "tld-extract": "^2.1.0",
3736
+ "zod": "*",
3737
+ "zod-openai": "*"
3738
  },
3739
  "peerDependencies": {
3740
+ "mongodb": "^6",
3741
+ "tsyringe": "^4"
3742
  }
3743
  },
3744
  "node_modules/cjs-module-lexer": {
 
4064
  "node": ">= 8"
4065
  }
4066
  },
4067
+ "node_modules/css-select": {
4068
+ "version": "5.1.0",
4069
+ "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz",
4070
+ "integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==",
4071
  "dependencies": {
4072
+ "boolbase": "^1.0.0",
4073
+ "css-what": "^6.1.0",
4074
+ "domhandler": "^5.0.2",
4075
+ "domutils": "^3.0.1",
4076
+ "nth-check": "^2.0.1"
4077
  },
4078
+ "funding": {
4079
+ "url": "https://github.com/sponsors/fb55"
4080
+ }
4081
+ },
4082
+ "node_modules/css-what": {
4083
+ "version": "6.1.0",
4084
+ "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.1.0.tgz",
4085
+ "integrity": "sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==",
4086
  "engines": {
4087
+ "node": ">= 6"
4088
+ },
4089
+ "funding": {
4090
+ "url": "https://github.com/sponsors/fb55"
4091
  }
4092
  },
4093
+ "node_modules/cssom": {
4094
+ "version": "0.5.0",
4095
+ "resolved": "https://registry.npmjs.org/cssom/-/cssom-0.5.0.tgz",
4096
+ "integrity": "sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw=="
4097
+ },
4098
  "node_modules/data-uri-to-buffer": {
4099
  "version": "6.0.2",
4100
  "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
 
4103
  "node": ">= 14"
4104
  }
4105
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4106
  "node_modules/data-view-buffer": {
4107
  "version": "1.0.1",
4108
  "resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.1.tgz",
 
4160
  "integrity": "sha512-vjAczensTgRcqDERK0SR2XMwsF/tSvnvlv6VcF2GIhg6Sx4yOIt/irsr1RDJsKiIyBzJDpCoXiWWq28MqH2cnQ=="
4161
  },
4162
  "node_modules/debug": {
4163
+ "version": "4.3.7",
4164
+ "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.7.tgz",
4165
+ "integrity": "sha512-Er2nc/H7RrMXZBFCEim6TCmMk02Z8vLC2Rbi1KEBggpo0fS6l0S1nnapwmIi3yW/+GOJap1Krg4w0Hg80oCqgQ==",
4166
  "dependencies": {
4167
+ "ms": "^2.1.3"
4168
  },
4169
  "engines": {
4170
  "node": ">=6.0"
 
4175
  }
4176
  }
4177
  },
 
 
 
 
 
4178
  "node_modules/decode-uri-component": {
4179
  "version": "0.2.2",
4180
  "resolved": "https://registry.npmjs.org/decode-uri-component/-/decode-uri-component-0.2.2.tgz",
 
4354
  }
4355
  },
4356
  "node_modules/devtools-protocol": {
4357
+ "version": "0.0.1330662",
4358
+ "resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1330662.tgz",
4359
+ "integrity": "sha512-pzh6YQ8zZfz3iKlCvgzVCu22NdpZ8hNmwU6WnQjNVquh0A9iVosPtNLWDwaWVGyrntQlltPFztTMK5Cg6lfCuw=="
4360
  },
4361
  "node_modules/diff-sequences": {
4362
  "version": "29.6.3",
 
5456
  "@google-cloud/storage": "^7.7.0"
5457
  }
5458
  },
 
 
 
 
 
 
 
 
5459
  "node_modules/firebase-functions": {
5460
  "version": "4.9.0",
5461
  "resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-4.9.0.tgz",
 
5780
  }
5781
  },
5782
  "node_modules/gcp-metadata": {
5783
+ "version": "5.3.0",
5784
+ "resolved": "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-5.3.0.tgz",
5785
+ "integrity": "sha512-FNTkdNEnBdlqF2oatizolQqNANMrcqJt6AAYt99B3y1aLLC8Hc5IOBb+ZnnzllodEEf6xMBp6wRcBbc16fa65w==",
5786
+ "optional": true,
5787
+ "peer": true,
5788
  "dependencies": {
5789
+ "gaxios": "^5.0.0",
5790
  "json-bigint": "^1.0.0"
5791
  },
5792
  "engines": {
5793
+ "node": ">=12"
5794
+ }
5795
+ },
5796
+ "node_modules/gcp-metadata/node_modules/gaxios": {
5797
+ "version": "5.1.3",
5798
+ "resolved": "https://registry.npmjs.org/gaxios/-/gaxios-5.1.3.tgz",
5799
+ "integrity": "sha512-95hVgBRgEIRQQQHIbnxBXeHbW4TqFk4ZDJW7wmVtvYar72FdhRIo1UGOLS2eRAKCPEdPBWu+M7+A33D9CdX9rA==",
5800
+ "optional": true,
5801
+ "peer": true,
5802
+ "dependencies": {
5803
+ "extend": "^3.0.2",
5804
+ "https-proxy-agent": "^5.0.0",
5805
+ "is-stream": "^2.0.0",
5806
+ "node-fetch": "^2.6.9"
5807
+ },
5808
+ "engines": {
5809
+ "node": ">=12"
5810
  }
5811
  },
5812
  "node_modules/generic-pool": {
 
6028
  "node": ">=14"
6029
  }
6030
  },
6031
+ "node_modules/google-auth-library/node_modules/gcp-metadata": {
6032
+ "version": "6.1.0",
6033
+ "resolved": "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-6.1.0.tgz",
6034
+ "integrity": "sha512-Jh/AIwwgaxan+7ZUUmRLCjtchyDiqh4KjBJ5tW3plBZb5iL/BPcso8A5DlzeD9qlw0duCamnNdpFjxwaT0KyKg==",
6035
+ "dependencies": {
6036
+ "gaxios": "^6.0.0",
6037
+ "json-bigint": "^1.0.0"
6038
+ },
6039
+ "engines": {
6040
+ "node": ">=14"
6041
+ }
6042
+ },
6043
  "node_modules/google-gax": {
6044
  "version": "4.3.2",
6045
  "resolved": "https://registry.npmjs.org/google-gax/-/google-gax-4.3.2.tgz",
 
6201
  "node": ">= 0.4"
6202
  }
6203
  },
 
 
 
 
 
 
 
 
 
 
 
6204
  "node_modules/html-escaper": {
6205
  "version": "2.0.2",
6206
  "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz",
 
6389
  "version": "0.6.3",
6390
  "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
6391
  "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
6392
+ "optional": true,
6393
  "dependencies": {
6394
  "safer-buffer": ">= 2.1.2 < 3.0.0"
6395
  },
 
6787
  "node": ">=0.10.0"
6788
  }
6789
  },
 
 
 
 
 
6790
  "node_modules/is-regex": {
6791
  "version": "1.1.4",
6792
  "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz",
 
7655
  "node": ">=0.1.90"
7656
  }
7657
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7658
  "node_modules/jsesc": {
7659
  "version": "2.5.2",
7660
  "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-2.5.2.tgz",
 
8073
  "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz",
8074
  "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg=="
8075
  },
8076
+ "node_modules/linkedom": {
8077
+ "version": "0.18.4",
8078
+ "resolved": "https://registry.npmjs.org/linkedom/-/linkedom-0.18.4.tgz",
8079
+ "integrity": "sha512-JhLErxMIEOKByMi3fURXgI1fYOzR87L1Cn0+MI9GlMckFrqFZpV1SUGox1jcKtsKN3y6JgclcQf0FzZT//BuGw==",
8080
+ "dependencies": {
8081
+ "css-select": "^5.1.0",
8082
+ "cssom": "^0.5.0",
8083
+ "html-escaper": "^3.0.3",
8084
+ "htmlparser2": "^9.1.0",
8085
+ "uhyphen": "^0.2.0"
8086
+ }
8087
+ },
8088
+ "node_modules/linkedom/node_modules/html-escaper": {
8089
+ "version": "3.0.3",
8090
+ "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-3.0.3.tgz",
8091
+ "integrity": "sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ=="
8092
+ },
8093
  "node_modules/locate-path": {
8094
  "version": "6.0.0",
8095
  "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz",
 
8269
  "version": "1.5.0",
8270
  "resolved": "https://registry.npmjs.org/memory-pager/-/memory-pager-1.5.0.tgz",
8271
  "integrity": "sha512-ZS4Bp4r/Zoeq6+NLJpP+0Zzm0pR8whtGPf1XExKLJBAczGMnSi3It14OiNCStjQjM6NU1okjQGSxgEZN8eBYKg==",
 
8272
  "peer": true
8273
  },
8274
  "node_modules/merge-deep": {
 
8497
  }
8498
  },
8499
  "node_modules/mongodb": {
8500
+ "version": "6.8.0",
8501
+ "resolved": "https://registry.npmjs.org/mongodb/-/mongodb-6.8.0.tgz",
8502
+ "integrity": "sha512-HGQ9NWDle5WvwMnrvUxsFYPd3JEbqD3RgABHBQRuoCEND0qzhsd0iH5ypHsf1eJ+sXmvmyKpP+FLOKY8Il7jMw==",
8503
  "peer": true,
8504
  "dependencies": {
8505
+ "@mongodb-js/saslprep": "^1.1.5",
8506
+ "bson": "^6.7.0",
8507
+ "mongodb-connection-string-url": "^3.0.0"
8508
  },
8509
  "engines": {
8510
+ "node": ">=16.20.1"
 
 
 
8511
  },
8512
  "peerDependencies": {
8513
  "@aws-sdk/credential-providers": "^3.188.0",
8514
+ "@mongodb-js/zstd": "^1.1.0",
8515
+ "gcp-metadata": "^5.2.0",
8516
+ "kerberos": "^2.0.1",
8517
+ "mongodb-client-encryption": ">=6.0.0 <7",
8518
+ "snappy": "^7.2.2",
8519
+ "socks": "^2.7.1"
8520
  },
8521
  "peerDependenciesMeta": {
8522
  "@aws-sdk/credential-providers": {
 
8525
  "@mongodb-js/zstd": {
8526
  "optional": true
8527
  },
8528
+ "gcp-metadata": {
8529
+ "optional": true
8530
+ },
8531
  "kerberos": {
8532
  "optional": true
8533
  },
 
8536
  },
8537
  "snappy": {
8538
  "optional": true
8539
+ },
8540
+ "socks": {
8541
+ "optional": true
8542
  }
8543
  }
8544
  },
8545
  "node_modules/mongodb-connection-string-url": {
8546
+ "version": "3.0.1",
8547
+ "resolved": "https://registry.npmjs.org/mongodb-connection-string-url/-/mongodb-connection-string-url-3.0.1.tgz",
8548
+ "integrity": "sha512-XqMGwRX0Lgn05TDB4PyG2h2kKO/FfWJyCzYQbIhXUxz7ETt0I/FqHjUeqj37irJ+Dl1ZtU82uYyj14u2XsZKfg==",
8549
  "peer": true,
8550
  "dependencies": {
8551
+ "@types/whatwg-url": "^11.0.2",
8552
+ "whatwg-url": "^13.0.0"
8553
  }
8554
  },
8555
  "node_modules/ms": {
8556
+ "version": "2.1.3",
8557
+ "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
8558
+ "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="
8559
  },
8560
  "node_modules/napi-build-utils": {
8561
  "version": "1.0.2",
 
8774
  "set-blocking": "^2.0.0"
8775
  }
8776
  },
8777
+ "node_modules/nth-check": {
8778
+ "version": "2.1.1",
8779
+ "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz",
8780
+ "integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==",
8781
+ "dependencies": {
8782
+ "boolbase": "^1.0.0"
8783
+ },
8784
+ "funding": {
8785
+ "url": "https://github.com/fb55/nth-check?sponsor=1"
8786
+ }
8787
  },
8788
  "node_modules/object-assign": {
8789
  "version": "4.1.1",
 
8953
  "openai": "bin/cli"
8954
  }
8955
  },
8956
+ "node_modules/openai/node_modules/@types/node": {
8957
+ "version": "18.19.42",
8958
+ "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.42.tgz",
8959
+ "integrity": "sha512-d2ZFc/3lnK2YCYhos8iaNIYu9Vfhr92nHiyJHRltXWjXUBjEE+A4I58Tdbnw4VhggSW+2j5y5gTrLs4biNnubg==",
8960
+ "dependencies": {
8961
+ "undici-types": "~5.26.4"
8962
+ }
8963
+ },
8964
  "node_modules/optionator": {
8965
  "version": "0.9.3",
8966
  "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.3.tgz",
 
9107
  "url": "https://github.com/sponsors/sindresorhus"
9108
  }
9109
  },
 
 
 
 
 
 
 
 
 
 
 
9110
  "node_modules/parseurl": {
9111
  "version": "1.3.3",
9112
  "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
 
9632
  }
9633
  },
9634
  "node_modules/puppeteer": {
9635
+ "version": "23.3.0",
9636
+ "resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-23.3.0.tgz",
9637
+ "integrity": "sha512-e2jY8cdWSUGsrLxqGm3hIbJq/UIk1uOY8XY7SM51leXkH7shrIyE91lK90Q9byX6tte+cyL3HKqlWBEd6TjWTA==",
9638
  "hasInstallScript": true,
9639
  "dependencies": {
9640
+ "@puppeteer/browsers": "2.4.0",
9641
+ "chromium-bidi": "0.6.5",
9642
+ "cosmiconfig": "^9.0.0",
9643
+ "devtools-protocol": "0.0.1330662",
9644
+ "puppeteer-core": "23.3.0",
9645
+ "typed-query-selector": "^2.12.0"
9646
  },
9647
  "bin": {
9648
+ "puppeteer": "lib/cjs/puppeteer/node/cli.js"
9649
  },
9650
  "engines": {
9651
  "node": ">=18"
9652
  }
9653
  },
9654
  "node_modules/puppeteer-core": {
9655
+ "version": "23.3.0",
9656
+ "resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-23.3.0.tgz",
9657
+ "integrity": "sha512-sB2SsVMFs4gKad5OCdv6w5vocvtEUrRl0zQqSyRPbo/cj1Ktbarmhxy02Zyb9R9HrssBcJDZbkrvBnbaesPyYg==",
9658
  "dependencies": {
9659
+ "@puppeteer/browsers": "2.4.0",
9660
+ "chromium-bidi": "0.6.5",
9661
+ "debug": "^4.3.6",
9662
+ "devtools-protocol": "0.0.1330662",
9663
+ "typed-query-selector": "^2.12.0",
9664
+ "ws": "^8.18.0"
9665
  },
9666
  "engines": {
9667
  "node": ">=18"
 
10322
  "url": "https://github.com/sponsors/isaacs"
10323
  }
10324
  },
 
 
 
 
 
10325
  "node_modules/run-parallel": {
10326
  "version": "1.2.0",
10327
  "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz",
 
10418
  "resolved": "https://registry.npmjs.org/sax/-/sax-1.3.0.tgz",
10419
  "integrity": "sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA=="
10420
  },
 
 
 
 
 
 
 
 
 
 
 
10421
  "node_modules/semver": {
10422
+ "version": "7.6.3",
10423
+ "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz",
10424
+ "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==",
 
 
 
10425
  "bin": {
10426
  "semver": "bin/semver.js"
10427
  },
 
10429
  "node": ">=10"
10430
  }
10431
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10432
  "node_modules/send": {
10433
  "version": "0.18.0",
10434
  "resolved": "https://registry.npmjs.org/send/-/send-0.18.0.tgz",
 
10476
  "node": ">=4"
10477
  }
10478
  },
 
 
 
 
 
10479
  "node_modules/serve-static": {
10480
  "version": "1.15.0",
10481
  "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.15.0.tgz",
 
10780
  "version": "3.0.3",
10781
  "resolved": "https://registry.npmjs.org/sparse-bitfield/-/sparse-bitfield-3.0.3.tgz",
10782
  "integrity": "sha512-kvzhi7vqKTfkh0PZU+2D2PIllw2ymqJKujUcyPMd9Y75Nv4nPbGJZXNhxsgdQab2BmlDct1YnfQCguEvHr7VsQ==",
 
10783
  "peer": true,
10784
  "dependencies": {
10785
  "memory-pager": "^1.0.2"
 
10861
  }
10862
  },
10863
  "node_modules/streamx": {
10864
+ "version": "2.20.0",
10865
+ "resolved": "https://registry.npmjs.org/streamx/-/streamx-2.20.0.tgz",
10866
+ "integrity": "sha512-ZGd1LhDeGFucr1CUCTBOS58ZhEendd0ttpGT3usTvosS4ntIwKN9LJFp+OeCSprsCPL14BXVRZlHGRY1V9PVzQ==",
10867
  "dependencies": {
10868
+ "fast-fifo": "^1.3.2",
10869
+ "queue-tick": "^1.0.1",
10870
+ "text-decoder": "^1.1.0"
10871
  },
10872
  "optionalDependencies": {
10873
  "bare-events": "^2.2.0"
 
11054
  "url": "https://github.com/sponsors/ljharb"
11055
  }
11056
  },
 
 
 
 
 
11057
  "node_modules/tar": {
11058
  "version": "6.2.1",
11059
  "resolved": "https://registry.npmjs.org/tar/-/tar-6.2.1.tgz",
 
11071
  }
11072
  },
11073
  "node_modules/tar-fs": {
11074
+ "version": "3.0.6",
11075
+ "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.0.6.tgz",
11076
+ "integrity": "sha512-iokBDQQkUyeXhgPYaZxmczGPhnhXZ0CmrqI+MOb/WFGS9DW5wnfrLgtjUJBvz50vQ3qfRwJ62QVoCFu8mPVu5w==",
11077
  "dependencies": {
11078
  "pump": "^3.0.0",
11079
  "tar-stream": "^3.1.5"
 
11162
  "url": "https://github.com/sponsors/isaacs"
11163
  }
11164
  },
11165
+ "node_modules/text-decoder": {
11166
+ "version": "1.1.1",
11167
+ "resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.1.1.tgz",
11168
+ "integrity": "sha512-8zll7REEv4GDD3x4/0pW+ppIxSNs7H1J10IKFZsuOMscumCdM2a+toDGLPA3T+1+fLBql4zbt5z83GEQGGV5VA==",
11169
+ "dependencies": {
11170
+ "b4a": "^1.6.4"
11171
+ }
11172
+ },
11173
  "node_modules/text-table": {
11174
  "version": "0.2.0",
11175
  "resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz",
 
11276
  }
11277
  },
11278
  "node_modules/tr46": {
11279
+ "version": "4.1.1",
11280
+ "resolved": "https://registry.npmjs.org/tr46/-/tr46-4.1.1.tgz",
11281
+ "integrity": "sha512-2lv/66T7e5yNyhAAC4NaKe5nVavzuGJQVVtRYLyQ2OI8tsJ61PMLlelehb0wi2Hx6+hT/OJUWZcw8MjlSRnxvw==",
11282
  "peer": true,
11283
  "dependencies": {
11284
+ "punycode": "^2.3.0"
11285
  },
11286
  "engines": {
11287
+ "node": ">=14"
11288
  }
11289
  },
11290
  "node_modules/ts-deepmerge": {
 
11520
  "url": "https://github.com/sponsors/ljharb"
11521
  }
11522
  },
11523
+ "node_modules/typed-query-selector": {
11524
+ "version": "2.12.0",
11525
+ "resolved": "https://registry.npmjs.org/typed-query-selector/-/typed-query-selector-2.12.0.tgz",
11526
+ "integrity": "sha512-SbklCd1F0EiZOyPiW192rrHZzZ5sBijB6xM+cpmrwDqObvdtunOHHIk9fCGsoK5JVIYXoyEp4iEdE3upFH3PAg=="
11527
+ },
11528
  "node_modules/typescript": {
11529
+ "version": "5.5.4",
11530
+ "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.4.tgz",
11531
+ "integrity": "sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==",
11532
  "devOptional": true,
11533
  "bin": {
11534
  "tsc": "bin/tsc",
 
11538
  "node": ">=14.17"
11539
  }
11540
  },
11541
+ "node_modules/uhyphen": {
11542
+ "version": "0.2.0",
11543
+ "resolved": "https://registry.npmjs.org/uhyphen/-/uhyphen-0.2.0.tgz",
11544
+ "integrity": "sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA=="
11545
+ },
11546
  "node_modules/unbox-primitive": {
11547
  "version": "1.0.2",
11548
  "resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.0.2.tgz",
 
11719
  "node": ">= 0.8"
11720
  }
11721
  },
 
 
 
 
 
 
 
 
 
 
 
11722
  "node_modules/walker": {
11723
  "version": "1.0.8",
11724
  "resolved": "https://registry.npmjs.org/walker/-/walker-1.0.8.tgz",
 
11752
  "version": "7.0.0",
11753
  "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz",
11754
  "integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==",
11755
+ "peer": true,
11756
  "engines": {
11757
  "node": ">=12"
11758
  }
 
11778
  "node": ">=0.8.0"
11779
  }
11780
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11781
  "node_modules/whatwg-url": {
11782
+ "version": "13.0.0",
11783
+ "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-13.0.0.tgz",
11784
+ "integrity": "sha512-9WWbymnqj57+XEuqADHrCJ2eSXzn8WXIW/YSGaZtb2WKAInQ6CHfaUUcTyyver0p8BDg5StLQq8h1vtZuwmOig==",
11785
  "peer": true,
11786
  "dependencies": {
11787
+ "tr46": "^4.1.1",
11788
  "webidl-conversions": "^7.0.0"
11789
  },
11790
  "engines": {
11791
+ "node": ">=16"
11792
  }
11793
  },
11794
  "node_modules/which": {
 
11884
  }
11885
  },
11886
  "node_modules/ws": {
11887
+ "version": "8.18.0",
11888
+ "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.0.tgz",
11889
+ "integrity": "sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==",
11890
  "engines": {
11891
  "node": ">=10.0.0"
11892
  },
 
11908
  "resolved": "https://registry.npmjs.org/xml/-/xml-1.0.1.tgz",
11909
  "integrity": "sha512-huCv9IH9Tcf95zuYCsQraZtWnJvBtLVE0QHMOs8bWyZAFZNDcYjsPq1nEx8jKA9y+Beo9v+7OBPRisQTjinQMw=="
11910
  },
 
 
 
 
 
 
 
 
11911
  "node_modules/xml2js": {
11912
  "version": "0.5.0",
11913
  "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz",
 
11928
  "node": ">=4.0"
11929
  }
11930
  },
 
 
 
 
 
11931
  "node_modules/y18n": {
11932
  "version": "5.0.8",
11933
  "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
 
12012
  }
12013
  },
12014
  "node_modules/zod": {
12015
+ "version": "3.23.8",
12016
+ "resolved": "https://registry.npmjs.org/zod/-/zod-3.23.8.tgz",
12017
+ "integrity": "sha512-XBx9AXhXktjUqnepgTiE5flcKIYWi/rme0Eaj+5Y0lftuGBq+jyRu/md4WnuxqgP1ubdpNCsYEYPxrzVHD8d6g==",
12018
  "funding": {
12019
  "url": "https://github.com/sponsors/colinhacks"
12020
  }
backend/functions/package.json CHANGED
@@ -34,7 +34,7 @@
34
  "archiver": "^6.0.1",
35
  "axios": "^1.3.3",
36
  "bcrypt": "^5.1.0",
37
- "civkit": "^0.6.5-047c0d8",
38
  "core-js": "^3.37.1",
39
  "cors": "^2.8.5",
40
  "dayjs": "^1.11.9",
@@ -43,13 +43,13 @@
43
  "firebase-functions": "^4.9.0",
44
  "htmlparser2": "^9.0.0",
45
  "jose": "^5.1.0",
46
- "jsdom": "^24.0.0",
47
  "langdetect": "^0.2.1",
 
48
  "maxmind": "^4.3.18",
49
  "minio": "^7.1.3",
50
  "openai": "^4.20.0",
51
  "pdfjs-dist": "^4.2.67",
52
- "puppeteer": "^22.7.1",
53
  "puppeteer-extra": "^3.3.6",
54
  "puppeteer-extra-plugin-block-resources": "^2.4.3",
55
  "puppeteer-extra-plugin-page-proxy": "^2.0.0",
@@ -68,7 +68,7 @@
68
  "@types/bcrypt": "^5.0.0",
69
  "@types/cors": "^2.8.17",
70
  "@types/generic-pool": "^3.8.1",
71
- "@types/node": "^18",
72
  "@types/set-cookie-parser": "^2.4.7",
73
  "@typescript-eslint/eslint-plugin": "^5.12.0",
74
  "@typescript-eslint/parser": "^5.12.0",
@@ -77,7 +77,7 @@
77
  "eslint-plugin-import": "^2.25.4",
78
  "firebase-functions-test": "^3.0.0",
79
  "replicate": "^0.16.1",
80
- "typescript": "^5.1.6"
81
  },
82
  "private": true,
83
  "exports": {
 
34
  "archiver": "^6.0.1",
35
  "axios": "^1.3.3",
36
  "bcrypt": "^5.1.0",
37
+ "civkit": "^0.7.0-0f8889a",
38
  "core-js": "^3.37.1",
39
  "cors": "^2.8.5",
40
  "dayjs": "^1.11.9",
 
43
  "firebase-functions": "^4.9.0",
44
  "htmlparser2": "^9.0.0",
45
  "jose": "^5.1.0",
 
46
  "langdetect": "^0.2.1",
47
+ "linkedom": "^0.18.4",
48
  "maxmind": "^4.3.18",
49
  "minio": "^7.1.3",
50
  "openai": "^4.20.0",
51
  "pdfjs-dist": "^4.2.67",
52
+ "puppeteer": "^23.3.0",
53
  "puppeteer-extra": "^3.3.6",
54
  "puppeteer-extra-plugin-block-resources": "^2.4.3",
55
  "puppeteer-extra-plugin-page-proxy": "^2.0.0",
 
68
  "@types/bcrypt": "^5.0.0",
69
  "@types/cors": "^2.8.17",
70
  "@types/generic-pool": "^3.8.1",
71
+ "@types/node": "^20.14.13",
72
  "@types/set-cookie-parser": "^2.4.7",
73
  "@typescript-eslint/eslint-plugin": "^5.12.0",
74
  "@typescript-eslint/parser": "^5.12.0",
 
77
  "eslint-plugin-import": "^2.25.4",
78
  "firebase-functions-test": "^3.0.0",
79
  "replicate": "^0.16.1",
80
+ "typescript": "^5.5.4"
81
  },
82
  "private": true,
83
  "exports": {
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -1,7 +1,6 @@
1
  import {
2
  assignTransferProtocolMeta, marshalErrorLike,
3
  RPCHost, RPCReflection,
4
- HashManager,
5
  AssertionFailureError, ParamValidationError, Defer,
6
  } from 'civkit';
7
  import { singleton } from 'tsyringe';
@@ -11,22 +10,17 @@ import _ from 'lodash';
11
  import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
12
  import { Request, Response } from 'express';
13
  const pNormalizeUrl = import("@esm2cjs/normalize-url");
14
- import { AltTextService } from '../services/alt-text';
15
- import TurndownService from 'turndown';
16
  import { Crawled } from '../db/crawled';
17
- import { cleanAttribute } from '../utils/misc';
18
  import { randomUUID } from 'crypto';
19
  import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
20
 
21
  import { countGPTToken as estimateToken } from '../shared/utils/openai';
22
  import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
23
  import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
24
- import { PDFExtractor } from '../services/pdf-extract';
25
  import { DomainBlockade } from '../db/domain-blockade';
26
  import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
27
  import { JSDomControl } from '../services/jsdom';
28
-
29
- const md5Hasher = new HashManager('md5', 'hex');
30
 
31
  export interface ExtraScrappingOptions extends ScrappingOptions {
32
  withIframe?: boolean;
@@ -35,29 +29,6 @@ export interface ExtraScrappingOptions extends ScrappingOptions {
35
  keepImgDataUrl?: boolean;
36
  }
37
 
38
- export interface FormattedPage {
39
- title?: string;
40
- description?: string;
41
- url?: string;
42
- content?: string;
43
- publishedTime?: string;
44
- html?: string;
45
- text?: string;
46
- screenshotUrl?: string;
47
- screenshot?: Buffer;
48
- pageshotUrl?: string;
49
- pageshot?: Buffer;
50
- links?: { [k: string]: string; };
51
- images?: { [k: string]: string; };
52
- usage?: {
53
- total_tokens?: number;
54
- totalTokens?: number;
55
- tokens?: number;
56
- };
57
-
58
- toString: () => string;
59
- }
60
-
61
  const indexProto = {
62
  toString: function (): string {
63
  return _(this)
@@ -72,8 +43,6 @@ const indexProto = {
72
  export class CrawlerHost extends RPCHost {
73
  logger = this.globalLogger.child({ service: this.constructor.name });
74
 
75
- turnDownPlugins = [require('turndown-plugin-gfm').tables];
76
-
77
  cacheRetentionMs = 1000 * 3600 * 24 * 7;
78
  cacheValidMs = 1000 * 3600;
79
  urlValidMs = 1000 * 3600 * 4;
@@ -83,8 +52,7 @@ export class CrawlerHost extends RPCHost {
83
  protected globalLogger: Logger,
84
  protected puppeteerControl: PuppeteerControl,
85
  protected jsdomControl: JSDomControl,
86
- protected altTextService: AltTextService,
87
- protected pdfExtractor: PDFExtractor,
88
  protected firebaseObjectStorage: FirebaseStorageBucketControl,
89
  protected rateLimitControl: RateLimitControl,
90
  protected threadLocal: AsyncContext,
@@ -148,448 +116,6 @@ export class CrawlerHost extends RPCHost {
148
  return indexObject;
149
  }
150
 
151
- getTurndown(options?: {
152
- noRules?: boolean | string,
153
- url?: string | URL;
154
- imgDataUrlToObjectUrl?: boolean;
155
- }) {
156
- const turnDownService = new TurndownService({
157
- codeBlockStyle: 'fenced',
158
- preformattedCode: true,
159
- } as any);
160
- if (!options?.noRules) {
161
- turnDownService.addRule('remove-irrelevant', {
162
- filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'],
163
- replacement: () => ''
164
- });
165
- turnDownService.addRule('truncate-svg', {
166
- filter: 'svg' as any,
167
- replacement: () => ''
168
- });
169
- turnDownService.addRule('title-as-h1', {
170
- filter: ['title'],
171
- replacement: (innerText) => `${innerText}\n===============\n`
172
- });
173
- }
174
-
175
- if (options?.imgDataUrlToObjectUrl) {
176
- turnDownService.addRule('data-url-to-pseudo-object-url', {
177
- filter: (node) => Boolean(node.tagName === 'IMG' && node.getAttribute('src')?.startsWith('data:')),
178
- replacement: (_content, node: any) => {
179
- const src = (node.getAttribute('src') || '').trim();
180
- const alt = cleanAttribute(node.getAttribute('alt')) || '';
181
-
182
- if (options.url) {
183
- const refUrl = new URL(options.url);
184
- const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(src)}`);
185
-
186
- return `![${alt}](${mappedUrl})`;
187
- }
188
-
189
- return `![${alt}](blob:${md5Hasher.hash(src)})`;
190
- }
191
- });
192
- }
193
-
194
- turnDownService.addRule('improved-paragraph', {
195
- filter: 'p',
196
- replacement: (innerText) => {
197
- const trimmed = innerText.trim();
198
- if (!trimmed) {
199
- return '';
200
- }
201
-
202
- return `${trimmed.replace(/\n{3,}/g, '\n\n')}\n\n`;
203
- }
204
- });
205
- turnDownService.addRule('improved-inline-link', {
206
- filter: function (node, options) {
207
- return Boolean(
208
- options.linkStyle === 'inlined' &&
209
- node.nodeName === 'A' &&
210
- node.getAttribute('href')
211
- );
212
- },
213
-
214
- replacement: function (content, node: any) {
215
- let href = node.getAttribute('href');
216
- if (href) href = href.replace(/([()])/g, '\\$1');
217
- let title = cleanAttribute(node.getAttribute('title'));
218
- if (title) title = ' "' + title.replace(/"/g, '\\"') + '"';
219
-
220
- const fixedContent = content.replace(/\s+/g, ' ').trim();
221
- let fixedHref = href.replace(/\s+/g, '').trim();
222
- if (options?.url) {
223
- try {
224
- fixedHref = new URL(fixedHref, options.url).toString();
225
- } catch (_err) {
226
- void 0;
227
- }
228
- }
229
-
230
- return `[${fixedContent}](${fixedHref}${title || ''})`;
231
- }
232
- });
233
- turnDownService.addRule('improved-code', {
234
- filter: function (node: any) {
235
- let hasSiblings = node.previousSibling || node.nextSibling;
236
- let isCodeBlock = node.parentNode.nodeName === 'PRE' && !hasSiblings;
237
-
238
- return node.nodeName === 'CODE' && !isCodeBlock;
239
- },
240
-
241
- replacement: function (inputContent: any) {
242
- if (!inputContent) return '';
243
- let content = inputContent;
244
-
245
- let delimiter = '`';
246
- let matches = content.match(/`+/gm) || [];
247
- while (matches.indexOf(delimiter) !== -1) delimiter = delimiter + '`';
248
- if (content.includes('\n')) {
249
- delimiter = '```';
250
- }
251
-
252
- let extraSpace = delimiter === '```' ? '\n' : /^`|^ .*?[^ ].* $|`$/.test(content) ? ' ' : '';
253
-
254
- return delimiter + extraSpace + content + (delimiter === '```' && !content.endsWith(extraSpace) ? extraSpace : '') + delimiter;
255
- }
256
- });
257
-
258
- return turnDownService;
259
- }
260
-
261
- getGeneralSnapshotMixins(snapshot: PageSnapshot) {
262
- let inferred;
263
- const mixin: any = {};
264
- if (this.threadLocal.get('withImagesSummary')) {
265
- inferred ??= this.jsdomControl.inferSnapshot(snapshot);
266
- const imageSummary = {} as { [k: string]: string; };
267
- const imageIdxTrack = new Map<string, number[]>();
268
-
269
- let imgIdx = 0;
270
-
271
- for (const img of inferred.imgs) {
272
- const imgSerial = ++imgIdx;
273
- const idxArr = imageIdxTrack.has(img.src) ? imageIdxTrack.get(img.src)! : [];
274
- idxArr.push(imgSerial);
275
- imageIdxTrack.set(img.src, idxArr);
276
- imageSummary[img.src] = img.alt || '';
277
- }
278
-
279
- mixin.images =
280
- _(imageSummary)
281
- .toPairs()
282
- .map(
283
- ([url, alt], i) => {
284
- return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
285
- }
286
- ).fromPairs()
287
- .value();
288
- }
289
- if (this.threadLocal.get('withLinksSummary')) {
290
- inferred ??= this.jsdomControl.inferSnapshot(snapshot);
291
- mixin.links = _.invert(inferred.links || {});
292
- }
293
-
294
- return mixin;
295
- }
296
-
297
- async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & {
298
- screenshotUrl?: string;
299
- pageshotUrl?: string;
300
- }, nominalUrl?: URL) {
301
- if (mode === 'screenshot') {
302
- if (snapshot.screenshot && !snapshot.screenshotUrl) {
303
- const fid = `instant-screenshots/${randomUUID()}`;
304
- await this.firebaseObjectStorage.saveFile(fid, snapshot.screenshot, {
305
- metadata: {
306
- contentType: 'image/png',
307
- }
308
- });
309
- snapshot.screenshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
310
- }
311
-
312
- return {
313
- ...this.getGeneralSnapshotMixins(snapshot),
314
- // html: snapshot.html,
315
- screenshotUrl: snapshot.screenshotUrl,
316
- toString() {
317
- return this.screenshotUrl;
318
- }
319
- } as FormattedPage;
320
- }
321
- if (mode === 'pageshot') {
322
- if (snapshot.pageshot && !snapshot.pageshotUrl) {
323
- const fid = `instant-screenshots/${randomUUID()}`;
324
- await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, {
325
- metadata: {
326
- contentType: 'image/png',
327
- }
328
- });
329
- snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
330
- }
331
-
332
- return {
333
- ...this.getGeneralSnapshotMixins(snapshot),
334
- html: snapshot.html,
335
- pageshotUrl: snapshot.pageshotUrl,
336
- toString() {
337
- return this.pageshotUrl;
338
- }
339
- } as FormattedPage;
340
- }
341
- if (mode === 'html') {
342
- return {
343
- ...this.getGeneralSnapshotMixins(snapshot),
344
- html: snapshot.html,
345
- toString() {
346
- return this.html;
347
- }
348
- } as FormattedPage;
349
- }
350
-
351
- let pdfMode = false;
352
- if (snapshot.pdfs?.length && !snapshot.title) {
353
- const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0],
354
- this.threadLocal.get('cacheTolerance')
355
- );
356
- if (pdf) {
357
- pdfMode = true;
358
- snapshot.title = pdf.meta?.Title;
359
- snapshot.text = pdf.text || snapshot.text;
360
- snapshot.parsed = {
361
- content: pdf.content,
362
- textContent: pdf.content,
363
- length: pdf.content?.length,
364
- byline: pdf.meta?.Author,
365
- lang: pdf.meta?.Language || undefined,
366
- title: pdf.meta?.Title,
367
- publishedTime: this.pdfExtractor.parsePdfDate(pdf.meta?.ModDate || pdf.meta?.CreationDate)?.toISOString(),
368
- };
369
- }
370
- }
371
-
372
- if (mode === 'text') {
373
- return {
374
- ...this.getGeneralSnapshotMixins(snapshot),
375
- text: snapshot.text,
376
- toString() {
377
- return this.text;
378
- }
379
- } as FormattedPage;
380
- }
381
- const imgDataUrlToObjectUrl = !Boolean(this.threadLocal.get('keepImgDataUrl'));
382
-
383
- let contentText = '';
384
- const imageSummary = {} as { [k: string]: string; };
385
- const imageIdxTrack = new Map<string, number[]>();
386
- const uid = this.threadLocal.get('uid');
387
- do {
388
- if (pdfMode) {
389
- contentText = snapshot.parsed?.content || snapshot.text;
390
- break;
391
- }
392
-
393
- if (
394
- snapshot.maxElemDepth! > 256 ||
395
- (!uid && snapshot.elemCount! > 10_000) ||
396
- snapshot.elemCount! > 70_000
397
- ) {
398
- this.logger.warn('Degrading to text to protect the server', { url: snapshot.href });
399
- contentText = snapshot.text;
400
- break;
401
- }
402
-
403
- const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
404
- let toBeTurnedToMd = jsDomElementOfHTML;
405
- let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
406
- if (mode !== 'markdown' && snapshot.parsed?.content) {
407
- const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
408
- const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
409
- const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
410
-
411
- // If Readability did its job
412
- if (par2.length >= 0.3 * par1.length) {
413
- turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
414
- if (snapshot.parsed.content) {
415
- toBeTurnedToMd = jsDomElementOfParsed;
416
- }
417
- }
418
- }
419
-
420
- for (const plugin of this.turnDownPlugins) {
421
- turnDownService = turnDownService.use(plugin);
422
- }
423
- const urlToAltMap: { [k: string]: string | undefined; } = {};
424
- if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
425
- const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
426
- const r = await this.altTextService.getAltText(x).catch((err: any) => {
427
- this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
428
- return undefined;
429
- });
430
- if (r && x.src) {
431
- urlToAltMap[x.src.trim()] = r;
432
- }
433
- });
434
-
435
- await Promise.all(tasks);
436
- }
437
- let imgIdx = 0;
438
- turnDownService.addRule('img-generated-alt', {
439
- filter: 'img',
440
- replacement: (_content, node: any) => {
441
- let linkPreferredSrc = (node.getAttribute('src') || '').trim();
442
- if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
443
- const dataSrc = (node.getAttribute('data-src') || '').trim();
444
- if (dataSrc && !dataSrc.startsWith('data:')) {
445
- linkPreferredSrc = dataSrc;
446
- }
447
- }
448
-
449
- let src;
450
- try {
451
- src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString();
452
- } catch (_err) {
453
- void 0;
454
- }
455
- const alt = cleanAttribute(node.getAttribute('alt'));
456
- if (!src) {
457
- return '';
458
- }
459
- const mapped = urlToAltMap[src];
460
- const imgSerial = ++imgIdx;
461
- const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
462
- idxArr.push(imgSerial);
463
- imageIdxTrack.set(src, idxArr);
464
-
465
- if (mapped) {
466
- imageSummary[src] = mapped || alt;
467
-
468
- if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
469
- const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
470
- mappedUrl.protocol = 'blob:';
471
-
472
- return `![Image ${imgIdx}: ${mapped || alt}](${mappedUrl})`;
473
- }
474
-
475
- return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
476
- }
477
-
478
- imageSummary[src] = alt || '';
479
-
480
- if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
481
- const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
482
- mappedUrl.protocol = 'blob:';
483
-
484
- return alt ? `![Image ${imgIdx}: ${alt}](${mappedUrl})` : `![Image ${imgIdx}](${mappedUrl})`;
485
- }
486
-
487
- return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`;
488
- }
489
- });
490
-
491
- if (toBeTurnedToMd) {
492
- try {
493
- contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
494
- } catch (err) {
495
- this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
496
- const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
497
- try {
498
- contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
499
- } catch (err2) {
500
- this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
501
- }
502
- }
503
- }
504
-
505
- if (
506
- !contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
507
- && toBeTurnedToMd !== jsDomElementOfHTML
508
- ) {
509
- try {
510
- contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
511
- } catch (err) {
512
- this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
513
- const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
514
- try {
515
- contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
516
- } catch (err2) {
517
- this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
518
- }
519
- }
520
- }
521
- if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) {
522
- contentText = snapshot.text;
523
- }
524
- } while (false);
525
-
526
- const cleanText = (contentText || '').trim();
527
-
528
- const formatted: FormattedPage = {
529
- title: (snapshot.parsed?.title || snapshot.title || '').trim(),
530
- url: nominalUrl?.toString() || snapshot.href?.trim(),
531
- content: cleanText,
532
- publishedTime: snapshot.parsed?.publishedTime || undefined,
533
-
534
- toString() {
535
- if (mode === 'markdown') {
536
- return this.content as string;
537
- }
538
-
539
- const mixins = [];
540
- if (this.publishedTime) {
541
- mixins.push(`Published Time: ${this.publishedTime}`);
542
- }
543
- const suffixMixins = [];
544
- if (this.images) {
545
- const imageSummaryChunks = ['Images:'];
546
- for (const [k, v] of Object.entries(this.images)) {
547
- imageSummaryChunks.push(`- ![${k}](${v})`);
548
- }
549
- if (imageSummaryChunks.length === 1) {
550
- imageSummaryChunks.push('This page does not seem to contain any images.');
551
- }
552
- suffixMixins.push(imageSummaryChunks.join('\n'));
553
- }
554
- if (this.links) {
555
- const linkSummaryChunks = ['Links/Buttons:'];
556
- for (const [k, v] of Object.entries(this.links)) {
557
- linkSummaryChunks.push(`- [${k}](${v})`);
558
- }
559
- if (linkSummaryChunks.length === 1) {
560
- linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
561
- }
562
- suffixMixins.push(linkSummaryChunks.join('\n'));
563
- }
564
-
565
- return `Title: ${this.title}
566
-
567
- URL Source: ${this.url}
568
- ${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''}
569
- Markdown Content:
570
- ${this.content}
571
- ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
572
- }
573
- };
574
-
575
- if (this.threadLocal.get('withImagesSummary')) {
576
- formatted.images =
577
- _(imageSummary)
578
- .toPairs()
579
- .map(
580
- ([url, alt], i) => {
581
- return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
582
- }
583
- ).fromPairs()
584
- .value();
585
- }
586
- if (this.threadLocal.get('withLinksSummary')) {
587
- formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {});
588
- }
589
-
590
- return formatted as FormattedPage;
591
- }
592
-
593
  @CloudHTTPv2({
594
  name: 'crawl2',
595
  runtime: {
@@ -604,7 +130,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
604
  })
605
  @CloudHTTPv2({
606
  runtime: {
607
- memory: '4GiB',
608
  cpu: 4,
609
  timeoutSeconds: 300,
610
  concurrency: 22,
@@ -723,7 +249,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
723
  continue;
724
  }
725
 
726
- const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
727
  chargeAmount = this.assignChargeAmount(formatted);
728
  sseStream.write({
729
  event: 'data',
@@ -754,7 +280,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
754
  continue;
755
  }
756
 
757
- const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
758
  chargeAmount = this.assignChargeAmount(formatted);
759
 
760
  if (crawlerOptions.timeout === undefined) {
@@ -770,7 +296,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
770
  throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
771
  }
772
 
773
- const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl);
774
  chargeAmount = this.assignChargeAmount(formatted);
775
 
776
  return formatted;
@@ -782,24 +308,24 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
782
  continue;
783
  }
784
 
785
- const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
786
  chargeAmount = this.assignChargeAmount(formatted);
787
 
788
  if (crawlerOptions.timeout === undefined) {
789
  if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
790
 
791
- return assignTransferProtocolMeta(`${formatted}`,
792
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
793
  );
794
  }
795
  if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
796
 
797
- return assignTransferProtocolMeta(`${formatted}`,
798
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
799
  );
800
  }
801
 
802
- return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
803
  }
804
  }
805
 
@@ -807,22 +333,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
807
  throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
808
  }
809
 
810
- const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl);
811
  chargeAmount = this.assignChargeAmount(formatted);
812
  if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
813
 
814
- return assignTransferProtocolMeta(`${formatted}`,
815
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
816
  );
817
  }
818
  if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
819
 
820
- return assignTransferProtocolMeta(`${formatted}`,
821
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
822
  );
823
  }
824
 
825
- return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
826
  }
827
 
828
  async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
@@ -1181,7 +707,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
1181
 
1182
  } catch (err) {
1183
  if (lastSnapshot) {
1184
- return this.formatSnapshot(mode, lastSnapshot, url);
1185
  }
1186
 
1187
  throw err;
@@ -1191,6 +717,6 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
1191
  throw new AssertionFailureError(`No content available`);
1192
  }
1193
 
1194
- return this.formatSnapshot(mode, lastSnapshot, url);
1195
  }
1196
  }
 
1
  import {
2
  assignTransferProtocolMeta, marshalErrorLike,
3
  RPCHost, RPCReflection,
 
4
  AssertionFailureError, ParamValidationError, Defer,
5
  } from 'civkit';
6
  import { singleton } from 'tsyringe';
 
10
  import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
11
  import { Request, Response } from 'express';
12
  const pNormalizeUrl = import("@esm2cjs/normalize-url");
 
 
13
  import { Crawled } from '../db/crawled';
 
14
  import { randomUUID } from 'crypto';
15
  import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
16
 
17
  import { countGPTToken as estimateToken } from '../shared/utils/openai';
18
  import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
19
  import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
 
20
  import { DomainBlockade } from '../db/domain-blockade';
21
  import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
22
  import { JSDomControl } from '../services/jsdom';
23
+ import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
 
24
 
25
  export interface ExtraScrappingOptions extends ScrappingOptions {
26
  withIframe?: boolean;
 
29
  keepImgDataUrl?: boolean;
30
  }
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  const indexProto = {
33
  toString: function (): string {
34
  return _(this)
 
43
  export class CrawlerHost extends RPCHost {
44
  logger = this.globalLogger.child({ service: this.constructor.name });
45
 
 
 
46
  cacheRetentionMs = 1000 * 3600 * 24 * 7;
47
  cacheValidMs = 1000 * 3600;
48
  urlValidMs = 1000 * 3600 * 4;
 
52
  protected globalLogger: Logger,
53
  protected puppeteerControl: PuppeteerControl,
54
  protected jsdomControl: JSDomControl,
55
+ protected snapshotFormatter: SnapshotFormatter,
 
56
  protected firebaseObjectStorage: FirebaseStorageBucketControl,
57
  protected rateLimitControl: RateLimitControl,
58
  protected threadLocal: AsyncContext,
 
116
  return indexObject;
117
  }
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  @CloudHTTPv2({
120
  name: 'crawl2',
121
  runtime: {
 
130
  })
131
  @CloudHTTPv2({
132
  runtime: {
133
+ memory: '8GiB',
134
  cpu: 4,
135
  timeoutSeconds: 300,
136
  concurrency: 22,
 
249
  continue;
250
  }
251
 
252
+ const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
253
  chargeAmount = this.assignChargeAmount(formatted);
254
  sseStream.write({
255
  event: 'data',
 
280
  continue;
281
  }
282
 
283
+ const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
284
  chargeAmount = this.assignChargeAmount(formatted);
285
 
286
  if (crawlerOptions.timeout === undefined) {
 
296
  throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
297
  }
298
 
299
+ const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
300
  chargeAmount = this.assignChargeAmount(formatted);
301
 
302
  return formatted;
 
308
  continue;
309
  }
310
 
311
+ const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
312
  chargeAmount = this.assignChargeAmount(formatted);
313
 
314
  if (crawlerOptions.timeout === undefined) {
315
  if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
316
 
317
+ return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
318
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
319
  );
320
  }
321
  if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
322
 
323
+ return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
324
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
325
  );
326
  }
327
 
328
+ return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
329
  }
330
  }
331
 
 
333
  throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
334
  }
335
 
336
+ const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
337
  chargeAmount = this.assignChargeAmount(formatted);
338
  if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
339
 
340
+ return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
341
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
342
  );
343
  }
344
  if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
345
 
346
+ return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
347
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
348
  );
349
  }
350
 
351
+ return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
352
  }
353
 
354
  async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
 
707
 
708
  } catch (err) {
709
  if (lastSnapshot) {
710
+ return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
711
  }
712
 
713
  throw err;
 
717
  throw new AssertionFailureError(`No content available`);
718
  }
719
 
720
+ return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
721
  }
722
  }
backend/functions/src/cloud-functions/data-crunching.ts CHANGED
@@ -18,6 +18,7 @@ import { appendFile } from 'fs/promises';
18
  import { createGzip } from 'zlib';
19
  import { getFunctions } from 'firebase-admin/functions';
20
  import { GoogleAuth } from 'google-auth-library';
 
21
 
22
  dayjs.extend(require('dayjs/plugin/utc'));
23
 
@@ -57,6 +58,7 @@ export class DataCrunchingHost extends RPCHost {
57
  protected globalLogger: Logger,
58
 
59
  protected crawler: CrawlerHost,
 
60
  protected tempFileManager: TempFileManager,
61
  protected firebaseObjectStorage: FirebaseStorageBucketControl,
62
  ) {
@@ -265,9 +267,9 @@ export class DataCrunchingHost extends RPCHost {
265
  try {
266
  const snapshot = JSON.parse(snapshotTxt.toString('utf-8'));
267
 
268
- let formatted = await this.crawler.formatSnapshot('default', snapshot);
269
  if (!formatted.content) {
270
- formatted = await this.crawler.formatSnapshot('markdown', snapshot);
271
  }
272
 
273
  await nextDrainDeferred.promise;
 
18
  import { createGzip } from 'zlib';
19
  import { getFunctions } from 'firebase-admin/functions';
20
  import { GoogleAuth } from 'google-auth-library';
21
+ import { SnapshotFormatter } from '../services/snapshot-formatter';
22
 
23
  dayjs.extend(require('dayjs/plugin/utc'));
24
 
 
58
  protected globalLogger: Logger,
59
 
60
  protected crawler: CrawlerHost,
61
+ protected snapshotFormatter: SnapshotFormatter,
62
  protected tempFileManager: TempFileManager,
63
  protected firebaseObjectStorage: FirebaseStorageBucketControl,
64
  ) {
 
267
  try {
268
  const snapshot = JSON.parse(snapshotTxt.toString('utf-8'));
269
 
270
+ let formatted = await this.snapshotFormatter.formatSnapshot('default', snapshot);
271
  if (!formatted.content) {
272
+ formatted = await this.snapshotFormatter.formatSnapshot('markdown', snapshot);
273
  }
274
 
275
  await nextDrainDeferred.promise;
backend/functions/src/cloud-functions/searcher.ts CHANGED
@@ -11,11 +11,12 @@ import _ from 'lodash';
11
  import { Request, Response } from 'express';
12
  import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
13
  import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search';
14
- import { CrawlerHost, ExtraScrappingOptions, FormattedPage } from './crawler';
15
  import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
16
  import { SearchResult } from '../db/searched';
17
  import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
18
  import { CrawlerOptions } from '../dto/scrapping-options';
 
19
 
20
 
21
  @singleton()
@@ -36,6 +37,7 @@ export class SearcherHost extends RPCHost {
36
  protected threadLocal: AsyncContext,
37
  protected braveSearchService: BraveSearchService,
38
  protected crawler: CrawlerHost,
 
39
  ) {
40
  super(...arguments);
41
  }
@@ -324,7 +326,7 @@ export class SearcherHost extends RPCHost {
324
  if (snapshotMap.has(x)) {
325
  return snapshotMap.get(x);
326
  }
327
- return this.crawler.formatSnapshot(mode, x, urls[i]).then((r) => {
328
  r.title ??= upstreamSearchResult.title;
329
  r.description = upstreamSearchResult.description;
330
  snapshotMap.set(x, r);
 
11
  import { Request, Response } from 'express';
12
  import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
13
  import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search';
14
+ import { CrawlerHost, ExtraScrappingOptions } from './crawler';
15
  import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
16
  import { SearchResult } from '../db/searched';
17
  import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
18
  import { CrawlerOptions } from '../dto/scrapping-options';
19
+ import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
20
 
21
 
22
  @singleton()
 
37
  protected threadLocal: AsyncContext,
38
  protected braveSearchService: BraveSearchService,
39
  protected crawler: CrawlerHost,
40
+ protected snapshotFormatter: SnapshotFormatter,
41
  ) {
42
  super(...arguments);
43
  }
 
326
  if (snapshotMap.has(x)) {
327
  return snapshotMap.get(x);
328
  }
329
+ return this.snapshotFormatter.formatSnapshot(mode, x, urls[i]).then((r) => {
330
  r.title ??= upstreamSearchResult.title;
331
  r.description = upstreamSearchResult.description;
332
  snapshotMap.set(x, r);
backend/functions/src/services/jsdom.ts CHANGED
@@ -2,18 +2,19 @@ import { container, singleton } from 'tsyringe';
2
  import { AsyncService, marshalErrorLike } from 'civkit';
3
  import { Logger } from '../shared/services/logger';
4
  import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
5
- import { JSDOM, VirtualConsole } from 'jsdom';
6
  import { Readability } from '@mozilla/readability';
7
  import TurndownService from 'turndown';
 
8
 
9
- const virtualConsole = new VirtualConsole();
10
- virtualConsole.on('error', () => void 0);
11
 
12
  @singleton()
13
  export class JSDomControl extends AsyncService {
14
 
15
  logger = this.globalLogger.child({ service: this.constructor.name });
16
 
 
 
17
  constructor(
18
  protected globalLogger: Logger,
19
  ) {
@@ -22,22 +23,34 @@ export class JSDomControl extends AsyncService {
22
 
23
  override async init() {
24
  await this.dependencyReady();
 
25
  this.emit('ready');
26
  }
27
 
28
- narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
29
  targetSelector?: string | string[];
30
  removeSelector?: string | string[];
31
  withIframe?: boolean;
32
- }): PageSnapshot | undefined {
33
  if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe) {
34
  return snapshot;
35
  }
36
  if (!snapshot?.html) {
37
  return snapshot;
38
  }
 
 
 
 
 
 
 
 
 
 
 
39
  const t0 = Date.now();
40
- const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
41
  const allNodes: Node[] = [];
42
  jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
43
  if (options?.withIframe) {
@@ -90,16 +103,16 @@ export class JSDomControl extends AsyncService {
90
  let rootDoc: Document;
91
  if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
92
  rootDoc = allNodes[0] as any;
93
- if (rootDoc.body.textContent) {
94
- textChunks.push(rootDoc.body.textContent);
95
  }
96
  } else {
97
- rootDoc = new JSDOM('', { url: snapshot.href, virtualConsole }).window.document;
98
  for (const n of allNodes) {
99
  rootDoc.body.appendChild(n);
100
  rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
101
- if (n.textContent) {
102
- textChunks.push(n.textContent);
103
  }
104
  }
105
  }
@@ -111,11 +124,6 @@ export class JSDomControl extends AsyncService {
111
  this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
112
  }
113
 
114
- // No innerText in jsdom
115
- // https://github.com/jsdom/jsdom/issues/1245
116
- const textContent = textChunks.join('\n\n');
117
- const cleanedText = textContent?.split('\n').map((x: any) => x.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n');
118
-
119
  const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
120
  .map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
121
  .flat()
@@ -135,7 +143,7 @@ export class JSDomControl extends AsyncService {
135
  title: snapshot.title || jsdom.window.document.title,
136
  parsed,
137
  html: rootDoc.documentElement.outerHTML,
138
- text: cleanedText,
139
  imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
140
  } as PageSnapshot;
141
 
@@ -147,11 +155,13 @@ export class JSDomControl extends AsyncService {
147
  return r;
148
  }
149
 
 
150
  inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
151
  const t0 = Date.now();
152
  const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
153
  try {
154
- const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
 
155
  jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
156
  const links = Array.from(jsdom.window.document.querySelectorAll('a[href]'))
157
  .map((x: any) => [x.getAttribute('href'), x.textContent.replace(/\s+/g, ' ').trim()])
@@ -207,9 +217,8 @@ export class JSDomControl extends AsyncService {
207
 
208
  return extendedSnapshot;
209
  }
210
-
211
  snippetToElement(snippet?: string, url?: string) {
212
- const parsed = new JSDOM(snippet || '', { url, virtualConsole });
213
 
214
  return parsed.window.document.documentElement;
215
  }
 
2
  import { AsyncService, marshalErrorLike } from 'civkit';
3
  import { Logger } from '../shared/services/logger';
4
  import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
 
5
  import { Readability } from '@mozilla/readability';
6
  import TurndownService from 'turndown';
7
+ import { Threaded } from '../shared/services/threaded';
8
 
9
+ const pLinkedom = import('linkedom');
 
10
 
11
  @singleton()
12
  export class JSDomControl extends AsyncService {
13
 
14
  logger = this.globalLogger.child({ service: this.constructor.name });
15
 
16
+ linkedom!: Awaited<typeof pLinkedom>;
17
+
18
  constructor(
19
  protected globalLogger: Logger,
20
  ) {
 
23
 
24
  override async init() {
25
  await this.dependencyReady();
26
+ this.linkedom = await pLinkedom;
27
  this.emit('ready');
28
  }
29
 
30
/**
 * Cheap front door for snapshot narrowing.
 *
 * The snapshot is handed back untouched when it is already parsed and no
 * narrowing option (`targetSelector`, `removeSelector`, `withIframe`) is set,
 * or when there is no HTML to work on. Everything else is delegated to
 * `actualNarrowSnapshot`.
 *
 * @param snapshot - Snapshot to narrow; may be undefined.
 * @param options - Optional selectors / iframe flag that request narrowing.
 * @returns The original snapshot, or the result of `actualNarrowSnapshot`.
 */
async narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
    targetSelector?: string | string[];
    removeSelector?: string | string[];
    withIframe?: boolean;
}) {
    const narrowingRequested = Boolean(
        options?.targetSelector || options?.removeSelector || options?.withIframe
    );

    if ((snapshot?.parsed && !narrowingRequested) || !snapshot?.html) {
        return snapshot;
    }

    return this.actualNarrowSnapshot(snapshot, options);
}
44
+
45
+ @Threaded()
46
+ async actualNarrowSnapshot(snapshot: PageSnapshot, options?: {
47
+ targetSelector?: string | string[];
48
+ removeSelector?: string | string[];
49
+ withIframe?: boolean;
50
+ }): Promise<PageSnapshot | undefined> {
51
+
52
  const t0 = Date.now();
53
+ const jsdom = this.linkedom.parseHTML(snapshot.html);
54
  const allNodes: Node[] = [];
55
  jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
56
  if (options?.withIframe) {
 
103
  let rootDoc: Document;
104
  if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
105
  rootDoc = allNodes[0] as any;
106
+ if (rootDoc.body.innerText) {
107
+ textChunks.push(rootDoc.body.innerText);
108
  }
109
  } else {
110
+ rootDoc = this.linkedom.parseHTML('<html><body></body></html>').window.document;
111
  for (const n of allNodes) {
112
  rootDoc.body.appendChild(n);
113
  rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
114
+ if ((n as HTMLElement).innerText) {
115
+ textChunks.push((n as HTMLElement).innerText);
116
  }
117
  }
118
  }
 
124
  this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
125
  }
126
 
 
 
 
 
 
127
  const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
128
  .map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
129
  .flat()
 
143
  title: snapshot.title || jsdom.window.document.title,
144
  parsed,
145
  html: rootDoc.documentElement.outerHTML,
146
+ text: textChunks.join('\n'),
147
  imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
148
  } as PageSnapshot;
149
 
 
155
  return r;
156
  }
157
 
158
+ @Threaded()
159
  inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
160
  const t0 = Date.now();
161
  const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
162
  try {
163
+ const jsdom = this.linkedom.parseHTML(snapshot.html);
164
+
165
  jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
166
  const links = Array.from(jsdom.window.document.querySelectorAll('a[href]'))
167
  .map((x: any) => [x.getAttribute('href'), x.textContent.replace(/\s+/g, ' ').trim()])
 
217
 
218
  return extendedSnapshot;
219
  }
 
220
  snippetToElement(snippet?: string, url?: string) {
221
+ const parsed = this.linkedom.parseHTML(snippet || '<html><body></body></html>');
222
 
223
  return parsed.window.document.documentElement;
224
  }
backend/functions/src/services/puppeteer.ts CHANGED
@@ -1,7 +1,7 @@
1
  import os from 'os';
2
  import fs from 'fs';
3
  import { container, singleton } from 'tsyringe';
4
- import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency, Deferred, perNextTick } from 'civkit';
5
  import { Logger } from '../shared/services/logger';
6
 
7
  import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
@@ -207,7 +207,6 @@ export class PuppeteerControl extends AsyncService {
207
  browser!: Browser;
208
  logger = this.globalLogger.child({ service: this.constructor.name });
209
 
210
- private __healthCheckInterval?: NodeJS.Timeout;
211
  private __reqCapInterval?: NodeJS.Timeout;
212
 
213
  __loadedPage: Page[] = [];
@@ -217,7 +216,7 @@ export class PuppeteerControl extends AsyncService {
217
  livePages = new Set<Page>();
218
  lastPageCratedAt: number = 0;
219
 
220
- rpsCap: number = 300;
221
  lastReqSentAt: number = 0;
222
  requestDeferredQueue: Deferred<boolean>[] = [];
223
 
@@ -235,15 +234,7 @@ export class PuppeteerControl extends AsyncService {
235
  });
236
  }
237
 
238
- briefPages() {
239
- this.logger.info(`Status: ${this.livePages.size} pages alive: ${Array.from(this.livePages).map((x) => this.snMap.get(x)).sort().join(', ')}; ${this.__loadedPage.length} idle pages: ${this.__loadedPage.map((x) => this.snMap.get(x)).sort().join(', ')}`);
240
- }
241
-
242
  override async init() {
243
- if (this.__healthCheckInterval) {
244
- clearInterval(this.__healthCheckInterval);
245
- this.__healthCheckInterval = undefined;
246
- }
247
  if (this.__reqCapInterval) {
248
  clearInterval(this.__reqCapInterval);
249
  this.__reqCapInterval = undefined;
@@ -276,40 +267,9 @@ export class PuppeteerControl extends AsyncService {
276
 
277
  this.emit('ready');
278
 
279
- this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000).unref();
280
  this.newPage().then((r) => this.__loadedPage.push(r));
281
  }
282
 
283
- @maxConcurrency(1)
284
- async healthCheck() {
285
- if (Date.now() - this.lastPageCratedAt <= 10_000) {
286
- this.briefPages();
287
- return;
288
- }
289
- const healthyPage = await this.newPage().catch((err) => {
290
- this.logger.warn(`Health check failed`, { err: marshalErrorLike(err) });
291
- return null;
292
- });
293
-
294
- if (healthyPage) {
295
- this.__loadedPage.push(healthyPage);
296
-
297
- if (this.__loadedPage.length > 3) {
298
- this.ditchPage(this.__loadedPage.shift()!);
299
- }
300
-
301
- this.briefPages();
302
-
303
- return;
304
- }
305
-
306
- this.logger.warn(`Trying to clean up...`);
307
- this.browser.process()?.kill('SIGKILL');
308
- Reflect.deleteProperty(this, 'browser');
309
- this.emit('crippled');
310
- this.logger.warn(`Browser killed`);
311
- }
312
-
313
  @perNextTick()
314
  reqCapRoutine() {
315
  const now = Date.now();
@@ -620,7 +580,7 @@ document.addEventListener('load', handlePageLoad);
620
  try {
621
  const pSubFrameSnapshots = this.snapshotChildFrames(page);
622
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
623
- screenshot = await page.screenshot();
624
  if (snapshot) {
625
  snapshot.childFrames = await pSubFrameSnapshots;
626
  }
@@ -643,8 +603,8 @@ document.addEventListener('load', handlePageLoad);
643
  if (salvaged) {
644
  const pSubFrameSnapshots = this.snapshotChildFrames(page);
645
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
646
- screenshot = await page.screenshot();
647
- pageshot = await page.screenshot({ fullPage: true });
648
  if (snapshot) {
649
  snapshot.childFrames = await pSubFrameSnapshots;
650
  }
@@ -678,8 +638,8 @@ document.addEventListener('load', handlePageLoad);
678
  .then(async () => {
679
  const pSubFrameSnapshots = this.snapshotChildFrames(page);
680
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
681
- screenshot = await page.screenshot();
682
- pageshot = await page.screenshot({ fullPage: true });
683
  if (snapshot) {
684
  snapshot.childFrames = await pSubFrameSnapshots;
685
  }
@@ -716,8 +676,8 @@ document.addEventListener('load', handlePageLoad);
716
  break;
717
  }
718
  if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
719
- screenshot = await page.screenshot();
720
- pageshot = await page.screenshot({ fullPage: true });
721
  lastHTML = snapshot.html;
722
  }
723
  if (snapshot || screenshot) {
 
1
  import os from 'os';
2
  import fs from 'fs';
3
  import { container, singleton } from 'tsyringe';
4
+ import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick } from 'civkit';
5
  import { Logger } from '../shared/services/logger';
6
 
7
  import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
 
207
  browser!: Browser;
208
  logger = this.globalLogger.child({ service: this.constructor.name });
209
 
 
210
  private __reqCapInterval?: NodeJS.Timeout;
211
 
212
  __loadedPage: Page[] = [];
 
216
  livePages = new Set<Page>();
217
  lastPageCratedAt: number = 0;
218
 
219
+ rpsCap: number = 500;
220
  lastReqSentAt: number = 0;
221
  requestDeferredQueue: Deferred<boolean>[] = [];
222
 
 
234
  });
235
  }
236
 
 
 
 
 
237
  override async init() {
 
 
 
 
238
  if (this.__reqCapInterval) {
239
  clearInterval(this.__reqCapInterval);
240
  this.__reqCapInterval = undefined;
 
267
 
268
  this.emit('ready');
269
 
 
270
  this.newPage().then((r) => this.__loadedPage.push(r));
271
  }
272
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  @perNextTick()
274
  reqCapRoutine() {
275
  const now = Date.now();
 
580
  try {
581
  const pSubFrameSnapshots = this.snapshotChildFrames(page);
582
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
583
+ screenshot = Buffer.from(await page.screenshot());
584
  if (snapshot) {
585
  snapshot.childFrames = await pSubFrameSnapshots;
586
  }
 
603
  if (salvaged) {
604
  const pSubFrameSnapshots = this.snapshotChildFrames(page);
605
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
606
+ screenshot = Buffer.from(await page.screenshot());
607
+ pageshot = Buffer.from(await page.screenshot({ fullPage: true }));
608
  if (snapshot) {
609
  snapshot.childFrames = await pSubFrameSnapshots;
610
  }
 
638
  .then(async () => {
639
  const pSubFrameSnapshots = this.snapshotChildFrames(page);
640
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
641
+ screenshot = Buffer.from(await page.screenshot());
642
+ pageshot = Buffer.from(await page.screenshot({ fullPage: true }));
643
  if (snapshot) {
644
  snapshot.childFrames = await pSubFrameSnapshots;
645
  }
 
676
  break;
677
  }
678
  if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
679
+ screenshot = Buffer.from(await page.screenshot());
680
+ pageshot = Buffer.from(await page.screenshot({ fullPage: true }));
681
  lastHTML = snapshot.html;
682
  }
683
  if (snapshot || screenshot) {
backend/functions/src/services/snapshot-formatter.ts ADDED
@@ -0,0 +1,539 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { randomUUID } from 'crypto';
2
+ import { container, singleton } from 'tsyringe';
3
+ import { AsyncService, HashManager, marshalErrorLike } from 'civkit';
4
+ import TurndownService from 'turndown';
5
+ import { Logger } from '../shared/services/logger';
6
+ import { PageSnapshot } from './puppeteer';
7
+ import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
8
+ import { AsyncContext } from '../shared/services/async-context';
9
+ import { Threaded } from '../shared/services/threaded';
10
+ import { JSDomControl } from './jsdom';
11
+ import { AltTextService } from './alt-text';
12
+ import { PDFExtractor } from './pdf-extract';
13
+ import { cleanAttribute } from '../utils/misc';
14
+ import _ from 'lodash';
15
+
16
+
17
/**
 * Result of formatting a page snapshot. Every data field is optional; which
 * ones are filled in depends on the formatting mode that produced the object.
 */
export interface FormattedPage {
    title?: string;
    description?: string;
    url?: string;
    /** Main body of the page (markdown or fallback text). */
    content?: string;
    publishedTime?: string;
    html?: string;
    text?: string;

    screenshotUrl?: string;
    screenshot?: Buffer;
    pageshotUrl?: string;
    pageshot?: Buffer;

    /** String-to-string map summarising the links of the page. */
    links?: Record<string, string>;
    /** String-to-string map summarising the images of the page. */
    images?: Record<string, string>;

    usage?: {
        total_tokens?: number;
        totalTokens?: number;
        tokens?: number;
    };

    /** Plain-text rendering of the whole result. */
    textRepresentation?: string;

    [Symbol.dispose]: () => void;
}
41
+
42
// Shared HashManager configured with 'md5' / 'hex'. SnapshotFormatter calls md5Hasher.hash(src)
// on inline `data:` image sources to build the short pseudo `blob:` URLs it emits in markdown.
+ export const md5Hasher = new HashManager('md5', 'hex');
43
+
44
/**
 * Formats a crawled {@link PageSnapshot} into the payload of the requested
 * response mode (`screenshot`, `pageshot`, `html`, `text`, `markdown`, or the
 * default annotated markdown).
 */
@singleton()
export class SnapshotFormatter extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    /** Turndown plugins applied on top of the rules installed by {@link getTurndown}. */
    turnDownPlugins = [require('turndown-plugin-gfm').tables];

    constructor(
        protected globalLogger: Logger,
        protected jsdomControl: JSDomControl,
        protected altTextService: AltTextService,
        protected pdfExtractor: PDFExtractor,
        protected threadLocal: AsyncContext,
        protected firebaseObjectStorage: FirebaseStorageBucketControl,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

    /**
     * Formats a snapshot for the given mode.
     *
     * - `screenshot` / `pageshot`: uploads the image (unless a URL is already
     *   present on the snapshot) and returns its signed download URL.
     * - `html` / `text`: returns the raw HTML / text of the snapshot.
     * - anything else: converts the page to markdown with Turndown, preferring
     *   the Readability result when it is at least 30% as long as the full-page
     *   conversion, and falls back to plain text when conversion fails.
     *
     * Note that the snapshot is mutated: `screenshotUrl` / `pageshotUrl` are
     * cached on it, and for PDF pages `title`, `text` and `parsed` are replaced
     * by the extracted PDF data.
     *
     * @param mode - Requested output mode.
     * @param snapshot - Page snapshot to format.
     * @param nominalUrl - URL to report as the source and to resolve relative URLs against.
     * @param urlValidMs - Lifetime of signed screenshot URLs in milliseconds (default 4 hours).
     * @returns The formatted page; `textRepresentation` is attached as a non-enumerable property.
     */
    @Threaded()
    async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & {
        screenshotUrl?: string;
        pageshotUrl?: string;
    }, nominalUrl?: URL, urlValidMs = 3600 * 1000 * 4) {
        const t0 = Date.now();
        const logTiming = () => {
            const dt = Date.now() - t0;
            this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
        };

        if (mode === 'screenshot') {
            if (snapshot.screenshot && !snapshot.screenshotUrl) {
                snapshot.screenshotUrl = await this.uploadInstantShot(snapshot.screenshot, urlValidMs);
            }

            const f = {
                ...this.getGeneralSnapshotMixins(snapshot),
                // html: snapshot.html,
                screenshotUrl: snapshot.screenshotUrl,
            };

            Object.defineProperty(f, 'textRepresentation', { value: `${f.screenshotUrl}\n`, enumerable: false });
            logTiming();

            return f as FormattedPage;
        }
        if (mode === 'pageshot') {
            if (snapshot.pageshot && !snapshot.pageshotUrl) {
                snapshot.pageshotUrl = await this.uploadInstantShot(snapshot.pageshot, urlValidMs);
            }

            const f = {
                ...this.getGeneralSnapshotMixins(snapshot),
                html: snapshot.html,
                pageshotUrl: snapshot.pageshotUrl,
            } as FormattedPage;

            Object.defineProperty(f, 'textRepresentation', { value: `${f.pageshotUrl}\n`, enumerable: false });
            logTiming();

            return f;
        }
        if (mode === 'html') {
            const f = {
                ...this.getGeneralSnapshotMixins(snapshot),
                html: snapshot.html,
            } as FormattedPage;

            Object.defineProperty(f, 'textRepresentation', { value: snapshot.html, enumerable: false });
            logTiming();

            return f;
        }

        // A page that embeds PDFs but has no title is treated as a PDF document.
        let pdfMode = false;
        if (snapshot.pdfs?.length && !snapshot.title) {
            const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0],
                this.threadLocal.get('cacheTolerance')
            );
            if (pdf) {
                pdfMode = true;
                snapshot.title = pdf.meta?.Title;
                snapshot.text = pdf.text || snapshot.text;
                snapshot.parsed = {
                    content: pdf.content,
                    textContent: pdf.content,
                    length: pdf.content?.length,
                    byline: pdf.meta?.Author,
                    lang: pdf.meta?.Language || undefined,
                    title: pdf.meta?.Title,
                    publishedTime: this.pdfExtractor.parsePdfDate(pdf.meta?.ModDate || pdf.meta?.CreationDate)?.toISOString(),
                };
            }
        }

        if (mode === 'text') {
            const f = {
                ...this.getGeneralSnapshotMixins(snapshot),
                text: snapshot.text,
            } as FormattedPage;

            Object.defineProperty(f, 'textRepresentation', { value: snapshot.text, enumerable: false });
            logTiming();

            return f;
        }

        const imgDataUrlToObjectUrl = !this.threadLocal.get('keepImgDataUrl');

        let contentText = '';
        const imageSummary = {} as { [k: string]: string; };
        const imageIdxTrack = new Map<string, number[]>();
        const uid = this.threadLocal.get('uid');
        // Single-pass block: `break` leaves it as soon as contentText is settled.
        do {
            if (pdfMode) {
                contentText = snapshot.parsed?.content || snapshot.text;
                break;
            }

            if (
                snapshot.maxElemDepth! > 256 ||
                (!uid && snapshot.elemCount! > 10_000) ||
                snapshot.elemCount! > 70_000
            ) {
                this.logger.warn('Degrading to text to protect the server', { url: snapshot.href });
                contentText = snapshot.text;
                break;
            }

            const baseUrl = snapshot.rebase || nominalUrl;
            const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
            let toBeTurnedToMd = jsDomElementOfHTML;
            let turnDownService = this.getTurndown({ url: baseUrl, imgDataUrlToObjectUrl });
            if (mode !== 'markdown' && snapshot.parsed?.content) {
                const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
                const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
                const par2 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed);

                // If Readability did its job
                if (par2.length >= 0.3 * par1.length) {
                    turnDownService = this.getTurndown({ noRules: true, url: baseUrl, imgDataUrlToObjectUrl });
                    toBeTurnedToMd = jsDomElementOfParsed;
                }
            }

            for (const plugin of this.turnDownPlugins) {
                turnDownService = turnDownService.use(plugin);
            }

            const urlToAltMap: { [k: string]: string | undefined; } = {};
            if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
                const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
                    const r = await this.altTextService.getAltText(x).catch((err: any) => {
                        this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
                        return undefined;
                    });
                    if (r && x.src) {
                        urlToAltMap[x.src.trim()] = r;
                    }
                });

                await Promise.all(tasks);
            }

            // data: URLs are swapped for a short pseudo blob: URL derived from their MD5.
            const pseudoBlobUrl = (dataUrl: string) => {
                const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(dataUrl)}`);
                mappedUrl.protocol = 'blob:';

                return mappedUrl;
            };

            let imgIdx = 0;
            turnDownService.addRule('img-generated-alt', {
                filter: 'img',
                replacement: (_content, node: any) => {
                    let linkPreferredSrc = (node.getAttribute('src') || '').trim();
                    if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
                        // Lazy-loaded images often keep the real URL in data-src.
                        const dataSrc = (node.getAttribute('data-src') || '').trim();
                        if (dataSrc && !dataSrc.startsWith('data:')) {
                            linkPreferredSrc = dataSrc;
                        }
                    }

                    let src;
                    try {
                        src = new URL(linkPreferredSrc, baseUrl).toString();
                    } catch (_err) {
                        void 0;
                    }
                    const alt = cleanAttribute(node.getAttribute('alt'));
                    if (!src) {
                        return '';
                    }

                    const imgSerial = ++imgIdx;
                    const serials = imageIdxTrack.get(src) ?? [];
                    serials.push(imgSerial);
                    imageIdxTrack.set(src, serials);

                    // A generated alt text, when available, wins over the authored one.
                    const label = urlToAltMap[src] || alt;
                    imageSummary[src] = label || '';

                    const target = (src.startsWith('data:') && imgDataUrlToObjectUrl) ? pseudoBlobUrl(src) : src;

                    return label ? `![Image ${imgSerial}: ${label}](${target})` : `![Image ${imgSerial}](${target})`;
                }
            });

            if (toBeTurnedToMd) {
                try {
                    contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
                } catch (err) {
                    this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
                    const vanillaTurnDownService = this.getTurndown({ url: baseUrl, imgDataUrlToObjectUrl });
                    try {
                        contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
                    } catch (err2) {
                        this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
                    }
                }
            }

            // NOTE(review): the original condition mixed `||` and `&&` without parentheses; the grouping
            // below is how it was evaluated. Confirm whether the "not the full HTML" check was also
            // meant to apply to the empty-content case.
            const looksLikeRawHtml = contentText.startsWith('<') && contentText.endsWith('>');
            if (!contentText || (looksLikeRawHtml && toBeTurnedToMd !== jsDomElementOfHTML)) {
                try {
                    contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
                } catch (err) {
                    this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
                    const vanillaTurnDownService = this.getTurndown({ url: baseUrl, imgDataUrlToObjectUrl });
                    try {
                        contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
                    } catch (err2) {
                        this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
                    }
                }
            }
            // NOTE(review): this uses `||` where the check above uses `&&`, so any result that merely starts
            // with '<' or ends with '>' is replaced by the plain text. Kept as-is; confirm it is intended.
            if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) {
                contentText = snapshot.text;
            }
        } while (false);

        const cleanText = (contentText || '').trim();

        const formatted: FormattedPage = {
            title: (snapshot.parsed?.title || snapshot.title || '').trim(),
            url: nominalUrl?.toString() || snapshot.href?.trim(),
            content: cleanText,
            publishedTime: snapshot.parsed?.publishedTime || undefined,
            [Symbol.dispose]: () => { },
        };

        if (this.threadLocal.get('withImagesSummary')) {
            formatted.images = this.summarizeImages(imageSummary, imageIdxTrack);
        }
        if (this.threadLocal.get('withLinksSummary')) {
            formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {});
        }

        Object.defineProperty(formatted, 'textRepresentation', {
            value: this.composeTextRepresentation(mode, formatted),
            enumerable: false,
        });
        logTiming();

        return formatted as FormattedPage;
    }

    /**
     * Stores an image under a fresh `instant-screenshots/` object id and
     * returns a signed download URL valid for `urlValidMs` milliseconds.
     */
    protected async uploadInstantShot(image: NonNullable<PageSnapshot['screenshot']>, urlValidMs: number) {
        const fid = `instant-screenshots/${randomUUID()}`;
        await this.firebaseObjectStorage.saveFile(fid, image, {
            metadata: {
                contentType: 'image/png',
            }
        });

        return this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + urlValidMs);
    }

    /**
     * Turns a `url -> alt` map into a `label -> url` map. The label is
     * `Image <serials>[: <alt>]`, where the serials come from `imageIdxTrack`
     * (falling back to the 1-based position of the entry).
     */
    protected summarizeImages(imageSummary: { [k: string]: string; }, imageIdxTrack: Map<string, number[]>) {
        const pairs = Object.entries(imageSummary).map(([url, alt], i): [string, string] => {
            const serials = imageIdxTrack.get(url) || [i + 1];

            return [`Image ${serials.join(',')}${alt ? `: ${alt}` : ''}`, url];
        });

        return Object.fromEntries(pairs);
    }

    /**
     * Builds the plain-text rendering of a formatted page. In `markdown` mode
     * this is just the content; otherwise a header (title, source URL, optional
     * published time) precedes it and image / link summaries follow it.
     */
    protected composeTextRepresentation(mode: string, page: FormattedPage): string {
        if (mode === 'markdown') {
            return page.content as string;
        }

        const mixins: string[] = [];
        if (page.publishedTime) {
            mixins.push(`Published Time: ${page.publishedTime}`);
        }

        const suffixMixins: string[] = [];
        if (page.images) {
            const imageSummaryChunks = ['Images:'];
            for (const [k, v] of Object.entries(page.images)) {
                imageSummaryChunks.push(`- ![${k}](${v})`);
            }
            if (imageSummaryChunks.length === 1) {
                imageSummaryChunks.push('This page does not seem to contain any images.');
            }
            suffixMixins.push(imageSummaryChunks.join('\n'));
        }
        if (page.links) {
            const linkSummaryChunks = ['Links/Buttons:'];
            for (const [k, v] of Object.entries(page.links)) {
                linkSummaryChunks.push(`- [${k}](${v})`);
            }
            if (linkSummaryChunks.length === 1) {
                linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
            }
            suffixMixins.push(linkSummaryChunks.join('\n'));
        }

        const head = mixins.length ? `\n${mixins.join('\n\n')}\n` : '';
        const tail = suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : '';

        // Line breaks are spelled out so the layout does not depend on source indentation.
        return `Title: ${page.title}\n\nURL Source: ${page.url}\n${head}\nMarkdown Content:\n${page.content}\n${tail}`;
    }

    /**
     * Builds the optional `images` / `links` summaries shared by the simple
     * output modes, depending on the `withImagesSummary` / `withLinksSummary`
     * context flags. Returns an empty object when neither flag is set.
     */
    getGeneralSnapshotMixins(snapshot: PageSnapshot) {
        const mixin: any = {};
        const wantImages = this.threadLocal.get('withImagesSummary');
        const wantLinks = this.threadLocal.get('withLinksSummary');
        if (!wantImages && !wantLinks) {
            return mixin;
        }

        const inferred = this.jsdomControl.inferSnapshot(snapshot);
        if (wantImages) {
            const imageSummary = {} as { [k: string]: string; };
            const imageIdxTrack = new Map<string, number[]>();

            let imgIdx = 0;
            // `imgs` is treated as optional on snapshots elsewhere, so guard against it being absent.
            for (const img of inferred.imgs ?? []) {
                const serials = imageIdxTrack.get(img.src) ?? [];
                serials.push(++imgIdx);
                imageIdxTrack.set(img.src, serials);
                imageSummary[img.src] = img.alt || '';
            }

            mixin.images = this.summarizeImages(imageSummary, imageIdxTrack);
        }
        if (wantLinks) {
            mixin.links = _.invert(inferred.links || {});
        }

        return mixin;
    }

    /**
     * Creates a Turndown service with this project's custom rules.
     *
     * @param options.noRules - Skip the rules that drop irrelevant tags / SVG and render `<title>` as a heading.
     * @param options.url - Base URL used to absolutize link targets and to build pseudo blob: URLs.
     * @param options.imgDataUrlToObjectUrl - Replace `data:` image sources with short pseudo `blob:` URLs.
     */
    getTurndown(options?: {
        noRules?: boolean | string,
        url?: string | URL;
        imgDataUrlToObjectUrl?: boolean;
    }) {
        const turnDownService = new TurndownService({
            codeBlockStyle: 'fenced',
            preformattedCode: true,
        } as any);
        if (!options?.noRules) {
            turnDownService.addRule('remove-irrelevant', {
                filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'],
                replacement: () => ''
            });
            turnDownService.addRule('truncate-svg', {
                filter: 'svg' as any,
                replacement: () => ''
            });
            turnDownService.addRule('title-as-h1', {
                filter: ['title'],
                replacement: (innerText) => `${innerText}\n===============\n`
            });
        }

        if (options?.imgDataUrlToObjectUrl) {
            turnDownService.addRule('data-url-to-pseudo-object-url', {
                filter: (node) => Boolean(node.tagName === 'IMG' && node.getAttribute('src')?.startsWith('data:')),
                replacement: (_content, node: any) => {
                    const src = (node.getAttribute('src') || '').trim();
                    const alt = cleanAttribute(node.getAttribute('alt')) || '';
                    const digest = md5Hasher.hash(src);

                    if (options.url) {
                        try {
                            const refUrl = new URL(options.url);
                            const mappedUrl = new URL(`blob:${refUrl.origin}/${digest}`);

                            return `![${alt}](${mappedUrl})`;
                        } catch (_err) {
                            // Unparsable base URL: fall back to the origin-less form below.
                            void 0;
                        }
                    }

                    return `![${alt}](blob:${digest})`;
                }
            });
        }

        turnDownService.addRule('improved-paragraph', {
            filter: 'p',
            replacement: (innerText) => {
                const trimmed = innerText.trim();
                if (!trimmed) {
                    return '';
                }

                return `${trimmed.replace(/\n{3,}/g, '\n\n')}\n\n`;
            }
        });
        turnDownService.addRule('improved-inline-link', {
            filter: function (node, turndownOptions) {
                return Boolean(
                    turndownOptions.linkStyle === 'inlined' &&
                    node.nodeName === 'A' &&
                    node.getAttribute('href')
                );
            },

            replacement: function (content, node: any) {
                let title = cleanAttribute(node.getAttribute('title'));
                if (title) title = ' "' + title.replace(/"/g, '\\"') + '"';

                const fixedContent = content.replace(/\s+/g, ' ').trim();
                let fixedHref: string = (node.getAttribute('href') || '').replace(/\s+/g, '').trim();
                if (options?.url) {
                    try {
                        fixedHref = new URL(fixedHref, options.url).toString();
                    } catch (_err) {
                        void 0;
                    }
                }
                // Escape parentheses only AFTER URL resolution: the URL parser treats '\' as '/'
                // in http(s) URLs, so escaping first would corrupt links that contain parentheses.
                fixedHref = fixedHref.replace(/([()])/g, '\\$1');

                return `[${fixedContent}](${fixedHref}${title || ''})`;
            }
        });
        turnDownService.addRule('improved-code', {
            filter: function (node: any) {
                const hasSiblings = node.previousSibling || node.nextSibling;
                const isCodeBlock = node.parentNode.nodeName === 'PRE' && !hasSiblings;

                return node.nodeName === 'CODE' && !isCodeBlock;
            },

            replacement: function (inputContent: any) {
                if (!inputContent) return '';
                const content = inputContent;

                // Pick a backtick run that does not occur inside the content.
                let delimiter = '`';
                const matches: string[] = content.match(/`+/gm) || [];
                while (matches.includes(delimiter)) delimiter = delimiter + '`';
                if (content.includes('\n')) {
                    delimiter = '```';
                }

                const extraSpace = delimiter === '```' ? '\n' : /^`|^ .*?[^ ].* $|`$/.test(content) ? ' ' : '';

                return delimiter + extraSpace + content + (delimiter === '```' && !content.endsWith(extraSpace) ? extraSpace : '') + delimiter;
            }
        });

        return turnDownService;
    }


}
536
+
537
// Resolve the SnapshotFormatter instance from the tsyringe container when this module is loaded,
// and expose that instance as the module's default export.
+ const snapshotFormatter = container.resolve(SnapshotFormatter);
538
+
539
+ export default snapshotFormatter;