nomagick committed on
Commit
7ee2c32
·
unverified ·
1 Parent(s): e36d3b0

refactor: reorganize features (#37)

Browse files

* wip

* fix

* wip

* cleanup

* fix

* fix

* cache: may rescue using stale cache

* fix: target 384mb ram per page

* fix: log about pool size

* fix

* clean

* fix: cache and snapshot reporting

backend/firebase.json CHANGED
@@ -13,7 +13,8 @@
13
  ".git",
14
  "*.log",
15
  "*.local",
16
- ".secret.*"
 
17
  ],
18
  "predeploy": [
19
  "npm --prefix \"$RESOURCE_DIR\" run build:clean",
 
13
  ".git",
14
  "*.log",
15
  "*.local",
16
+ ".secret.*",
17
+ ".firebase-emu"
18
  ],
19
  "predeploy": [
20
  "npm --prefix \"$RESOURCE_DIR\" run build:clean",
backend/functions/package-lock.json CHANGED
@@ -29,7 +29,10 @@
29
  "puppeteer": "^22.6.3",
30
  "puppeteer-extra": "^3.3.6",
31
  "puppeteer-extra-plugin-block-resources": "^2.4.3",
 
32
  "puppeteer-extra-plugin-stealth": "^2.11.2",
 
 
33
  "stripe": "^11.11.0",
34
  "tiktoken": "^1.0.10",
35
  "turndown": "^7.1.3",
@@ -42,6 +45,7 @@
42
  "@types/cors": "^2.8.17",
43
  "@types/generic-pool": "^3.8.1",
44
  "@types/node": "^18",
 
45
  "@typescript-eslint/eslint-plugin": "^5.12.0",
46
  "@typescript-eslint/parser": "^5.12.0",
47
  "eslint": "^8.9.0",
@@ -1986,6 +1990,17 @@
1986
  "dev": true,
1987
  "peer": true
1988
  },
 
 
 
 
 
 
 
 
 
 
 
1989
  "node_modules/@sinonjs/commons": {
1990
  "version": "3.0.1",
1991
  "resolved": "https://registry.npmjs.org/@sinonjs/commons/-/commons-3.0.1.tgz",
@@ -2006,6 +2021,17 @@
2006
  "@sinonjs/commons": "^3.0.0"
2007
  }
2008
  },
 
 
 
 
 
 
 
 
 
 
 
2009
  "node_modules/@tootallnate/once": {
2010
  "version": "2.0.0",
2011
  "resolved": "https://registry.npmjs.org/@tootallnate/once/-/once-2.0.0.tgz",
@@ -2091,6 +2117,17 @@
2091
  "@types/node": "*"
2092
  }
2093
  },
 
 
 
 
 
 
 
 
 
 
 
2094
  "node_modules/@types/caseless": {
2095
  "version": "0.12.5",
2096
  "resolved": "https://registry.npmjs.org/@types/caseless/-/caseless-0.12.5.tgz",
@@ -2161,6 +2198,11 @@
2161
  "@types/node": "*"
2162
  }
2163
  },
 
 
 
 
 
2164
  "node_modules/@types/http-errors": {
2165
  "version": "2.0.4",
2166
  "resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.4.tgz",
@@ -2213,6 +2255,14 @@
2213
  "@types/node": "*"
2214
  }
2215
  },
 
 
 
 
 
 
 
 
2216
  "node_modules/@types/lodash": {
2217
  "version": "4.17.0",
2218
  "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.0.tgz",
@@ -2294,6 +2344,14 @@
2294
  "node": ">= 0.12"
2295
  }
2296
  },
 
 
 
 
 
 
 
 
2297
  "node_modules/@types/semver": {
2298
  "version": "7.5.8",
2299
  "resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.8.tgz",
@@ -2319,6 +2377,15 @@
2319
  "@types/send": "*"
2320
  }
2321
  },
 
 
 
 
 
 
 
 
 
2322
  "node_modules/@types/stack-utils": {
2323
  "version": "2.0.3",
2324
  "resolved": "https://registry.npmjs.org/@types/stack-utils/-/stack-utils-2.0.3.tgz",
@@ -3447,6 +3514,45 @@
3447
  "node": ">= 6.0.0"
3448
  }
3449
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3450
  "node_modules/call-bind": {
3451
  "version": "1.0.7",
3452
  "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.7.tgz",
@@ -3642,6 +3748,25 @@
3642
  "node": ">=0.10.0"
3643
  }
3644
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3645
  "node_modules/co": {
3646
  "version": "4.6.0",
3647
  "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
@@ -4003,6 +4128,17 @@
4003
  "url": "https://github.com/sponsors/sindresorhus"
4004
  }
4005
  },
 
 
 
 
 
 
 
 
 
 
 
4006
  "node_modules/dedent": {
4007
  "version": "1.5.3",
4008
  "resolved": "https://registry.npmjs.org/dedent/-/dedent-1.5.3.tgz",
@@ -4046,6 +4182,14 @@
4046
  "node": ">=0.10.0"
4047
  }
4048
  },
 
 
 
 
 
 
 
 
4049
  "node_modules/define-data-property": {
4050
  "version": "1.1.4",
4051
  "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz",
@@ -5837,6 +5981,30 @@
5837
  "url": "https://github.com/sponsors/ljharb"
5838
  }
5839
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5840
  "node_modules/graceful-fs": {
5841
  "version": "4.2.11",
5842
  "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz",
@@ -6024,6 +6192,11 @@
6024
  "node": ">= 0.6"
6025
  }
6026
  },
 
 
 
 
 
6027
  "node_modules/http-errors": {
6028
  "version": "2.0.0",
6029
  "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz",
@@ -6067,6 +6240,18 @@
6067
  "node": ">= 14"
6068
  }
6069
  },
 
 
 
 
 
 
 
 
 
 
 
 
6070
  "node_modules/https-proxy-agent": {
6071
  "version": "5.0.1",
6072
  "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz",
@@ -7391,8 +7576,7 @@
7391
  "node_modules/json-buffer": {
7392
  "version": "3.0.1",
7393
  "resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.1.tgz",
7394
- "integrity": "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ==",
7395
- "dev": true
7396
  },
7397
  "node_modules/json-parse-even-better-errors": {
7398
  "version": "2.3.1",
@@ -7550,7 +7734,6 @@
7550
  "version": "4.5.4",
7551
  "resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz",
7552
  "integrity": "sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw==",
7553
- "dev": true,
7554
  "dependencies": {
7555
  "json-buffer": "3.0.1"
7556
  }
@@ -7869,6 +8052,14 @@
7869
  "integrity": "sha512-BFRuQUqc7x2NWxfJBCyUrN8iYUYznzL9JROmRz1gZ6KlOIgmoD+njPVbb+VNn2nGMKggMsK79iUNErillsrx7w==",
7870
  "optional": true
7871
  },
 
 
 
 
 
 
 
 
7872
  "node_modules/lru-cache": {
7873
  "version": "5.1.1",
7874
  "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
@@ -8054,17 +8245,6 @@
8054
  "node": ">=6"
8055
  }
8056
  },
8057
- "node_modules/mimic-response": {
8058
- "version": "3.1.0",
8059
- "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz",
8060
- "integrity": "sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==",
8061
- "engines": {
8062
- "node": ">=10"
8063
- },
8064
- "funding": {
8065
- "url": "https://github.com/sponsors/sindresorhus"
8066
- }
8067
- },
8068
  "node_modules/minimatch": {
8069
  "version": "3.1.2",
8070
  "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz",
@@ -8423,6 +8603,17 @@
8423
  "node": ">=0.10.0"
8424
  }
8425
  },
 
 
 
 
 
 
 
 
 
 
 
8426
  "node_modules/npm-run-path": {
8427
  "version": "4.0.1",
8428
  "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-4.0.1.tgz",
@@ -8632,6 +8823,14 @@
8632
  "node": ">= 0.8.0"
8633
  }
8634
  },
 
 
 
 
 
 
 
 
8635
  "node_modules/p-limit": {
8636
  "version": "3.1.0",
8637
  "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz",
@@ -9243,6 +9442,11 @@
9243
  "resolved": "https://registry.npmjs.org/pseudomap/-/pseudomap-1.0.2.tgz",
9244
  "integrity": "sha512-b/YwNhb8lk1Zz2+bXXpS/LK9OisiZZ1SNsSLxN1x2OXVEhW2Ckr/7mWE5vrC1ZTiJlD9g19jWszTmJsB+oEpFQ=="
9245
  },
 
 
 
 
 
9246
  "node_modules/pump": {
9247
  "version": "3.0.0",
9248
  "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz",
@@ -9367,6 +9571,58 @@
9367
  }
9368
  }
9369
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9370
  "node_modules/puppeteer-extra-plugin-stealth": {
9371
  "version": "2.11.2",
9372
  "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.11.2.tgz",
@@ -9457,6 +9713,44 @@
9457
  }
9458
  }
9459
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9460
  "node_modules/pure-rand": {
9461
  "version": "6.1.0",
9462
  "resolved": "https://registry.npmjs.org/pure-rand/-/pure-rand-6.1.0.tgz",
@@ -9523,6 +9817,11 @@
9523
  "url": "https://github.com/sponsors/sindresorhus"
9524
  }
9525
  },
 
 
 
 
 
9526
  "node_modules/queue-microtask": {
9527
  "version": "1.2.3",
9528
  "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz",
@@ -9554,6 +9853,17 @@
9554
  "integrity": "sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg==",
9555
  "optional": true
9556
  },
 
 
 
 
 
 
 
 
 
 
 
9557
  "node_modules/range-parser": {
9558
  "version": "1.2.1",
9559
  "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz",
@@ -9709,6 +10019,11 @@
9709
  "node": ">=0.10.0"
9710
  }
9711
  },
 
 
 
 
 
9712
  "node_modules/resolve": {
9713
  "version": "1.22.8",
9714
  "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.8.tgz",
@@ -9726,6 +10041,11 @@
9726
  "url": "https://github.com/sponsors/ljharb"
9727
  }
9728
  },
 
 
 
 
 
9729
  "node_modules/resolve-cwd": {
9730
  "version": "3.0.0",
9731
  "resolved": "https://registry.npmjs.org/resolve-cwd/-/resolve-cwd-3.0.0.tgz",
@@ -9767,6 +10087,17 @@
9767
  "node": ">=10"
9768
  }
9769
  },
 
 
 
 
 
 
 
 
 
 
 
9770
  "node_modules/retry": {
9771
  "version": "0.13.1",
9772
  "resolved": "https://registry.npmjs.org/retry/-/retry-0.13.1.tgz",
@@ -10029,6 +10360,11 @@
10029
  "resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz",
10030
  "integrity": "sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw=="
10031
  },
 
 
 
 
 
10032
  "node_modules/set-function-length": {
10033
  "version": "1.2.2",
10034
  "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz",
@@ -10767,6 +11103,28 @@
10767
  "node": ">=0.6"
10768
  }
10769
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10770
  "node_modules/tr46": {
10771
  "version": "3.0.0",
10772
  "resolved": "https://registry.npmjs.org/tr46/-/tr46-3.0.0.tgz",
@@ -11127,6 +11485,15 @@
11127
  "punycode": "^2.1.0"
11128
  }
11129
  },
 
 
 
 
 
 
 
 
 
11130
  "node_modules/urlpattern-polyfill": {
11131
  "version": "10.0.0",
11132
  "resolved": "https://registry.npmjs.org/urlpattern-polyfill/-/urlpattern-polyfill-10.0.0.tgz",
 
29
  "puppeteer": "^22.6.3",
30
  "puppeteer-extra": "^3.3.6",
31
  "puppeteer-extra-plugin-block-resources": "^2.4.3",
32
+ "puppeteer-extra-plugin-page-proxy": "^2.0.0",
33
  "puppeteer-extra-plugin-stealth": "^2.11.2",
34
+ "puppeteer-page-proxy": "^1.3.0",
35
+ "set-cookie-parser": "^2.6.0",
36
  "stripe": "^11.11.0",
37
  "tiktoken": "^1.0.10",
38
  "turndown": "^7.1.3",
 
45
  "@types/cors": "^2.8.17",
46
  "@types/generic-pool": "^3.8.1",
47
  "@types/node": "^18",
48
+ "@types/set-cookie-parser": "^2.4.7",
49
  "@typescript-eslint/eslint-plugin": "^5.12.0",
50
  "@typescript-eslint/parser": "^5.12.0",
51
  "eslint": "^8.9.0",
 
1990
  "dev": true,
1991
  "peer": true
1992
  },
1993
+ "node_modules/@sindresorhus/is": {
1994
+ "version": "4.6.0",
1995
+ "resolved": "https://registry.npmjs.org/@sindresorhus/is/-/is-4.6.0.tgz",
1996
+ "integrity": "sha512-t09vSN3MdfsyCHoFcTRCH/iUtG7OJ0CsjzB8cjAmKc/va/kIgeDI/TxsigdncE/4be734m0cvIYwNaV4i2XqAw==",
1997
+ "engines": {
1998
+ "node": ">=10"
1999
+ },
2000
+ "funding": {
2001
+ "url": "https://github.com/sindresorhus/is?sponsor=1"
2002
+ }
2003
+ },
2004
  "node_modules/@sinonjs/commons": {
2005
  "version": "3.0.1",
2006
  "resolved": "https://registry.npmjs.org/@sinonjs/commons/-/commons-3.0.1.tgz",
 
2021
  "@sinonjs/commons": "^3.0.0"
2022
  }
2023
  },
2024
+ "node_modules/@szmarczak/http-timer": {
2025
+ "version": "4.0.6",
2026
+ "resolved": "https://registry.npmjs.org/@szmarczak/http-timer/-/http-timer-4.0.6.tgz",
2027
+ "integrity": "sha512-4BAffykYOgO+5nzBWYwE3W90sBgLJoUPRWWcL8wlyiM8IB8ipJz3UMJ9KXQd1RKQXpKp8Tutn80HZtWsu2u76w==",
2028
+ "dependencies": {
2029
+ "defer-to-connect": "^2.0.0"
2030
+ },
2031
+ "engines": {
2032
+ "node": ">=10"
2033
+ }
2034
+ },
2035
  "node_modules/@tootallnate/once": {
2036
  "version": "2.0.0",
2037
  "resolved": "https://registry.npmjs.org/@tootallnate/once/-/once-2.0.0.tgz",
 
2117
  "@types/node": "*"
2118
  }
2119
  },
2120
+ "node_modules/@types/cacheable-request": {
2121
+ "version": "6.0.3",
2122
+ "resolved": "https://registry.npmjs.org/@types/cacheable-request/-/cacheable-request-6.0.3.tgz",
2123
+ "integrity": "sha512-IQ3EbTzGxIigb1I3qPZc1rWJnH0BmSKv5QYTalEwweFvyBDLSAe24zP0le/hyi7ecGfZVlIVAg4BZqb8WBwKqw==",
2124
+ "dependencies": {
2125
+ "@types/http-cache-semantics": "*",
2126
+ "@types/keyv": "^3.1.4",
2127
+ "@types/node": "*",
2128
+ "@types/responselike": "^1.0.0"
2129
+ }
2130
+ },
2131
  "node_modules/@types/caseless": {
2132
  "version": "0.12.5",
2133
  "resolved": "https://registry.npmjs.org/@types/caseless/-/caseless-0.12.5.tgz",
 
2198
  "@types/node": "*"
2199
  }
2200
  },
2201
+ "node_modules/@types/http-cache-semantics": {
2202
+ "version": "4.0.4",
2203
+ "resolved": "https://registry.npmjs.org/@types/http-cache-semantics/-/http-cache-semantics-4.0.4.tgz",
2204
+ "integrity": "sha512-1m0bIFVc7eJWyve9S0RnuRgcQqF/Xd5QsUZAZeQFr1Q3/p9JWoQQEqmVy+DPTNpGXwhgIetAoYF8JSc33q29QA=="
2205
+ },
2206
  "node_modules/@types/http-errors": {
2207
  "version": "2.0.4",
2208
  "resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.4.tgz",
 
2255
  "@types/node": "*"
2256
  }
2257
  },
2258
+ "node_modules/@types/keyv": {
2259
+ "version": "3.1.4",
2260
+ "resolved": "https://registry.npmjs.org/@types/keyv/-/keyv-3.1.4.tgz",
2261
+ "integrity": "sha512-BQ5aZNSCpj7D6K2ksrRCTmKRLEpnPvWDiLPfoGyhZ++8YtiK9d/3DBKPJgry359X/P1PfruyYwvnvwFjuEiEIg==",
2262
+ "dependencies": {
2263
+ "@types/node": "*"
2264
+ }
2265
+ },
2266
  "node_modules/@types/lodash": {
2267
  "version": "4.17.0",
2268
  "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.0.tgz",
 
2344
  "node": ">= 0.12"
2345
  }
2346
  },
2347
+ "node_modules/@types/responselike": {
2348
+ "version": "1.0.3",
2349
+ "resolved": "https://registry.npmjs.org/@types/responselike/-/responselike-1.0.3.tgz",
2350
+ "integrity": "sha512-H/+L+UkTV33uf49PH5pCAUBVPNj2nDBXTN+qS1dOwyyg24l3CcicicCA7ca+HMvJBZcFgl5r8e+RR6elsb4Lyw==",
2351
+ "dependencies": {
2352
+ "@types/node": "*"
2353
+ }
2354
+ },
2355
  "node_modules/@types/semver": {
2356
  "version": "7.5.8",
2357
  "resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.8.tgz",
 
2377
  "@types/send": "*"
2378
  }
2379
  },
2380
+ "node_modules/@types/set-cookie-parser": {
2381
+ "version": "2.4.7",
2382
+ "resolved": "https://registry.npmjs.org/@types/set-cookie-parser/-/set-cookie-parser-2.4.7.tgz",
2383
+ "integrity": "sha512-+ge/loa0oTozxip6zmhRIk8Z/boU51wl9Q6QdLZcokIGMzY5lFXYy/x7Htj2HTC6/KZP1hUbZ1ekx8DYXICvWg==",
2384
+ "dev": true,
2385
+ "dependencies": {
2386
+ "@types/node": "*"
2387
+ }
2388
+ },
2389
  "node_modules/@types/stack-utils": {
2390
  "version": "2.0.3",
2391
  "resolved": "https://registry.npmjs.org/@types/stack-utils/-/stack-utils-2.0.3.tgz",
 
3514
  "node": ">= 6.0.0"
3515
  }
3516
  },
3517
+ "node_modules/cacheable-lookup": {
3518
+ "version": "5.0.4",
3519
+ "resolved": "https://registry.npmjs.org/cacheable-lookup/-/cacheable-lookup-5.0.4.tgz",
3520
+ "integrity": "sha512-2/kNscPhpcxrOigMZzbiWF7dz8ilhb/nIHU3EyZiXWXpeq/au8qJ8VhdftMkty3n7Gj6HIGalQG8oiBNB3AJgA==",
3521
+ "engines": {
3522
+ "node": ">=10.6.0"
3523
+ }
3524
+ },
3525
+ "node_modules/cacheable-request": {
3526
+ "version": "7.0.4",
3527
+ "resolved": "https://registry.npmjs.org/cacheable-request/-/cacheable-request-7.0.4.tgz",
3528
+ "integrity": "sha512-v+p6ongsrp0yTGbJXjgxPow2+DL93DASP4kXCDKb8/bwRtt9OEF3whggkkDkGNzgcWy2XaF4a8nZglC7uElscg==",
3529
+ "dependencies": {
3530
+ "clone-response": "^1.0.2",
3531
+ "get-stream": "^5.1.0",
3532
+ "http-cache-semantics": "^4.0.0",
3533
+ "keyv": "^4.0.0",
3534
+ "lowercase-keys": "^2.0.0",
3535
+ "normalize-url": "^6.0.1",
3536
+ "responselike": "^2.0.0"
3537
+ },
3538
+ "engines": {
3539
+ "node": ">=8"
3540
+ }
3541
+ },
3542
+ "node_modules/cacheable-request/node_modules/get-stream": {
3543
+ "version": "5.2.0",
3544
+ "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-5.2.0.tgz",
3545
+ "integrity": "sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==",
3546
+ "dependencies": {
3547
+ "pump": "^3.0.0"
3548
+ },
3549
+ "engines": {
3550
+ "node": ">=8"
3551
+ },
3552
+ "funding": {
3553
+ "url": "https://github.com/sponsors/sindresorhus"
3554
+ }
3555
+ },
3556
  "node_modules/call-bind": {
3557
  "version": "1.0.7",
3558
  "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.7.tgz",
 
3748
  "node": ">=0.10.0"
3749
  }
3750
  },
3751
+ "node_modules/clone-response": {
3752
+ "version": "1.0.3",
3753
+ "resolved": "https://registry.npmjs.org/clone-response/-/clone-response-1.0.3.tgz",
3754
+ "integrity": "sha512-ROoL94jJH2dUVML2Y/5PEDNaSHgeOdSDicUyS7izcF63G6sTc/FTjLub4b8Il9S8S0beOfYt0TaA5qvFK+w0wA==",
3755
+ "dependencies": {
3756
+ "mimic-response": "^1.0.0"
3757
+ },
3758
+ "funding": {
3759
+ "url": "https://github.com/sponsors/sindresorhus"
3760
+ }
3761
+ },
3762
+ "node_modules/clone-response/node_modules/mimic-response": {
3763
+ "version": "1.0.1",
3764
+ "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-1.0.1.tgz",
3765
+ "integrity": "sha512-j5EctnkH7amfV/q5Hgmoal1g2QHFJRraOtmx0JpIqkxhBhI/lJSl1nMpQ45hVarwNETOoWEimndZ4QK0RHxuxQ==",
3766
+ "engines": {
3767
+ "node": ">=4"
3768
+ }
3769
+ },
3770
  "node_modules/co": {
3771
  "version": "4.6.0",
3772
  "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
 
4128
  "url": "https://github.com/sponsors/sindresorhus"
4129
  }
4130
  },
4131
+ "node_modules/decompress-response/node_modules/mimic-response": {
4132
+ "version": "3.1.0",
4133
+ "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz",
4134
+ "integrity": "sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==",
4135
+ "engines": {
4136
+ "node": ">=10"
4137
+ },
4138
+ "funding": {
4139
+ "url": "https://github.com/sponsors/sindresorhus"
4140
+ }
4141
+ },
4142
  "node_modules/dedent": {
4143
  "version": "1.5.3",
4144
  "resolved": "https://registry.npmjs.org/dedent/-/dedent-1.5.3.tgz",
 
4182
  "node": ">=0.10.0"
4183
  }
4184
  },
4185
+ "node_modules/defer-to-connect": {
4186
+ "version": "2.0.1",
4187
+ "resolved": "https://registry.npmjs.org/defer-to-connect/-/defer-to-connect-2.0.1.tgz",
4188
+ "integrity": "sha512-4tvttepXG1VaYGrRibk5EwJd1t4udunSOVMdLSAL6mId1ix438oPwPZMALY41FCijukO1L0twNcGsdzS7dHgDg==",
4189
+ "engines": {
4190
+ "node": ">=10"
4191
+ }
4192
+ },
4193
  "node_modules/define-data-property": {
4194
  "version": "1.1.4",
4195
  "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz",
 
5981
  "url": "https://github.com/sponsors/ljharb"
5982
  }
5983
  },
5984
+ "node_modules/got": {
5985
+ "version": "11.8.6",
5986
+ "resolved": "https://registry.npmjs.org/got/-/got-11.8.6.tgz",
5987
+ "integrity": "sha512-6tfZ91bOr7bOXnK7PRDCGBLa1H4U080YHNaAQ2KsMGlLEzRbk44nsZF2E1IeRc3vtJHPVbKCYgdFbaGO2ljd8g==",
5988
+ "dependencies": {
5989
+ "@sindresorhus/is": "^4.0.0",
5990
+ "@szmarczak/http-timer": "^4.0.5",
5991
+ "@types/cacheable-request": "^6.0.1",
5992
+ "@types/responselike": "^1.0.0",
5993
+ "cacheable-lookup": "^5.0.3",
5994
+ "cacheable-request": "^7.0.2",
5995
+ "decompress-response": "^6.0.0",
5996
+ "http2-wrapper": "^1.0.0-beta.5.2",
5997
+ "lowercase-keys": "^2.0.0",
5998
+ "p-cancelable": "^2.0.0",
5999
+ "responselike": "^2.0.0"
6000
+ },
6001
+ "engines": {
6002
+ "node": ">=10.19.0"
6003
+ },
6004
+ "funding": {
6005
+ "url": "https://github.com/sindresorhus/got?sponsor=1"
6006
+ }
6007
+ },
6008
  "node_modules/graceful-fs": {
6009
  "version": "4.2.11",
6010
  "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz",
 
6192
  "node": ">= 0.6"
6193
  }
6194
  },
6195
+ "node_modules/http-cache-semantics": {
6196
+ "version": "4.1.1",
6197
+ "resolved": "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.1.1.tgz",
6198
+ "integrity": "sha512-er295DKPVsV82j5kw1Gjt+ADA/XYHsajl82cGNQG2eyoPkvgUhX+nDIyelzhIWbbsXP39EHcI6l5tYs2FYqYXQ=="
6199
+ },
6200
  "node_modules/http-errors": {
6201
  "version": "2.0.0",
6202
  "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz",
 
6240
  "node": ">= 14"
6241
  }
6242
  },
6243
+ "node_modules/http2-wrapper": {
6244
+ "version": "1.0.3",
6245
+ "resolved": "https://registry.npmjs.org/http2-wrapper/-/http2-wrapper-1.0.3.tgz",
6246
+ "integrity": "sha512-V+23sDMr12Wnz7iTcDeJr3O6AIxlnvT/bmaAAAP/Xda35C90p9599p0F1eHR/N1KILWSoWVAiOMFjBBXaXSMxg==",
6247
+ "dependencies": {
6248
+ "quick-lru": "^5.1.1",
6249
+ "resolve-alpn": "^1.0.0"
6250
+ },
6251
+ "engines": {
6252
+ "node": ">=10.19.0"
6253
+ }
6254
+ },
6255
  "node_modules/https-proxy-agent": {
6256
  "version": "5.0.1",
6257
  "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz",
 
7576
  "node_modules/json-buffer": {
7577
  "version": "3.0.1",
7578
  "resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.1.tgz",
7579
+ "integrity": "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ=="
 
7580
  },
7581
  "node_modules/json-parse-even-better-errors": {
7582
  "version": "2.3.1",
 
7734
  "version": "4.5.4",
7735
  "resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz",
7736
  "integrity": "sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw==",
 
7737
  "dependencies": {
7738
  "json-buffer": "3.0.1"
7739
  }
 
8052
  "integrity": "sha512-BFRuQUqc7x2NWxfJBCyUrN8iYUYznzL9JROmRz1gZ6KlOIgmoD+njPVbb+VNn2nGMKggMsK79iUNErillsrx7w==",
8053
  "optional": true
8054
  },
8055
+ "node_modules/lowercase-keys": {
8056
+ "version": "2.0.0",
8057
+ "resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-2.0.0.tgz",
8058
+ "integrity": "sha512-tqNXrS78oMOE73NMxK4EMLQsQowWf8jKooH9g7xPavRT706R6bkQJ6DY2Te7QukaZsulxa30wQ7bk0pm4XiHmA==",
8059
+ "engines": {
8060
+ "node": ">=8"
8061
+ }
8062
+ },
8063
  "node_modules/lru-cache": {
8064
  "version": "5.1.1",
8065
  "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
 
8245
  "node": ">=6"
8246
  }
8247
  },
 
 
 
 
 
 
 
 
 
 
 
8248
  "node_modules/minimatch": {
8249
  "version": "3.1.2",
8250
  "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz",
 
8603
  "node": ">=0.10.0"
8604
  }
8605
  },
8606
+ "node_modules/normalize-url": {
8607
+ "version": "6.1.0",
8608
+ "resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-6.1.0.tgz",
8609
+ "integrity": "sha512-DlL+XwOy3NxAQ8xuC0okPgK46iuVNAK01YN7RueYBqqFeGsBjV9XmCAzAdgt+667bCl5kPh9EqKKDwnaPG1I7A==",
8610
+ "engines": {
8611
+ "node": ">=10"
8612
+ },
8613
+ "funding": {
8614
+ "url": "https://github.com/sponsors/sindresorhus"
8615
+ }
8616
+ },
8617
  "node_modules/npm-run-path": {
8618
  "version": "4.0.1",
8619
  "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-4.0.1.tgz",
 
8823
  "node": ">= 0.8.0"
8824
  }
8825
  },
8826
+ "node_modules/p-cancelable": {
8827
+ "version": "2.1.1",
8828
+ "resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-2.1.1.tgz",
8829
+ "integrity": "sha512-BZOr3nRQHOntUjTrH8+Lh54smKHoHyur8We1V8DSMVrl5A2malOOwuJRnKRDjSnkoeBh4at6BwEnb5I7Jl31wg==",
8830
+ "engines": {
8831
+ "node": ">=8"
8832
+ }
8833
+ },
8834
  "node_modules/p-limit": {
8835
  "version": "3.1.0",
8836
  "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz",
 
9442
  "resolved": "https://registry.npmjs.org/pseudomap/-/pseudomap-1.0.2.tgz",
9443
  "integrity": "sha512-b/YwNhb8lk1Zz2+bXXpS/LK9OisiZZ1SNsSLxN1x2OXVEhW2Ckr/7mWE5vrC1ZTiJlD9g19jWszTmJsB+oEpFQ=="
9444
  },
9445
+ "node_modules/psl": {
9446
+ "version": "1.9.0",
9447
+ "resolved": "https://registry.npmjs.org/psl/-/psl-1.9.0.tgz",
9448
+ "integrity": "sha512-E/ZsdU4HLs/68gYzgGTkMicWTLPdAftJLfJFlLUAAKZGkStNU72sZjT66SnMDVOfOWY/YAoiD7Jxa9iHvngcag=="
9449
+ },
9450
  "node_modules/pump": {
9451
  "version": "3.0.0",
9452
  "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz",
 
9571
  }
9572
  }
9573
  },
9574
+ "node_modules/puppeteer-extra-plugin-page-proxy": {
9575
+ "version": "2.0.0",
9576
+ "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-page-proxy/-/puppeteer-extra-plugin-page-proxy-2.0.0.tgz",
9577
+ "integrity": "sha512-G8pvHdHK1dO1dgFvYL+dJIlykUKjLkGUvPjzHE3R/eurqAkD4VZ9lWOU/CxYiKPhK2JxlG9QmjGjhxR6IOuP7w==",
9578
+ "dependencies": {
9579
+ "debug": "^4.1.1",
9580
+ "got": "^11.8.5",
9581
+ "http-proxy-agent": "^5.0.0",
9582
+ "https-proxy-agent": "^5.0.1",
9583
+ "puppeteer-extra-plugin": "^3.2.3",
9584
+ "socks-proxy-agent": "^7.0.0",
9585
+ "tough-cookie": "^4.1.2"
9586
+ },
9587
+ "peerDependencies": {
9588
+ "playwright-extra": "*",
9589
+ "puppeteer-extra": "*"
9590
+ },
9591
+ "peerDependenciesMeta": {
9592
+ "playwright-extra": {
9593
+ "optional": true
9594
+ },
9595
+ "puppeteer-extra": {
9596
+ "optional": true
9597
+ }
9598
+ }
9599
+ },
9600
+ "node_modules/puppeteer-extra-plugin-page-proxy/node_modules/http-proxy-agent": {
9601
+ "version": "5.0.0",
9602
+ "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-5.0.0.tgz",
9603
+ "integrity": "sha512-n2hY8YdoRE1i7r6M0w9DIw5GgZN0G25P8zLCRQ8rjXtTU3vsNFBI/vWK/UIeE6g5MUUz6avwAPXmL6Fy9D/90w==",
9604
+ "dependencies": {
9605
+ "@tootallnate/once": "2",
9606
+ "agent-base": "6",
9607
+ "debug": "4"
9608
+ },
9609
+ "engines": {
9610
+ "node": ">= 6"
9611
+ }
9612
+ },
9613
+ "node_modules/puppeteer-extra-plugin-page-proxy/node_modules/socks-proxy-agent": {
9614
+ "version": "7.0.0",
9615
+ "resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-7.0.0.tgz",
9616
+ "integrity": "sha512-Fgl0YPZ902wEsAyiQ+idGd1A7rSFx/ayC1CQVMw5P+EQx2V0SgpGtf6OKFhVjPflPUl9YMmEOnmfjCdMUsygww==",
9617
+ "dependencies": {
9618
+ "agent-base": "^6.0.2",
9619
+ "debug": "^4.3.3",
9620
+ "socks": "^2.6.2"
9621
+ },
9622
+ "engines": {
9623
+ "node": ">= 10"
9624
+ }
9625
+ },
9626
  "node_modules/puppeteer-extra-plugin-stealth": {
9627
  "version": "2.11.2",
9628
  "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.11.2.tgz",
 
9713
  }
9714
  }
9715
  },
9716
+ "node_modules/puppeteer-page-proxy": {
9717
+ "version": "1.3.0",
9718
+ "resolved": "https://registry.npmjs.org/puppeteer-page-proxy/-/puppeteer-page-proxy-1.3.0.tgz",
9719
+ "integrity": "sha512-PDpLjJfcUKiLvUZ3yQJeUcP1d+7nW17s2LZIrKH0gyxEN4zTGkCvB9/HwquPgYq5YcVi8QugsvBckP/K9Vn/iw==",
9720
+ "dependencies": {
9721
+ "got": "^11.8.5",
9722
+ "http-proxy-agent": "^5.0.0",
9723
+ "https-proxy-agent": "^5.0.1",
9724
+ "socks-proxy-agent": "^7.0.0",
9725
+ "tough-cookie": "^4.1.2"
9726
+ }
9727
+ },
9728
+ "node_modules/puppeteer-page-proxy/node_modules/http-proxy-agent": {
9729
+ "version": "5.0.0",
9730
+ "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-5.0.0.tgz",
9731
+ "integrity": "sha512-n2hY8YdoRE1i7r6M0w9DIw5GgZN0G25P8zLCRQ8rjXtTU3vsNFBI/vWK/UIeE6g5MUUz6avwAPXmL6Fy9D/90w==",
9732
+ "dependencies": {
9733
+ "@tootallnate/once": "2",
9734
+ "agent-base": "6",
9735
+ "debug": "4"
9736
+ },
9737
+ "engines": {
9738
+ "node": ">= 6"
9739
+ }
9740
+ },
9741
+ "node_modules/puppeteer-page-proxy/node_modules/socks-proxy-agent": {
9742
+ "version": "7.0.0",
9743
+ "resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-7.0.0.tgz",
9744
+ "integrity": "sha512-Fgl0YPZ902wEsAyiQ+idGd1A7rSFx/ayC1CQVMw5P+EQx2V0SgpGtf6OKFhVjPflPUl9YMmEOnmfjCdMUsygww==",
9745
+ "dependencies": {
9746
+ "agent-base": "^6.0.2",
9747
+ "debug": "^4.3.3",
9748
+ "socks": "^2.6.2"
9749
+ },
9750
+ "engines": {
9751
+ "node": ">= 10"
9752
+ }
9753
+ },
9754
  "node_modules/pure-rand": {
9755
  "version": "6.1.0",
9756
  "resolved": "https://registry.npmjs.org/pure-rand/-/pure-rand-6.1.0.tgz",
 
9817
  "url": "https://github.com/sponsors/sindresorhus"
9818
  }
9819
  },
9820
+ "node_modules/querystringify": {
9821
+ "version": "2.2.0",
9822
+ "resolved": "https://registry.npmjs.org/querystringify/-/querystringify-2.2.0.tgz",
9823
+ "integrity": "sha512-FIqgj2EUvTa7R50u0rGsyTftzjYmv/a3hO345bZNrqabNqjtgiDMgmo4mkUjd+nzU5oF3dClKqFIPUKybUyqoQ=="
9824
+ },
9825
  "node_modules/queue-microtask": {
9826
  "version": "1.2.3",
9827
  "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz",
 
9853
  "integrity": "sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg==",
9854
  "optional": true
9855
  },
9856
+ "node_modules/quick-lru": {
9857
+ "version": "5.1.1",
9858
+ "resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-5.1.1.tgz",
9859
+ "integrity": "sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==",
9860
+ "engines": {
9861
+ "node": ">=10"
9862
+ },
9863
+ "funding": {
9864
+ "url": "https://github.com/sponsors/sindresorhus"
9865
+ }
9866
+ },
9867
  "node_modules/range-parser": {
9868
  "version": "1.2.1",
9869
  "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz",
 
10019
  "node": ">=0.10.0"
10020
  }
10021
  },
10022
+ "node_modules/requires-port": {
10023
+ "version": "1.0.0",
10024
+ "resolved": "https://registry.npmjs.org/requires-port/-/requires-port-1.0.0.tgz",
10025
+ "integrity": "sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ=="
10026
+ },
10027
  "node_modules/resolve": {
10028
  "version": "1.22.8",
10029
  "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.8.tgz",
 
10041
  "url": "https://github.com/sponsors/ljharb"
10042
  }
10043
  },
10044
+ "node_modules/resolve-alpn": {
10045
+ "version": "1.2.1",
10046
+ "resolved": "https://registry.npmjs.org/resolve-alpn/-/resolve-alpn-1.2.1.tgz",
10047
+ "integrity": "sha512-0a1F4l73/ZFZOakJnQ3FvkJ2+gSTQWz/r2KE5OdDY0TxPm5h4GkqkWWfM47T7HsbnOtcJVEF4epCVy6u7Q3K+g=="
10048
+ },
10049
  "node_modules/resolve-cwd": {
10050
  "version": "3.0.0",
10051
  "resolved": "https://registry.npmjs.org/resolve-cwd/-/resolve-cwd-3.0.0.tgz",
 
10087
  "node": ">=10"
10088
  }
10089
  },
10090
+ "node_modules/responselike": {
10091
+ "version": "2.0.1",
10092
+ "resolved": "https://registry.npmjs.org/responselike/-/responselike-2.0.1.tgz",
10093
+ "integrity": "sha512-4gl03wn3hj1HP3yzgdI7d3lCkF95F21Pz4BPGvKHinyQzALR5CapwC8yIi0Rh58DEMQ/SguC03wFj2k0M/mHhw==",
10094
+ "dependencies": {
10095
+ "lowercase-keys": "^2.0.0"
10096
+ },
10097
+ "funding": {
10098
+ "url": "https://github.com/sponsors/sindresorhus"
10099
+ }
10100
+ },
10101
  "node_modules/retry": {
10102
  "version": "0.13.1",
10103
  "resolved": "https://registry.npmjs.org/retry/-/retry-0.13.1.tgz",
 
10360
  "resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz",
10361
  "integrity": "sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw=="
10362
  },
10363
+ "node_modules/set-cookie-parser": {
10364
+ "version": "2.6.0",
10365
+ "resolved": "https://registry.npmjs.org/set-cookie-parser/-/set-cookie-parser-2.6.0.tgz",
10366
+ "integrity": "sha512-RVnVQxTXuerk653XfuliOxBP81Sf0+qfQE73LIYKcyMYHG94AuH0kgrQpRDuTZnSmjpysHmzxJXKNfa6PjFhyQ=="
10367
+ },
10368
  "node_modules/set-function-length": {
10369
  "version": "1.2.2",
10370
  "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz",
 
11103
  "node": ">=0.6"
11104
  }
11105
  },
11106
+ "node_modules/tough-cookie": {
11107
+ "version": "4.1.3",
11108
+ "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.1.3.tgz",
11109
+ "integrity": "sha512-aX/y5pVRkfRnfmuX+OdbSdXvPe6ieKX/G2s7e98f4poJHnqH3281gDPm/metm6E/WRamfx7WC4HUqkWHfQHprw==",
11110
+ "dependencies": {
11111
+ "psl": "^1.1.33",
11112
+ "punycode": "^2.1.1",
11113
+ "universalify": "^0.2.0",
11114
+ "url-parse": "^1.5.3"
11115
+ },
11116
+ "engines": {
11117
+ "node": ">=6"
11118
+ }
11119
+ },
11120
+ "node_modules/tough-cookie/node_modules/universalify": {
11121
+ "version": "0.2.0",
11122
+ "resolved": "https://registry.npmjs.org/universalify/-/universalify-0.2.0.tgz",
11123
+ "integrity": "sha512-CJ1QgKmNg3CwvAv/kOFmtnEN05f0D/cn9QntgNOQlQF9dgvVTHj3t+8JPdjqawCHk7V/KA+fbUqzZ9XWhcqPUg==",
11124
+ "engines": {
11125
+ "node": ">= 4.0.0"
11126
+ }
11127
+ },
11128
  "node_modules/tr46": {
11129
  "version": "3.0.0",
11130
  "resolved": "https://registry.npmjs.org/tr46/-/tr46-3.0.0.tgz",
 
11485
  "punycode": "^2.1.0"
11486
  }
11487
  },
11488
+ "node_modules/url-parse": {
11489
+ "version": "1.5.10",
11490
+ "resolved": "https://registry.npmjs.org/url-parse/-/url-parse-1.5.10.tgz",
11491
+ "integrity": "sha512-WypcfiRhfeUP9vvF0j6rw0J3hrWrw6iZv3+22h6iRMJ/8z1Tj6XfLP4DsUix5MhMPnXpiHDoKyoZ/bdCkwBCiQ==",
11492
+ "dependencies": {
11493
+ "querystringify": "^2.1.1",
11494
+ "requires-port": "^1.0.0"
11495
+ }
11496
+ },
11497
  "node_modules/urlpattern-polyfill": {
11498
  "version": "10.0.0",
11499
  "resolved": "https://registry.npmjs.org/urlpattern-polyfill/-/urlpattern-polyfill-10.0.0.tgz",
backend/functions/package.json CHANGED
@@ -49,7 +49,10 @@
49
  "puppeteer": "^22.6.3",
50
  "puppeteer-extra": "^3.3.6",
51
  "puppeteer-extra-plugin-block-resources": "^2.4.3",
 
52
  "puppeteer-extra-plugin-stealth": "^2.11.2",
 
 
53
  "stripe": "^11.11.0",
54
  "tiktoken": "^1.0.10",
55
  "turndown": "^7.1.3",
@@ -62,6 +65,7 @@
62
  "@types/cors": "^2.8.17",
63
  "@types/generic-pool": "^3.8.1",
64
  "@types/node": "^18",
 
65
  "@typescript-eslint/eslint-plugin": "^5.12.0",
66
  "@typescript-eslint/parser": "^5.12.0",
67
  "eslint": "^8.9.0",
 
49
  "puppeteer": "^22.6.3",
50
  "puppeteer-extra": "^3.3.6",
51
  "puppeteer-extra-plugin-block-resources": "^2.4.3",
52
+ "puppeteer-extra-plugin-page-proxy": "^2.0.0",
53
  "puppeteer-extra-plugin-stealth": "^2.11.2",
54
+ "puppeteer-page-proxy": "^1.3.0",
55
+ "set-cookie-parser": "^2.6.0",
56
  "stripe": "^11.11.0",
57
  "tiktoken": "^1.0.10",
58
  "turndown": "^7.1.3",
 
65
  "@types/cors": "^2.8.17",
66
  "@types/generic-pool": "^3.8.1",
67
  "@types/node": "^18",
68
+ "@types/set-cookie-parser": "^2.4.7",
69
  "@typescript-eslint/eslint-plugin": "^5.12.0",
70
  "@typescript-eslint/parser": "^5.12.0",
71
  "eslint": "^8.9.0",
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -1,51 +1,25 @@
1
- import { assignTransferProtocolMeta, marshalErrorLike, RPCHost, RPCReflection, AssertionFailureError, ParamValidationError } from 'civkit';
 
 
 
 
 
2
  import { singleton } from 'tsyringe';
3
- import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared';
4
  import _ from 'lodash';
5
- import { PageSnapshot, PuppeteerControl } from '../services/puppeteer';
6
  import { Request, Response } from 'express';
7
  import normalizeUrl from "@esm2cjs/normalize-url";
8
  import { AltTextService } from '../services/alt-text';
9
  import TurndownService from 'turndown';
 
 
 
 
 
 
10
 
11
- function tidyMarkdown(markdown: string): string {
12
-
13
- // Step 1: Handle complex broken links with text and optional images spread across multiple lines
14
- let normalizedMarkdown = markdown.replace(/\[\s*([^]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, url) => {
15
- // Remove internal new lines and excessive spaces within the text
16
- text = text.replace(/\s+/g, ' ').trim();
17
- url = url.replace(/\s+/g, '').trim();
18
- return `[${text}](${url})`;
19
- });
20
-
21
- normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^!]*?)\s*\n*(?:!\[([^\]]*)\]\((.*?)\))?\s*\n*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, alt, imgUrl, linkUrl) => {
22
- // Normalize by removing excessive spaces and new lines
23
- text = text.replace(/\s+/g, ' ').trim();
24
- alt = alt ? alt.replace(/\s+/g, ' ').trim() : '';
25
- imgUrl = imgUrl ? imgUrl.replace(/\s+/g, '').trim() : '';
26
- linkUrl = linkUrl.replace(/\s+/g, '').trim();
27
- if (imgUrl) {
28
- return `[${text} ![${alt}](${imgUrl})](${linkUrl})`;
29
- } else {
30
- return `[${text}](${linkUrl})`;
31
- }
32
- });
33
-
34
- // Step 2: Normalize regular links that may be broken across lines
35
- normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^\]]+)\]\s*\(\s*([^)]+)\)/g, (match, text, url) => {
36
- text = text.replace(/\s+/g, ' ').trim();
37
- url = url.replace(/\s+/g, '').trim();
38
- return `[${text}](${url})`;
39
- });
40
-
41
- // Step 3: Replace more than two consecutive empty lines with exactly two empty lines
42
- normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n');
43
-
44
- // Step 4: Remove leading spaces from each line
45
- normalizedMarkdown = normalizedMarkdown.replace(/^[ \t]+/gm, '');
46
-
47
- return normalizedMarkdown.trim();
48
- }
49
 
50
  @singleton()
51
  export class CrawlerHost extends RPCHost {
@@ -53,12 +27,29 @@ export class CrawlerHost extends RPCHost {
53
 
54
  turnDownPlugins = [require('turndown-plugin-gfm').tables];
55
 
 
 
 
 
56
  constructor(
57
  protected globalLogger: Logger,
58
  protected puppeteerControl: PuppeteerControl,
59
  protected altTextService: AltTextService,
 
60
  ) {
61
  super(...arguments);
 
 
 
 
 
 
 
 
 
 
 
 
62
  }
63
 
64
  override async init() {
@@ -67,16 +58,51 @@ export class CrawlerHost extends RPCHost {
67
  this.emit('ready');
68
  }
69
 
70
- async formatSnapshot(snapshot: PageSnapshot, nominalUrl?: string) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  const toBeTurnedToMd = snapshot.parsed?.content;
72
  let turnDownService = new TurndownService();
73
  for (const plugin of this.turnDownPlugins) {
74
  turnDownService = turnDownService.use(plugin);
75
  }
76
-
77
- let contentText = '';
78
- if (toBeTurnedToMd) {
79
- const urlToAltMap: { [k: string]: string | undefined; } = {};
80
  const tasks = (snapshot.imgs || []).map(async (x) => {
81
  const r = await this.altTextService.getAltText(x).catch((err: any) => {
82
  this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
@@ -88,25 +114,27 @@ export class CrawlerHost extends RPCHost {
88
  });
89
 
90
  await Promise.all(tasks);
91
- let imgIdx = 0;
92
-
93
- turnDownService.addRule('img-generated-alt', {
94
- filter: 'img',
95
- replacement: (_content, node) => {
96
- const src = (node.getAttribute('src') || '').trim();
97
- const alt = cleanAttribute(node.getAttribute('alt'));
98
- if (!src) {
99
- return '';
100
- }
101
- const mapped = urlToAltMap[src];
102
- imgIdx++;
103
- if (mapped) {
104
- return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
105
- }
106
- return `![Image ${imgIdx}: ${alt}](${src})`;
107
  }
108
- });
 
 
 
 
 
 
 
109
 
 
 
110
  try {
111
  contentText = turnDownService.turndown(toBeTurnedToMd).trim();
112
  } catch (err) {
@@ -141,7 +169,7 @@ export class CrawlerHost extends RPCHost {
141
 
142
  const formatted = {
143
  title: (snapshot.parsed?.title || snapshot.title || '').trim(),
144
- url: nominalUrl || snapshot.href?.trim(),
145
  content: cleanText,
146
  publishedTime: snapshot.parsed?.publishedTime || undefined,
147
 
@@ -171,6 +199,7 @@ ${this.content}
171
  timeoutSeconds: 540,
172
  concurrency: 4,
173
  },
 
174
  httpMethod: ['get', 'post'],
175
  returnType: [String, OutputServerEventStream],
176
  })
@@ -181,6 +210,57 @@ ${this.content}
181
  concurrency: 21,
182
  maxInstances: 476,
183
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  httpMethod: ['get', 'post'],
185
  returnType: [String, OutputServerEventStream],
186
  })
@@ -207,27 +287,41 @@ ${this.content}
207
  path: 'url'
208
  });
209
  }
210
- const screenshotEnabled = Boolean(ctx.req.headers['x-screenshot']);
211
- const noCache = Boolean(ctx.req.headers['x-no-cache']);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
 
213
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
214
  const sseStream = new OutputServerEventStream();
215
  rpcReflect.return(sseStream);
216
 
217
  try {
218
- for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
219
  if (!scrapped) {
220
  continue;
221
  }
222
 
223
- const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString());
224
-
225
- if (scrapped.screenshot && screenshotEnabled) {
226
- sseStream.write({
227
- event: 'screenshot',
228
- data: scrapped.screenshot.toString('base64'),
229
- });
230
- }
231
 
232
  sseStream.write({
233
  event: 'data',
@@ -235,7 +329,7 @@ ${this.content}
235
  });
236
  }
237
  } catch (err: any) {
238
- this.logger.error(`Failed to crawl ${urlToCrawl.toString()}`, { err: marshalErrorLike(err) });
239
  sseStream.write({
240
  event: 'error',
241
  data: marshalErrorLike(err),
@@ -249,13 +343,13 @@ ${this.content}
249
 
250
  let lastScrapped;
251
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
252
- for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
253
  lastScrapped = scrapped;
254
  if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
255
  continue;
256
  }
257
 
258
- const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString());
259
 
260
  return formatted;
261
  }
@@ -264,16 +358,22 @@ ${this.content}
264
  throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
265
  }
266
 
267
- return await this.formatSnapshot(lastScrapped, urlToCrawl?.toString());
268
  }
269
 
270
- for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
271
  lastScrapped = scrapped;
272
  if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
273
  continue;
274
  }
275
 
276
- const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString());
 
 
 
 
 
 
277
 
278
  return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
279
  }
@@ -282,12 +382,111 @@ ${this.content}
282
  throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
283
  }
284
 
285
- return `${await this.formatSnapshot(lastScrapped, urlToCrawl?.toString())}`;
 
 
 
 
 
 
 
 
286
  }
287
 
 
 
 
 
 
288
 
289
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
 
291
- function cleanAttribute(attribute: string) {
292
- return attribute ? attribute.replace(/(\n+\s*)+/g, '\n') : '';
293
  }
 
1
+ import {
2
+ assignTransferProtocolMeta, marshalErrorLike,
3
+ RPCHost, RPCReflection,
4
+ HashManager,
5
+ AssertionFailureError, ParamValidationError,
6
+ } from 'civkit';
7
  import { singleton } from 'tsyringe';
8
+ import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect } from '../shared';
9
  import _ from 'lodash';
10
+ import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
11
  import { Request, Response } from 'express';
12
  import normalizeUrl from "@esm2cjs/normalize-url";
13
  import { AltTextService } from '../services/alt-text';
14
  import TurndownService from 'turndown';
15
+ import { parseString as parseSetCookieString } from 'set-cookie-parser';
16
+ import { CookieParam } from 'puppeteer';
17
+ import { Crawled } from '../db/crawled';
18
+ import { tidyMarkdown } from '../utils/markdown';
19
+ import { cleanAttribute } from '../utils/misc';
20
+ import { randomUUID } from 'crypto';
21
 
22
+ const md5Hasher = new HashManager('md5', 'hex');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  @singleton()
25
  export class CrawlerHost extends RPCHost {
 
27
 
28
  turnDownPlugins = [require('turndown-plugin-gfm').tables];
29
 
30
+ cacheRetentionMs = 1000 * 3600 * 24 * 7;
31
+ cacheValidMs = 1000 * 300;
32
+ urlValidMs = 1000 * 3600 * 4;
33
+
34
  constructor(
35
  protected globalLogger: Logger,
36
  protected puppeteerControl: PuppeteerControl,
37
  protected altTextService: AltTextService,
38
+ protected firebaseObjectStorage: FirebaseStorageBucketControl,
39
  ) {
40
  super(...arguments);
41
+
42
+ puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ScrappingOptions & { url: URL; }) => {
43
+ if (!snapshot.title?.trim()) {
44
+ return;
45
+ }
46
+ if (options.cookies?.length) {
47
+ // Potential privacy issue, dont cache if cookies are used
48
+ return;
49
+ }
50
+
51
+ await this.setToCache(options.url, snapshot);
52
+ });
53
  }
54
 
55
  override async init() {
 
58
  this.emit('ready');
59
  }
60
 
61
+ async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
62
+ screenshotUrl?: string;
63
+ }, nominalUrl?: URL) {
64
+ if (mode === 'screenshot') {
65
+ if (snapshot.screenshot && !snapshot.screenshotUrl) {
66
+ const fid = `instant-screenshots/${randomUUID()}`;
67
+ await this.firebaseObjectStorage.saveFile(fid, snapshot.screenshot, {
68
+ metadata: {
69
+ contentType: 'image/png',
70
+ }
71
+ });
72
+ snapshot.screenshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
73
+ }
74
+
75
+ return {
76
+ screenshotUrl: snapshot.screenshotUrl,
77
+ toString() {
78
+ return this.screenshotUrl;
79
+ }
80
+ };
81
+ }
82
+ if (mode === 'html') {
83
+ return {
84
+ html: snapshot.html,
85
+ toString() {
86
+ return this.html;
87
+ }
88
+ };
89
+ }
90
+ if (mode === 'text') {
91
+ return {
92
+ text: snapshot.text,
93
+ toString() {
94
+ return this.text;
95
+ }
96
+ };
97
+ }
98
+
99
  const toBeTurnedToMd = snapshot.parsed?.content;
100
  let turnDownService = new TurndownService();
101
  for (const plugin of this.turnDownPlugins) {
102
  turnDownService = turnDownService.use(plugin);
103
  }
104
+ const urlToAltMap: { [k: string]: string | undefined; } = {};
105
+ if (snapshot.imgs?.length) {
 
 
106
  const tasks = (snapshot.imgs || []).map(async (x) => {
107
  const r = await this.altTextService.getAltText(x).catch((err: any) => {
108
  this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
 
114
  });
115
 
116
  await Promise.all(tasks);
117
+ }
118
+ let imgIdx = 0;
119
+ turnDownService.addRule('img-generated-alt', {
120
+ filter: 'img',
121
+ replacement: (_content, node) => {
122
+ const src = (node.getAttribute('src') || '').trim();
123
+ const alt = cleanAttribute(node.getAttribute('alt'));
124
+ if (!src) {
125
+ return '';
 
 
 
 
 
 
 
126
  }
127
+ const mapped = urlToAltMap[src];
128
+ imgIdx++;
129
+ if (mapped) {
130
+ return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
131
+ }
132
+ return `![Image ${imgIdx}: ${alt}](${src})`;
133
+ }
134
+ });
135
 
136
+ let contentText = '';
137
+ if (toBeTurnedToMd) {
138
  try {
139
  contentText = turnDownService.turndown(toBeTurnedToMd).trim();
140
  } catch (err) {
 
169
 
170
  const formatted = {
171
  title: (snapshot.parsed?.title || snapshot.title || '').trim(),
172
+ url: nominalUrl?.toString() || snapshot.href?.trim(),
173
  content: cleanText,
174
  publishedTime: snapshot.parsed?.publishedTime || undefined,
175
 
 
199
  timeoutSeconds: 540,
200
  concurrency: 4,
201
  },
202
+ tags: ['Crawler'],
203
  httpMethod: ['get', 'post'],
204
  returnType: [String, OutputServerEventStream],
205
  })
 
210
  concurrency: 21,
211
  maxInstances: 476,
212
  },
213
+ openapi: {
214
+ operation: {
215
+ parameters: {
216
+ 'Accept': {
217
+ description: `Specifies your preference for the response format. \n\n` +
218
+ `Supported formats:\n` +
219
+ `- text/event-stream\n` +
220
+ `- application/json or text/json\n` +
221
+ `- text/plain`
222
+ ,
223
+ in: 'header',
224
+ schema: { type: 'string' }
225
+ },
226
+ 'X-No-Cache': {
227
+ description: `Ignores internal cache if this header is specified with a value.`,
228
+ in: 'header',
229
+ schema: { type: 'string' }
230
+ },
231
+ 'X-Respond-With': {
232
+ description: `Specifies the form factor of the crawled data you prefer. \n\n` +
233
+ `Supported formats:\n` +
234
+ `- markdown\n` +
235
+ `- html\n` +
236
+ `- text\n` +
237
+ `- screenshot\n\n` +
238
+ `Defaults to: markdown`
239
+ ,
240
+ in: 'header',
241
+ schema: { type: 'string' }
242
+ },
243
+ 'X-Proxy-Url': {
244
+ description: `Specifies your custom proxy if you prefer to use one. \n\n` +
245
+ `Supported protocols:\n` +
246
+ `- http\n` +
247
+ `- https\n` +
248
+ `- socks4\n` +
249
+ `- socks5\n\n` +
250
+ `For authentication, https://user:pass@host:port`,
251
+ in: 'header',
252
+ schema: { type: 'string' }
253
+ },
254
+ 'X-Set-Cookie': {
255
+ description: `Sets cookie(s) to the headless browser for your request. \n\n` +
256
+ `Syntax is the same with standard Set-Cookie`,
257
+ in: 'header',
258
+ schema: { type: 'string' }
259
+ },
260
+ }
261
+ }
262
+ },
263
+ tags: ['Crawler'],
264
  httpMethod: ['get', 'post'],
265
  returnType: [String, OutputServerEventStream],
266
  })
 
287
  path: 'url'
288
  });
289
  }
290
+
291
+ const customMode = ctx.req.get('x-respond-with') || 'markdown';
292
+ const noCache = Boolean(ctx.req.get('x-no-cache'));
293
+ const cookies: CookieParam[] = [];
294
+ const setCookieHeaders = ctx.req.headers['x-set-cookie'];
295
+ if (Array.isArray(setCookieHeaders)) {
296
+ for (const setCookie of setCookieHeaders) {
297
+ cookies.push({
298
+ ...parseSetCookieString(setCookie, { decodeValues: false }) as CookieParam,
299
+ domain: urlToCrawl.hostname,
300
+ });
301
+ }
302
+ } else if (setCookieHeaders) {
303
+ cookies.push({
304
+ ...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
305
+ domain: urlToCrawl.hostname,
306
+ });
307
+ }
308
+
309
+ const crawlOpts: ScrappingOptions = {
310
+ proxyUrl: ctx.req.get('x-proxy-url'),
311
+ cookies,
312
+ };
313
 
314
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
315
  const sseStream = new OutputServerEventStream();
316
  rpcReflect.return(sseStream);
317
 
318
  try {
319
+ for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
320
  if (!scrapped) {
321
  continue;
322
  }
323
 
324
+ const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
 
 
 
 
 
 
 
325
 
326
  sseStream.write({
327
  event: 'data',
 
329
  });
330
  }
331
  } catch (err: any) {
332
+ this.logger.error(`Failed to crawl ${urlToCrawl}`, { err: marshalErrorLike(err) });
333
  sseStream.write({
334
  event: 'error',
335
  data: marshalErrorLike(err),
 
343
 
344
  let lastScrapped;
345
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
346
+ for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
347
  lastScrapped = scrapped;
348
  if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
349
  continue;
350
  }
351
 
352
+ const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
353
 
354
  return formatted;
355
  }
 
358
  throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
359
  }
360
 
361
+ return await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
362
  }
363
 
364
+ for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
365
  lastScrapped = scrapped;
366
  if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
367
  continue;
368
  }
369
 
370
+ const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
371
+ if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
372
+
373
+ return assignTransferProtocolMeta(`${formatted}`,
374
+ { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
375
+ );
376
+ }
377
 
378
  return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
379
  }
 
382
  throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
383
  }
384
 
385
+ const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
386
+ if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
387
+
388
+ return assignTransferProtocolMeta(`${formatted}`,
389
+ { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
390
+ );
391
+ }
392
+
393
+ return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
394
  }
395
 
396
/**
 * Computes the cache key for a URL.
 *
 * The fragment is removed and the whole URL string is lower-cased before it
 * is hashed with the module-level MD5 hasher.
 *
 * @param urlToCrawl URL to derive the key from; it is copied, not mutated.
 * @returns The digest produced by `md5Hasher` for the normalized URL string.
 */
getUrlDigest(urlToCrawl: URL) {
    const withoutFragment = new URL(urlToCrawl);
    withoutFragment.hash = '';

    return md5Hasher.hash(withoutFragment.toString().toLowerCase());
}
404
+
405
/**
 * Looks up the most recently created cache record for a URL.
 *
 * @param urlToCrawl URL whose digest (see `getUrlDigest`) is the lookup key.
 * @returns `undefined` when no record exists. Otherwise an object with:
 *  - `isFresh`: true only while the record is younger than `cacheValidMs`;
 *  - `snapshot`: the stored snapshot with `screenshot` cleared and, when the
 *    record is flagged `screenshotAvailable`, a signed `screenshotUrl` that
 *    expires `urlValidMs` from now.
 */
async queryCache(urlToCrawl: URL) {
    const digest = this.getUrlDigest(urlToCrawl);

    const cache = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];

    if (!cache) {
        return undefined;
    }

    const age = Date.now() - cache.createdAt.valueOf();
    // A record is stale once it is at least `cacheValidMs` old. The previous
    // comparison (`createdAt > now - cacheValidMs`) was inverted: it flagged
    // records *inside* the validity window as stale and old ones as fresh.
    const stale = age >= this.cacheValidMs;
    this.logger.info(`${stale ? 'Only stale ' : ''}Cache exists for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old`, {
        url: urlToCrawl, digest, age, stale
    });

    const r = cache.snapshot;

    return {
        isFresh: !stale,
        snapshot: {
            ...r,
            // The binary screenshot is not kept in the record; expose a signed URL instead.
            screenshot: undefined,
            screenshotUrl: cache.screenshotAvailable ?
                await this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs) : undefined,
        } as PageSnapshot & { screenshotUrl?: string; }
    };
}
432
+
433
/**
 * Writes a snapshot to the cache.
 *
 * A `Crawled` record is built with a fresh UUID, the URL digest and an
 * expiry of `cacheRetentionMs` from now; the screenshot buffer is never
 * embedded in the record. When the snapshot carries a screenshot it is
 * uploaded to `screenshots/<record id>` first and the record is flagged
 * `screenshotAvailable`.
 *
 * @param urlToCrawl URL the snapshot belongs to.
 * @param snapshot Snapshot to persist.
 * @returns The result of `Crawled.save`, or `undefined` when saving the
 * record fails (the failure is logged). A failing screenshot upload is not
 * caught here and rejects the returned promise.
 */
async setToCache(urlToCrawl: URL, snapshot: PageSnapshot) {
    const digest = this.getUrlDigest(urlToCrawl);
    this.logger.info(`Caching snapshot of ${urlToCrawl}...`, { url: urlToCrawl, digest, title: snapshot?.title, href: snapshot?.href });

    const createdAt = new Date();
    const expireAt = new Date(createdAt.valueOf() + this.cacheRetentionMs);
    const record = Crawled.from({
        _id: randomUUID(),
        url: urlToCrawl.toString(),
        createdAt,
        expireAt,
        urlPathDigest: digest,
        snapshot: { ...snapshot, screenshot: null },
    });

    if (snapshot.screenshot) {
        const screenshotPath = `screenshots/${record._id}`;
        await this.firebaseObjectStorage.saveFile(screenshotPath, snapshot.screenshot, {
            metadata: { contentType: 'image/png' },
        });
        record.screenshotAvailable = true;
    }

    const payload = record.degradeForFireStore();

    return Crawled.save(payload).catch((err) => {
        this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });

        return undefined;
    });
}
467
+
468
/**
 * Scrapes a URL, serving from or falling back to the cache where allowed.
 *
 * The cache is consulted only when `noCache` is false and no cookies are
 * supplied. A fresh cache entry is yielded as the sole result. Otherwise the
 * live scrape is streamed through; if it throws and a (stale) cache entry
 * was found, that entry is yielded instead of propagating the error.
 *
 * @param urlToCrawl URL to scrape.
 * @param crawlOpts Options forwarded to `puppeteerControl.scrap`.
 * @param noCache When true, the cache lookup is skipped entirely.
 */
async *cachedScrap(urlToCrawl: URL, crawlOpts: ScrappingOptions, noCache: boolean = false) {
    const cacheAllowed = !noCache && !crawlOpts.cookies?.length;
    const cache = cacheAllowed ? await this.queryCache(urlToCrawl) : undefined;

    if (cache?.isFresh) {
        yield cache.snapshot;

        return;
    }

    try {
        yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
    } catch (err: any) {
        if (!cache) {
            throw err;
        }
        this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
        yield cache.snapshot;
    }
}
491
 
 
 
492
  }
backend/functions/src/db/crawled.ts CHANGED
@@ -1,6 +1,7 @@
1
  import { Also, parseJSONText, Prop } from 'civkit';
2
  import { FirestoreRecord } from '../shared/lib/firestore';
3
  import _ from 'lodash';
 
4
 
5
  @Also({
6
  dictOf: Object
@@ -21,7 +22,10 @@ export class Crawled extends FirestoreRecord {
21
  urlPathDigest!: string;
22
 
23
  @Prop()
24
- snapshot!: any;
 
 
 
25
 
26
  @Prop()
27
  createdAt!: Date;
 
1
  import { Also, parseJSONText, Prop } from 'civkit';
2
  import { FirestoreRecord } from '../shared/lib/firestore';
3
  import _ from 'lodash';
4
+ import type { PageSnapshot } from '../services/puppeteer';
5
 
6
  @Also({
7
  dictOf: Object
 
22
  urlPathDigest!: string;
23
 
24
  @Prop()
25
+ snapshot!: PageSnapshot & { screenshot: never; };
26
+
27
+ @Prop()
28
+ screenshotAvailable?: boolean;
29
 
30
  @Prop()
31
  createdAt!: Date;
backend/functions/src/services/puppeteer.ts CHANGED
@@ -1,13 +1,17 @@
1
- import { AssertionFailureError, AsyncService, Defer, HashManager, marshalErrorLike } from 'civkit';
2
- import { container, singleton } from 'tsyringe';
3
- import type { Browser, Page } from 'puppeteer';
4
- import { Logger } from '../shared/services/logger';
5
- import genericPool from 'generic-pool';
6
  import os from 'os';
7
  import fs from 'fs';
8
- import { Crawled } from '../db/crawled';
 
 
 
 
 
9
  import puppeteer from 'puppeteer-extra';
10
 
 
 
 
 
11
  const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
12
 
13
  export interface ImgBrief {
@@ -42,7 +46,12 @@ export interface PageSnapshot {
42
  screenshot?: Buffer;
43
  imgs?: ImgBrief[];
44
  }
45
- const md5Hasher = new HashManager('md5', 'hex');
 
 
 
 
 
46
 
47
  const puppeteerStealth = require('puppeteer-extra-plugin-stealth');
48
  puppeteer.use(puppeteerStealth());
@@ -51,9 +60,13 @@ puppeteer.use(puppeteerStealth());
51
  // userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`,
52
  // platform: `Linux`,
53
  // }))
54
- const puppeteerBlockResources = require('puppeteer-extra-plugin-block-resources');
55
  puppeteer.use(puppeteerBlockResources({
56
  blockedTypes: new Set(['media']),
 
 
 
 
57
  }));
58
 
59
  @singleton()
@@ -74,7 +87,7 @@ export class PuppeteerControl extends AsyncService {
74
  return page.browser().connected && !page.isClosed();
75
  }
76
  }, {
77
- max: Math.max(1 + Math.floor(os.freemem() / (1024 * 1024 * 1024)), 16),
78
  min: 1,
79
  acquireTimeoutMillis: 60_000,
80
  testOnBorrow: true,
@@ -88,7 +101,7 @@ export class PuppeteerControl extends AsyncService {
88
 
89
  override async init() {
90
  await this.dependencyReady();
91
-
92
  this.pagePool.start();
93
 
94
  if (this.browser) {
@@ -128,7 +141,10 @@ export class PuppeteerControl extends AsyncService {
128
  // preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
129
  preparations.push(page.setBypassCSP(true));
130
  preparations.push(page.setViewport({ width: 1024, height: 1024 }));
131
- preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
 
 
 
132
  page.emit('snapshot', snapshot);
133
  }));
134
  preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
@@ -166,40 +182,39 @@ function giveSnapshot() {
166
  const elem = document.createElement('div');
167
  elem.innerHTML = parsed.content;
168
  r.imgs = briefImgs(elem);
 
 
 
 
 
169
  }
170
 
171
  return r;
172
  }
173
  `));
174
- preparations.push(page.evaluateOnNewDocument(() => {
175
- let aftershot: any;
176
- const handlePageLoad = () => {
177
- // @ts-expect-error
178
- if (document.readyState !== 'complete' && document.readyState !== 'interactive') {
179
- return;
180
- }
181
- // @ts-expect-error
182
- const parsed = giveSnapshot();
183
- if (parsed) {
184
- // @ts-expect-error
185
- window.reportSnapshot(parsed);
186
- } else {
187
- if (aftershot) {
188
- clearTimeout(aftershot);
189
- }
190
- aftershot = setTimeout(() => {
191
- // @ts-expect-error
192
- window.reportSnapshot(giveSnapshot());
193
- }, 500);
194
- }
195
- };
196
- // setInterval(handlePageLoad, 1000);
197
- // @ts-expect-error
198
- document.addEventListener('readystatechange', handlePageLoad);
199
- // @ts-expect-error
200
- document.addEventListener('load', handlePageLoad);
201
- }));
202
-
203
  await Promise.all(preparations);
204
 
205
  // TODO: further setup the page;
@@ -207,41 +222,23 @@ function giveSnapshot() {
207
  return page;
208
  }
209
 
210
- async *scrap(url: string, noCache: string | boolean = false): AsyncGenerator<PageSnapshot | undefined> {
211
- const parsedUrl = new URL(url);
212
  // parsedUrl.search = '';
213
- parsedUrl.hash = '';
214
- const normalizedUrl = parsedUrl.toString().toLowerCase();
215
- const digest = md5Hasher.hash(normalizedUrl);
216
- this.logger.info(`Scraping ${url}, normalized digest: ${digest}`, { url, digest });
217
 
218
  let snapshot: PageSnapshot | undefined;
219
  let screenshot: Buffer | undefined;
220
 
221
- if (!noCache) {
222
- const cached = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
223
-
224
- if (cached && cached.createdAt.valueOf() > (Date.now() - 1000 * 300)) {
225
- const age = Date.now() - cached.createdAt.valueOf();
226
- this.logger.info(`Cache hit for ${url}, normalized digest: ${digest}, ${age}ms old`, { url, digest, age });
227
- snapshot = {
228
- ...cached.snapshot
229
- };
230
- if (snapshot) {
231
- delete snapshot.screenshot;
232
- }
233
-
234
- screenshot = cached.snapshot?.screenshot ? Buffer.from(cached.snapshot.screenshot, 'base64') : undefined;
235
- yield {
236
- ...cached.snapshot,
237
- screenshot: cached.snapshot?.screenshot ? Buffer.from(cached.snapshot.screenshot, 'base64') : undefined
238
- };
239
-
240
- return;
241
- }
242
  }
243
 
244
- const page = await this.pagePool.acquire();
245
  let nextSnapshotDeferred = Defer();
246
  let finalized = false;
247
  const hdl = (s: any) => {
@@ -262,48 +259,43 @@ function giveSnapshot() {
262
  cause: err,
263
  }));
264
  }).finally(async () => {
265
- finalized = true;
266
  if (!snapshot?.html) {
 
267
  return;
268
  }
269
- screenshot = await page.screenshot({
270
- type: 'jpeg',
271
- quality: 75,
272
- });
273
  snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
 
274
  if (!snapshot.title || !snapshot.parsed?.content) {
275
  const salvaged = await this.salvage(url, page);
276
  if (salvaged) {
277
- screenshot = await page.screenshot({
278
- type: 'jpeg',
279
- quality: 75,
280
- });
281
  snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
 
282
  }
283
  }
284
- this.logger.info(`Snapshot of ${url} done`, { url, digest, title: snapshot?.title, href: snapshot?.href });
285
- const nowDate = new Date();
286
- Crawled.save(
287
- Crawled.from({
288
- url,
289
- createdAt: nowDate,
290
- expireAt: new Date(nowDate.valueOf() + 1000 * 3600 * 24 * 7),
291
- urlPathDigest: digest,
292
- snapshot: { ...snapshot, screenshot: screenshot?.toString('base64') || '' },
293
- }).degradeForFireStore()
294
- ).catch((err) => {
295
- this.logger.warn(`Failed to save snapshot`, { err: marshalErrorLike(err) });
296
- });
297
  });
298
 
299
  try {
 
300
  while (true) {
301
  await Promise.race([nextSnapshotDeferred.promise, gotoPromise]);
302
  if (finalized) {
303
  yield { ...snapshot, screenshot } as PageSnapshot;
304
  break;
305
  }
306
- yield snapshot;
 
 
 
 
 
 
307
  }
308
  } finally {
309
  gotoPromise.finally(() => {
@@ -333,6 +325,8 @@ function giveSnapshot() {
333
  this.logger.warn(`Page salvation did not fully succeed.`, { err: marshalErrorLike(err) });
334
  });
335
 
 
 
336
  return true;
337
  }
338
  }
 
 
 
 
 
 
1
  import os from 'os';
2
  import fs from 'fs';
3
+ import { container, singleton } from 'tsyringe';
4
+ import genericPool from 'generic-pool';
5
+ import { AsyncService, Defer, marshalErrorLike, AssertionFailureError } from 'civkit';
6
+ import { Logger } from '../shared/services/logger';
7
+
8
+ import type { Browser, CookieParam, Page } from 'puppeteer';
9
  import puppeteer from 'puppeteer-extra';
10
 
11
+ import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
12
+ import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
13
+
14
+
15
  const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
16
 
17
  export interface ImgBrief {
 
46
  screenshot?: Buffer;
47
  imgs?: ImgBrief[];
48
  }
49
+
50
+ export interface ScrappingOptions {
51
+ proxyUrl?: string;
52
+ cookies?: CookieParam[];
53
+ }
54
+
55
 
56
  const puppeteerStealth = require('puppeteer-extra-plugin-stealth');
57
  puppeteer.use(puppeteerStealth());
 
60
  // userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`,
61
  // platform: `Linux`,
62
  // }))
63
+
64
  puppeteer.use(puppeteerBlockResources({
65
  blockedTypes: new Set(['media']),
66
+ interceptResolutionPriority: 1,
67
+ }));
68
+ puppeteer.use(puppeteerPageProxy({
69
+ interceptResolutionPriority: 1,
70
  }));
71
 
72
  @singleton()
 
87
  return page.browser().connected && !page.isClosed();
88
  }
89
  }, {
90
+ max: Math.max(1 + Math.floor(os.totalmem() / (384 * 1024 * 1024)), 16),
91
  min: 1,
92
  acquireTimeoutMillis: 60_000,
93
  testOnBorrow: true,
 
101
 
102
  override async init() {
103
  await this.dependencyReady();
104
+ this.logger.info(`PuppeteerControl initializing with pool size ${this.pagePool.max}`, { poolSize: this.pagePool.max });
105
  this.pagePool.start();
106
 
107
  if (this.browser) {
 
141
  // preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
142
  preparations.push(page.setBypassCSP(true));
143
  preparations.push(page.setViewport({ width: 1024, height: 1024 }));
144
+ preparations.push(page.exposeFunction('reportSnapshot', (snapshot: PageSnapshot) => {
145
+ if (snapshot.href === 'about:blank') {
146
+ return;
147
+ }
148
  page.emit('snapshot', snapshot);
149
  }));
150
  preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
 
182
  const elem = document.createElement('div');
183
  elem.innerHTML = parsed.content;
184
  r.imgs = briefImgs(elem);
185
+ } else {
186
+ const allImgs = briefImgs();
187
+ if (allImgs.length === 1) {
188
+ r.imgs = allImgs;
189
+ }
190
  }
191
 
192
  return r;
193
  }
194
  `));
195
+ preparations.push(page.evaluateOnNewDocument(`
196
+ let aftershot = undefined;
197
+ const handlePageLoad = () => {
198
+ if (document.readyState !== 'complete') {
199
+ return;
200
+ }
201
+ const parsed = giveSnapshot();
202
+ window.reportSnapshot(parsed);
203
+ if (!parsed.text) {
204
+ if (aftershot) {
205
+ clearTimeout(aftershot);
206
+ }
207
+ aftershot = setTimeout(() => {
208
+ const r = giveSnapshot();
209
+ if (r && r.text) {
210
+ window.reportSnapshot(r);
211
+ }
212
+ }, 500);
213
+ }
214
+ };
215
+ document.addEventListener('readystatechange', handlePageLoad);
216
+ document.addEventListener('load', handlePageLoad);
217
+ `));
 
 
 
 
 
 
218
  await Promise.all(preparations);
219
 
220
  // TODO: further setup the page;
 
222
  return page;
223
  }
224
 
225
+ async *scrap(parsedUrl: URL, options: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
 
226
  // parsedUrl.search = '';
227
+ const url = parsedUrl.toString();
228
+
229
+ this.logger.info(`Scraping ${url}`, { url });
 
230
 
231
  let snapshot: PageSnapshot | undefined;
232
  let screenshot: Buffer | undefined;
233
 
234
+ const page = await this.pagePool.acquire();
235
+ if (options.proxyUrl) {
236
+ await page.useProxy(options.proxyUrl);
237
+ }
238
+ if (options.cookies) {
239
+ await page.setCookie(...options.cookies);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  }
241
 
 
242
  let nextSnapshotDeferred = Defer();
243
  let finalized = false;
244
  const hdl = (s: any) => {
 
259
  cause: err,
260
  }));
261
  }).finally(async () => {
 
262
  if (!snapshot?.html) {
263
+ finalized = true;
264
  return;
265
  }
 
 
 
 
266
  snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
267
+ screenshot = await page.screenshot();
268
  if (!snapshot.title || !snapshot.parsed?.content) {
269
  const salvaged = await this.salvage(url, page);
270
  if (salvaged) {
 
 
 
 
271
  snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
272
+ screenshot = await page.screenshot();
273
  }
274
  }
275
+ finalized = true;
276
+ this.logger.info(`Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
277
+ this.emit(
278
+ 'crawled',
279
+ { ...snapshot, screenshot },
280
+ { ...options, url: parsedUrl }
281
+ );
 
 
 
 
 
 
282
  });
283
 
284
  try {
285
+ let lastHTML = snapshot?.html;
286
  while (true) {
287
  await Promise.race([nextSnapshotDeferred.promise, gotoPromise]);
288
  if (finalized) {
289
  yield { ...snapshot, screenshot } as PageSnapshot;
290
  break;
291
  }
292
+ if (snapshot?.title && snapshot?.html !== lastHTML) {
293
+ screenshot = await page.screenshot();
294
+ lastHTML = snapshot.html;
295
+ }
296
+ if (snapshot || screenshot) {
297
+ yield { ...snapshot, screenshot } as PageSnapshot;
298
+ }
299
  }
300
  } finally {
301
  gotoPromise.finally(() => {
 
325
  this.logger.warn(`Page salvation did not fully succeed.`, { err: marshalErrorLike(err) });
326
  });
327
 
328
+ this.logger.info(`Salvation completed.`);
329
+
330
  return true;
331
  }
332
  }
backend/functions/src/utils/markdown.ts ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
/**
 * Tidies Markdown produced by HTML-to-Markdown conversion.
 *
 * The function:
 *  1. repairs links whose text or URL was broken across several lines,
 *     including links that wrap a single image;
 *  2. strips leading spaces/tabs from every line;
 *  3. collapses runs of three or more newlines into exactly two
 *     (i.e. at most one empty line between blocks);
 *  4. trims the result.
 *
 * @param markdown Raw Markdown text.
 * @returns The normalized Markdown text.
 */
export function tidyMarkdown(markdown: string): string {
    // Collapse every whitespace run (newlines included) into one space.
    const squeeze = (s: string): string => s.replace(/\s+/g, ' ').trim();
    // A URL must not contain whitespace at all, so drop it entirely.
    const compact = (s: string): string => s.replace(/\s+/g, '').trim();

    // Step 1: Handle broken links whose text / URL is spread across multiple lines.
    // The link text must not contain `]`. With an "anything" class a bare bracket
    // such as "[1]" would lazily extend up to the next real "](url)" and every
    // newline in between would be collapsed, merging unrelated paragraphs.
    let normalizedMarkdown = markdown.replace(
        /\[\s*([^\]]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g,
        (_match: string, text: string, url: string) => `[${squeeze(text)}](${compact(url)})`
    );

    // Links that optionally wrap an image: [text ![alt](img)](link).
    // `]` is excluded from the leading text for the same reason as above.
    normalizedMarkdown = normalizedMarkdown.replace(
        /\[\s*([^!\]]*?)\s*\n*(?:!\[([^\]]*)\]\((.*?)\))?\s*\n*\]\s*\(\s*([^)]+)\s*\)/g,
        (_match: string, text: string, alt: string | undefined, imgUrl: string | undefined, linkUrl: string) => {
            const label = squeeze(text);
            const imgAlt = alt ? squeeze(alt) : '';
            const imgSrc = imgUrl ? compact(imgUrl) : '';
            const href = compact(linkUrl);
            if (imgSrc) {
                return `[${label} ![${imgAlt}](${imgSrc})](${href})`;
            }

            return `[${label}](${href})`;
        }
    );

    // Step 2: Normalize regular links that may still be broken across lines
    // (this also removes the padding left in front of a wrapped image above).
    normalizedMarkdown = normalizedMarkdown.replace(
        /\[\s*([^\]]+)\]\s*\(\s*([^)]+)\)/g,
        (_match: string, text: string, url: string) => `[${squeeze(text)}](${compact(url)})`
    );

    // Step 3: Remove leading spaces from each line. This runs BEFORE blank-line
    // collapsing so that whitespace-only lines become truly empty and are
    // counted by the next step.
    normalizedMarkdown = normalizedMarkdown.replace(/^[ \t]+/gm, '');

    // Step 4: Replace three or more consecutive newlines with exactly two,
    // leaving at most one empty line between blocks.
    normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n');

    return normalizedMarkdown.trim();
}
backend/functions/src/utils/misc.ts ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
/**
 * Normalizes line breaks inside an attribute value.
 *
 * Every newline together with the whitespace run that follows it is
 * replaced by a single `\n`. A falsy input (e.g. an empty string) yields
 * an empty string.
 *
 * @param attribute The raw attribute value.
 * @returns The attribute value with line-break runs collapsed.
 */
export function cleanAttribute(attribute: string) {
    if (!attribute) {
        return '';
    }

    return attribute.replace(/(\n+\s*)+/g, '\n');
}
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 5990ab1ceb78c5800a959c4b17db3d8b3d01c432
 
1
+ Subproject commit 577131db50d5c86ffb3d085a593eaed8950eabcd