Spaces:
Build error
Build error
feat: control cache tolerance and select target using headers
Browse files
README.md
CHANGED
|
@@ -59,7 +59,11 @@ As you have already seen above, one can control the behavior of the Reader API u
|
|
| 59 |
- `x-respond-with: text` returns `document.body.innerText`
|
| 60 |
- `x-respond-with: screenshot` returns the URL of the webpage's screenshot
|
| 61 |
- You can specify a proxy server via the `x-proxy-url` header.
|
| 62 |
-
- You can
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
|
| 65 |
### Streaming mode
|
|
|
|
| 59 |
- `x-respond-with: text` returns `document.body.innerText`
|
| 60 |
- `x-respond-with: screenshot` returns the URL of the webpage's screenshot
|
| 61 |
- You can specify a proxy server via the `x-proxy-url` header.
|
| 62 |
+
- You can customize cache tolerance via the `x-cache-tolerance` header (integer in seconds).
|
| 63 |
+
- You can bypass the cached page (lifetime 3600s) via the `x-no-cache: true` header (equivalent to `x-cache-tolerance: 0`).
|
| 64 |
+
- If you already know the HTML structure of your target page, you may specify `x-target-selector` or `x-wait-for-selector` to direct the Reader API to focus on a specific part of the page.
|
| 65 |
+
- By setting the `x-target-selector` header to a CSS selector, the Reader API returns the content within the matched element instead of the full HTML. Setting this header is useful when the automatic content extraction fails to capture the desired content and you can manually select the correct target.
|
| 66 |
+
- By setting the `x-wait-for-selector` header to a CSS selector, the Reader API will wait until the matched element is rendered before returning the content. If you have already specified `x-target-selector`, this header can be omitted if you plan to wait for the same element.
|
| 67 |
|
| 68 |
|
| 69 |
### Streaming mode
|
backend/functions/package-lock.json
CHANGED
|
@@ -23,6 +23,7 @@
|
|
| 23 |
"generic-pool": "^3.9.0",
|
| 24 |
"htmlparser2": "^9.0.0",
|
| 25 |
"jose": "^5.1.0",
|
|
|
|
| 26 |
"langdetect": "^0.2.1",
|
| 27 |
"maxmind": "^4.3.18",
|
| 28 |
"minio": "^7.1.3",
|
|
@@ -4036,6 +4037,17 @@
|
|
| 4036 |
"node": ">= 8"
|
| 4037 |
}
|
| 4038 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4039 |
"node_modules/data-uri-to-buffer": {
|
| 4040 |
"version": "6.0.2",
|
| 4041 |
"resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
|
|
@@ -4044,6 +4056,41 @@
|
|
| 4044 |
"node": ">= 14"
|
| 4045 |
}
|
| 4046 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4047 |
"node_modules/data-view-buffer": {
|
| 4048 |
"version": "1.0.1",
|
| 4049 |
"resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.1.tgz",
|
|
@@ -4116,6 +4163,11 @@
|
|
| 4116 |
}
|
| 4117 |
}
|
| 4118 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4119 |
"node_modules/decode-uri-component": {
|
| 4120 |
"version": "0.2.2",
|
| 4121 |
"resolved": "https://registry.npmjs.org/decode-uri-component/-/decode-uri-component-0.2.2.tgz",
|
|
@@ -6119,6 +6171,17 @@
|
|
| 6119 |
"node": ">= 0.4"
|
| 6120 |
}
|
| 6121 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6122 |
"node_modules/html-escaper": {
|
| 6123 |
"version": "2.0.2",
|
| 6124 |
"resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz",
|
|
@@ -6307,7 +6370,6 @@
|
|
| 6307 |
"version": "0.6.3",
|
| 6308 |
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
|
| 6309 |
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
|
| 6310 |
-
"optional": true,
|
| 6311 |
"dependencies": {
|
| 6312 |
"safer-buffer": ">= 2.1.2 < 3.0.0"
|
| 6313 |
},
|
|
@@ -6705,6 +6767,11 @@
|
|
| 6705 |
"node": ">=0.10.0"
|
| 6706 |
}
|
| 6707 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6708 |
"node_modules/is-regex": {
|
| 6709 |
"version": "1.1.4",
|
| 6710 |
"resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz",
|
|
@@ -7573,6 +7640,91 @@
|
|
| 7573 |
"node": ">=0.1.90"
|
| 7574 |
}
|
| 7575 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7576 |
"node_modules/jsesc": {
|
| 7577 |
"version": "2.5.2",
|
| 7578 |
"resolved": "https://registry.npmjs.org/jsesc/-/jsesc-2.5.2.tgz",
|
|
@@ -8671,6 +8823,11 @@
|
|
| 8671 |
"set-blocking": "^2.0.0"
|
| 8672 |
}
|
| 8673 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8674 |
"node_modules/object-assign": {
|
| 8675 |
"version": "4.1.1",
|
| 8676 |
"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
|
|
@@ -8985,6 +9142,17 @@
|
|
| 8985 |
"url": "https://github.com/sponsors/sindresorhus"
|
| 8986 |
}
|
| 8987 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8988 |
"node_modules/parseurl": {
|
| 8989 |
"version": "1.3.3",
|
| 8990 |
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
|
|
@@ -10185,6 +10353,11 @@
|
|
| 10185 |
"url": "https://github.com/sponsors/isaacs"
|
| 10186 |
}
|
| 10187 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10188 |
"node_modules/run-parallel": {
|
| 10189 |
"version": "1.2.0",
|
| 10190 |
"resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz",
|
|
@@ -10281,6 +10454,17 @@
|
|
| 10281 |
"resolved": "https://registry.npmjs.org/sax/-/sax-1.3.0.tgz",
|
| 10282 |
"integrity": "sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA=="
|
| 10283 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10284 |
"node_modules/semver": {
|
| 10285 |
"version": "7.6.0",
|
| 10286 |
"resolved": "https://registry.npmjs.org/semver/-/semver-7.6.0.tgz",
|
|
@@ -10941,6 +11125,11 @@
|
|
| 10941 |
"url": "https://github.com/sponsors/ljharb"
|
| 10942 |
}
|
| 10943 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10944 |
"node_modules/tar": {
|
| 10945 |
"version": "6.2.1",
|
| 10946 |
"resolved": "https://registry.npmjs.org/tar/-/tar-6.2.1.tgz",
|
|
@@ -11589,6 +11778,17 @@
|
|
| 11589 |
"node": ">= 0.8"
|
| 11590 |
}
|
| 11591 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11592 |
"node_modules/walker": {
|
| 11593 |
"version": "1.0.8",
|
| 11594 |
"resolved": "https://registry.npmjs.org/walker/-/walker-1.0.8.tgz",
|
|
@@ -11622,7 +11822,6 @@
|
|
| 11622 |
"version": "7.0.0",
|
| 11623 |
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz",
|
| 11624 |
"integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==",
|
| 11625 |
-
"peer": true,
|
| 11626 |
"engines": {
|
| 11627 |
"node": ">=12"
|
| 11628 |
}
|
|
@@ -11648,6 +11847,25 @@
|
|
| 11648 |
"node": ">=0.8.0"
|
| 11649 |
}
|
| 11650 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11651 |
"node_modules/whatwg-url": {
|
| 11652 |
"version": "11.0.0",
|
| 11653 |
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-11.0.0.tgz",
|
|
@@ -11778,6 +11996,14 @@
|
|
| 11778 |
"resolved": "https://registry.npmjs.org/xml/-/xml-1.0.1.tgz",
|
| 11779 |
"integrity": "sha512-huCv9IH9Tcf95zuYCsQraZtWnJvBtLVE0QHMOs8bWyZAFZNDcYjsPq1nEx8jKA9y+Beo9v+7OBPRisQTjinQMw=="
|
| 11780 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11781 |
"node_modules/xml2js": {
|
| 11782 |
"version": "0.5.0",
|
| 11783 |
"resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz",
|
|
@@ -11798,6 +12024,11 @@
|
|
| 11798 |
"node": ">=4.0"
|
| 11799 |
}
|
| 11800 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11801 |
"node_modules/y18n": {
|
| 11802 |
"version": "5.0.8",
|
| 11803 |
"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
|
|
|
|
| 23 |
"generic-pool": "^3.9.0",
|
| 24 |
"htmlparser2": "^9.0.0",
|
| 25 |
"jose": "^5.1.0",
|
| 26 |
+
"jsdom": "^24.0.0",
|
| 27 |
"langdetect": "^0.2.1",
|
| 28 |
"maxmind": "^4.3.18",
|
| 29 |
"minio": "^7.1.3",
|
|
|
|
| 4037 |
"node": ">= 8"
|
| 4038 |
}
|
| 4039 |
},
|
| 4040 |
+
"node_modules/cssstyle": {
|
| 4041 |
+
"version": "4.0.1",
|
| 4042 |
+
"resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-4.0.1.tgz",
|
| 4043 |
+
"integrity": "sha512-8ZYiJ3A/3OkDd093CBT/0UKDWry7ak4BdPTFP2+QEP7cmhouyq/Up709ASSj2cK02BbZiMgk7kYjZNS4QP5qrQ==",
|
| 4044 |
+
"dependencies": {
|
| 4045 |
+
"rrweb-cssom": "^0.6.0"
|
| 4046 |
+
},
|
| 4047 |
+
"engines": {
|
| 4048 |
+
"node": ">=18"
|
| 4049 |
+
}
|
| 4050 |
+
},
|
| 4051 |
"node_modules/data-uri-to-buffer": {
|
| 4052 |
"version": "6.0.2",
|
| 4053 |
"resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
|
|
|
|
| 4056 |
"node": ">= 14"
|
| 4057 |
}
|
| 4058 |
},
|
| 4059 |
+
"node_modules/data-urls": {
|
| 4060 |
+
"version": "5.0.0",
|
| 4061 |
+
"resolved": "https://registry.npmjs.org/data-urls/-/data-urls-5.0.0.tgz",
|
| 4062 |
+
"integrity": "sha512-ZYP5VBHshaDAiVZxjbRVcFJpc+4xGgT0bK3vzy1HLN8jTO975HEbuYzZJcHoQEY5K1a0z8YayJkyVETa08eNTg==",
|
| 4063 |
+
"dependencies": {
|
| 4064 |
+
"whatwg-mimetype": "^4.0.0",
|
| 4065 |
+
"whatwg-url": "^14.0.0"
|
| 4066 |
+
},
|
| 4067 |
+
"engines": {
|
| 4068 |
+
"node": ">=18"
|
| 4069 |
+
}
|
| 4070 |
+
},
|
| 4071 |
+
"node_modules/data-urls/node_modules/tr46": {
|
| 4072 |
+
"version": "5.0.0",
|
| 4073 |
+
"resolved": "https://registry.npmjs.org/tr46/-/tr46-5.0.0.tgz",
|
| 4074 |
+
"integrity": "sha512-tk2G5R2KRwBd+ZN0zaEXpmzdKyOYksXwywulIX95MBODjSzMIuQnQ3m8JxgbhnL1LeVo7lqQKsYa1O3Htl7K5g==",
|
| 4075 |
+
"dependencies": {
|
| 4076 |
+
"punycode": "^2.3.1"
|
| 4077 |
+
},
|
| 4078 |
+
"engines": {
|
| 4079 |
+
"node": ">=18"
|
| 4080 |
+
}
|
| 4081 |
+
},
|
| 4082 |
+
"node_modules/data-urls/node_modules/whatwg-url": {
|
| 4083 |
+
"version": "14.0.0",
|
| 4084 |
+
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.0.0.tgz",
|
| 4085 |
+
"integrity": "sha512-1lfMEm2IEr7RIV+f4lUNPOqfFL+pO+Xw3fJSqmjX9AbXcXcYOkCe1P6+9VBZB6n94af16NfZf+sSk0JCBZC9aw==",
|
| 4086 |
+
"dependencies": {
|
| 4087 |
+
"tr46": "^5.0.0",
|
| 4088 |
+
"webidl-conversions": "^7.0.0"
|
| 4089 |
+
},
|
| 4090 |
+
"engines": {
|
| 4091 |
+
"node": ">=18"
|
| 4092 |
+
}
|
| 4093 |
+
},
|
| 4094 |
"node_modules/data-view-buffer": {
|
| 4095 |
"version": "1.0.1",
|
| 4096 |
"resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.1.tgz",
|
|
|
|
| 4163 |
}
|
| 4164 |
}
|
| 4165 |
},
|
| 4166 |
+
"node_modules/decimal.js": {
|
| 4167 |
+
"version": "10.4.3",
|
| 4168 |
+
"resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.4.3.tgz",
|
| 4169 |
+
"integrity": "sha512-VBBaLc1MgL5XpzgIP7ny5Z6Nx3UrRkIViUkPUdtl9aya5amy3De1gsUUSB1g3+3sExYNjCAsAznmukyxCb1GRA=="
|
| 4170 |
+
},
|
| 4171 |
"node_modules/decode-uri-component": {
|
| 4172 |
"version": "0.2.2",
|
| 4173 |
"resolved": "https://registry.npmjs.org/decode-uri-component/-/decode-uri-component-0.2.2.tgz",
|
|
|
|
| 6171 |
"node": ">= 0.4"
|
| 6172 |
}
|
| 6173 |
},
|
| 6174 |
+
"node_modules/html-encoding-sniffer": {
|
| 6175 |
+
"version": "4.0.0",
|
| 6176 |
+
"resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-4.0.0.tgz",
|
| 6177 |
+
"integrity": "sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==",
|
| 6178 |
+
"dependencies": {
|
| 6179 |
+
"whatwg-encoding": "^3.1.1"
|
| 6180 |
+
},
|
| 6181 |
+
"engines": {
|
| 6182 |
+
"node": ">=18"
|
| 6183 |
+
}
|
| 6184 |
+
},
|
| 6185 |
"node_modules/html-escaper": {
|
| 6186 |
"version": "2.0.2",
|
| 6187 |
"resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz",
|
|
|
|
| 6370 |
"version": "0.6.3",
|
| 6371 |
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
|
| 6372 |
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
|
|
|
|
| 6373 |
"dependencies": {
|
| 6374 |
"safer-buffer": ">= 2.1.2 < 3.0.0"
|
| 6375 |
},
|
|
|
|
| 6767 |
"node": ">=0.10.0"
|
| 6768 |
}
|
| 6769 |
},
|
| 6770 |
+
"node_modules/is-potential-custom-element-name": {
|
| 6771 |
+
"version": "1.0.1",
|
| 6772 |
+
"resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz",
|
| 6773 |
+
"integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ=="
|
| 6774 |
+
},
|
| 6775 |
"node_modules/is-regex": {
|
| 6776 |
"version": "1.1.4",
|
| 6777 |
"resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz",
|
|
|
|
| 7640 |
"node": ">=0.1.90"
|
| 7641 |
}
|
| 7642 |
},
|
| 7643 |
+
"node_modules/jsdom": {
|
| 7644 |
+
"version": "24.0.0",
|
| 7645 |
+
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-24.0.0.tgz",
|
| 7646 |
+
"integrity": "sha512-UDS2NayCvmXSXVP6mpTj+73JnNQadZlr9N68189xib2tx5Mls7swlTNao26IoHv46BZJFvXygyRtyXd1feAk1A==",
|
| 7647 |
+
"dependencies": {
|
| 7648 |
+
"cssstyle": "^4.0.1",
|
| 7649 |
+
"data-urls": "^5.0.0",
|
| 7650 |
+
"decimal.js": "^10.4.3",
|
| 7651 |
+
"form-data": "^4.0.0",
|
| 7652 |
+
"html-encoding-sniffer": "^4.0.0",
|
| 7653 |
+
"http-proxy-agent": "^7.0.0",
|
| 7654 |
+
"https-proxy-agent": "^7.0.2",
|
| 7655 |
+
"is-potential-custom-element-name": "^1.0.1",
|
| 7656 |
+
"nwsapi": "^2.2.7",
|
| 7657 |
+
"parse5": "^7.1.2",
|
| 7658 |
+
"rrweb-cssom": "^0.6.0",
|
| 7659 |
+
"saxes": "^6.0.0",
|
| 7660 |
+
"symbol-tree": "^3.2.4",
|
| 7661 |
+
"tough-cookie": "^4.1.3",
|
| 7662 |
+
"w3c-xmlserializer": "^5.0.0",
|
| 7663 |
+
"webidl-conversions": "^7.0.0",
|
| 7664 |
+
"whatwg-encoding": "^3.1.1",
|
| 7665 |
+
"whatwg-mimetype": "^4.0.0",
|
| 7666 |
+
"whatwg-url": "^14.0.0",
|
| 7667 |
+
"ws": "^8.16.0",
|
| 7668 |
+
"xml-name-validator": "^5.0.0"
|
| 7669 |
+
},
|
| 7670 |
+
"engines": {
|
| 7671 |
+
"node": ">=18"
|
| 7672 |
+
},
|
| 7673 |
+
"peerDependencies": {
|
| 7674 |
+
"canvas": "^2.11.2"
|
| 7675 |
+
},
|
| 7676 |
+
"peerDependenciesMeta": {
|
| 7677 |
+
"canvas": {
|
| 7678 |
+
"optional": true
|
| 7679 |
+
}
|
| 7680 |
+
}
|
| 7681 |
+
},
|
| 7682 |
+
"node_modules/jsdom/node_modules/agent-base": {
|
| 7683 |
+
"version": "7.1.1",
|
| 7684 |
+
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.1.tgz",
|
| 7685 |
+
"integrity": "sha512-H0TSyFNDMomMNJQBn8wFV5YC/2eJ+VXECwOadZJT554xP6cODZHPX3H9QMQECxvrgiSOP1pHjy1sMWQVYJOUOA==",
|
| 7686 |
+
"dependencies": {
|
| 7687 |
+
"debug": "^4.3.4"
|
| 7688 |
+
},
|
| 7689 |
+
"engines": {
|
| 7690 |
+
"node": ">= 14"
|
| 7691 |
+
}
|
| 7692 |
+
},
|
| 7693 |
+
"node_modules/jsdom/node_modules/https-proxy-agent": {
|
| 7694 |
+
"version": "7.0.4",
|
| 7695 |
+
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.4.tgz",
|
| 7696 |
+
"integrity": "sha512-wlwpilI7YdjSkWaQ/7omYBMTliDcmCN8OLihO6I9B86g06lMyAoqgoDpV0XqoaPOKj+0DIdAvnsWfyAAhmimcg==",
|
| 7697 |
+
"dependencies": {
|
| 7698 |
+
"agent-base": "^7.0.2",
|
| 7699 |
+
"debug": "4"
|
| 7700 |
+
},
|
| 7701 |
+
"engines": {
|
| 7702 |
+
"node": ">= 14"
|
| 7703 |
+
}
|
| 7704 |
+
},
|
| 7705 |
+
"node_modules/jsdom/node_modules/tr46": {
|
| 7706 |
+
"version": "5.0.0",
|
| 7707 |
+
"resolved": "https://registry.npmjs.org/tr46/-/tr46-5.0.0.tgz",
|
| 7708 |
+
"integrity": "sha512-tk2G5R2KRwBd+ZN0zaEXpmzdKyOYksXwywulIX95MBODjSzMIuQnQ3m8JxgbhnL1LeVo7lqQKsYa1O3Htl7K5g==",
|
| 7709 |
+
"dependencies": {
|
| 7710 |
+
"punycode": "^2.3.1"
|
| 7711 |
+
},
|
| 7712 |
+
"engines": {
|
| 7713 |
+
"node": ">=18"
|
| 7714 |
+
}
|
| 7715 |
+
},
|
| 7716 |
+
"node_modules/jsdom/node_modules/whatwg-url": {
|
| 7717 |
+
"version": "14.0.0",
|
| 7718 |
+
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.0.0.tgz",
|
| 7719 |
+
"integrity": "sha512-1lfMEm2IEr7RIV+f4lUNPOqfFL+pO+Xw3fJSqmjX9AbXcXcYOkCe1P6+9VBZB6n94af16NfZf+sSk0JCBZC9aw==",
|
| 7720 |
+
"dependencies": {
|
| 7721 |
+
"tr46": "^5.0.0",
|
| 7722 |
+
"webidl-conversions": "^7.0.0"
|
| 7723 |
+
},
|
| 7724 |
+
"engines": {
|
| 7725 |
+
"node": ">=18"
|
| 7726 |
+
}
|
| 7727 |
+
},
|
| 7728 |
"node_modules/jsesc": {
|
| 7729 |
"version": "2.5.2",
|
| 7730 |
"resolved": "https://registry.npmjs.org/jsesc/-/jsesc-2.5.2.tgz",
|
|
|
|
| 8823 |
"set-blocking": "^2.0.0"
|
| 8824 |
}
|
| 8825 |
},
|
| 8826 |
+
"node_modules/nwsapi": {
|
| 8827 |
+
"version": "2.2.10",
|
| 8828 |
+
"resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.10.tgz",
|
| 8829 |
+
"integrity": "sha512-QK0sRs7MKv0tKe1+5uZIQk/C8XGza4DAnztJG8iD+TpJIORARrCxczA738awHrZoHeTjSSoHqao2teO0dC/gFQ=="
|
| 8830 |
+
},
|
| 8831 |
"node_modules/object-assign": {
|
| 8832 |
"version": "4.1.1",
|
| 8833 |
"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
|
|
|
|
| 9142 |
"url": "https://github.com/sponsors/sindresorhus"
|
| 9143 |
}
|
| 9144 |
},
|
| 9145 |
+
"node_modules/parse5": {
|
| 9146 |
+
"version": "7.1.2",
|
| 9147 |
+
"resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz",
|
| 9148 |
+
"integrity": "sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==",
|
| 9149 |
+
"dependencies": {
|
| 9150 |
+
"entities": "^4.4.0"
|
| 9151 |
+
},
|
| 9152 |
+
"funding": {
|
| 9153 |
+
"url": "https://github.com/inikulin/parse5?sponsor=1"
|
| 9154 |
+
}
|
| 9155 |
+
},
|
| 9156 |
"node_modules/parseurl": {
|
| 9157 |
"version": "1.3.3",
|
| 9158 |
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
|
|
|
|
| 10353 |
"url": "https://github.com/sponsors/isaacs"
|
| 10354 |
}
|
| 10355 |
},
|
| 10356 |
+
"node_modules/rrweb-cssom": {
|
| 10357 |
+
"version": "0.6.0",
|
| 10358 |
+
"resolved": "https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.6.0.tgz",
|
| 10359 |
+
"integrity": "sha512-APM0Gt1KoXBz0iIkkdB/kfvGOwC4UuJFeG/c+yV7wSc7q96cG/kJ0HiYCnzivD9SB53cLV1MlHFNfOuPaadYSw=="
|
| 10360 |
+
},
|
| 10361 |
"node_modules/run-parallel": {
|
| 10362 |
"version": "1.2.0",
|
| 10363 |
"resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz",
|
|
|
|
| 10454 |
"resolved": "https://registry.npmjs.org/sax/-/sax-1.3.0.tgz",
|
| 10455 |
"integrity": "sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA=="
|
| 10456 |
},
|
| 10457 |
+
"node_modules/saxes": {
|
| 10458 |
+
"version": "6.0.0",
|
| 10459 |
+
"resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz",
|
| 10460 |
+
"integrity": "sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==",
|
| 10461 |
+
"dependencies": {
|
| 10462 |
+
"xmlchars": "^2.2.0"
|
| 10463 |
+
},
|
| 10464 |
+
"engines": {
|
| 10465 |
+
"node": ">=v12.22.7"
|
| 10466 |
+
}
|
| 10467 |
+
},
|
| 10468 |
"node_modules/semver": {
|
| 10469 |
"version": "7.6.0",
|
| 10470 |
"resolved": "https://registry.npmjs.org/semver/-/semver-7.6.0.tgz",
|
|
|
|
| 11125 |
"url": "https://github.com/sponsors/ljharb"
|
| 11126 |
}
|
| 11127 |
},
|
| 11128 |
+
"node_modules/symbol-tree": {
|
| 11129 |
+
"version": "3.2.4",
|
| 11130 |
+
"resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz",
|
| 11131 |
+
"integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw=="
|
| 11132 |
+
},
|
| 11133 |
"node_modules/tar": {
|
| 11134 |
"version": "6.2.1",
|
| 11135 |
"resolved": "https://registry.npmjs.org/tar/-/tar-6.2.1.tgz",
|
|
|
|
| 11778 |
"node": ">= 0.8"
|
| 11779 |
}
|
| 11780 |
},
|
| 11781 |
+
"node_modules/w3c-xmlserializer": {
|
| 11782 |
+
"version": "5.0.0",
|
| 11783 |
+
"resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-5.0.0.tgz",
|
| 11784 |
+
"integrity": "sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA==",
|
| 11785 |
+
"dependencies": {
|
| 11786 |
+
"xml-name-validator": "^5.0.0"
|
| 11787 |
+
},
|
| 11788 |
+
"engines": {
|
| 11789 |
+
"node": ">=18"
|
| 11790 |
+
}
|
| 11791 |
+
},
|
| 11792 |
"node_modules/walker": {
|
| 11793 |
"version": "1.0.8",
|
| 11794 |
"resolved": "https://registry.npmjs.org/walker/-/walker-1.0.8.tgz",
|
|
|
|
| 11822 |
"version": "7.0.0",
|
| 11823 |
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz",
|
| 11824 |
"integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==",
|
|
|
|
| 11825 |
"engines": {
|
| 11826 |
"node": ">=12"
|
| 11827 |
}
|
|
|
|
| 11847 |
"node": ">=0.8.0"
|
| 11848 |
}
|
| 11849 |
},
|
| 11850 |
+
"node_modules/whatwg-encoding": {
|
| 11851 |
+
"version": "3.1.1",
|
| 11852 |
+
"resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz",
|
| 11853 |
+
"integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==",
|
| 11854 |
+
"dependencies": {
|
| 11855 |
+
"iconv-lite": "0.6.3"
|
| 11856 |
+
},
|
| 11857 |
+
"engines": {
|
| 11858 |
+
"node": ">=18"
|
| 11859 |
+
}
|
| 11860 |
+
},
|
| 11861 |
+
"node_modules/whatwg-mimetype": {
|
| 11862 |
+
"version": "4.0.0",
|
| 11863 |
+
"resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz",
|
| 11864 |
+
"integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==",
|
| 11865 |
+
"engines": {
|
| 11866 |
+
"node": ">=18"
|
| 11867 |
+
}
|
| 11868 |
+
},
|
| 11869 |
"node_modules/whatwg-url": {
|
| 11870 |
"version": "11.0.0",
|
| 11871 |
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-11.0.0.tgz",
|
|
|
|
| 11996 |
"resolved": "https://registry.npmjs.org/xml/-/xml-1.0.1.tgz",
|
| 11997 |
"integrity": "sha512-huCv9IH9Tcf95zuYCsQraZtWnJvBtLVE0QHMOs8bWyZAFZNDcYjsPq1nEx8jKA9y+Beo9v+7OBPRisQTjinQMw=="
|
| 11998 |
},
|
| 11999 |
+
"node_modules/xml-name-validator": {
|
| 12000 |
+
"version": "5.0.0",
|
| 12001 |
+
"resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-5.0.0.tgz",
|
| 12002 |
+
"integrity": "sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==",
|
| 12003 |
+
"engines": {
|
| 12004 |
+
"node": ">=18"
|
| 12005 |
+
}
|
| 12006 |
+
},
|
| 12007 |
"node_modules/xml2js": {
|
| 12008 |
"version": "0.5.0",
|
| 12009 |
"resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz",
|
|
|
|
| 12024 |
"node": ">=4.0"
|
| 12025 |
}
|
| 12026 |
},
|
| 12027 |
+
"node_modules/xmlchars": {
|
| 12028 |
+
"version": "2.2.0",
|
| 12029 |
+
"resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz",
|
| 12030 |
+
"integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw=="
|
| 12031 |
+
},
|
| 12032 |
"node_modules/y18n": {
|
| 12033 |
"version": "5.0.8",
|
| 12034 |
"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
|
backend/functions/package.json
CHANGED
|
@@ -43,6 +43,7 @@
|
|
| 43 |
"generic-pool": "^3.9.0",
|
| 44 |
"htmlparser2": "^9.0.0",
|
| 45 |
"jose": "^5.1.0",
|
|
|
|
| 46 |
"langdetect": "^0.2.1",
|
| 47 |
"maxmind": "^4.3.18",
|
| 48 |
"minio": "^7.1.3",
|
|
|
|
| 43 |
"generic-pool": "^3.9.0",
|
| 44 |
"htmlparser2": "^9.0.0",
|
| 45 |
"jose": "^5.1.0",
|
| 46 |
+
"jsdom": "^24.0.0",
|
| 47 |
"langdetect": "^0.2.1",
|
| 48 |
"maxmind": "^4.3.18",
|
| 49 |
"minio": "^7.1.3",
|
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -24,6 +24,10 @@ import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
|
| 24 |
|
| 25 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
@singleton()
|
| 28 |
export class CrawlerHost extends RPCHost {
|
| 29 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
|
@@ -31,7 +35,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 31 |
turnDownPlugins = [require('turndown-plugin-gfm').tables];
|
| 32 |
|
| 33 |
cacheRetentionMs = 1000 * 3600 * 24 * 7;
|
| 34 |
-
cacheValidMs = 1000 *
|
| 35 |
urlValidMs = 1000 * 3600 * 4;
|
| 36 |
|
| 37 |
indexText = `[Usage1] https://r.jina.ai/YOUR_URL
|
|
@@ -299,8 +303,13 @@ ${this.content}
|
|
| 299 |
in: 'header',
|
| 300 |
schema: { type: 'string' }
|
| 301 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
'X-No-Cache': {
|
| 303 |
-
description: `Ignores internal cache if this header is specified with a value.`,
|
| 304 |
in: 'header',
|
| 305 |
schema: { type: 'string' }
|
| 306 |
},
|
|
@@ -315,6 +324,20 @@ ${this.content}
|
|
| 315 |
in: 'header',
|
| 316 |
schema: { type: 'string' }
|
| 317 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
'X-Proxy-Url': {
|
| 319 |
description: `Specifies your custom proxy if you prefer to use one. \n\n` +
|
| 320 |
`Supported protocols:\n` +
|
|
@@ -426,7 +449,15 @@ ${this.content}
|
|
| 426 |
const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
|
| 427 |
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
|
| 428 |
const noCache = Boolean(ctx.req.get('x-no-cache'));
|
| 429 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 430 |
const cookies: CookieParam[] = [];
|
| 431 |
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
| 432 |
if (Array.isArray(setCookieHeaders)) {
|
|
@@ -444,10 +475,12 @@ ${this.content}
|
|
| 444 |
}
|
| 445 |
this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
|
| 446 |
|
| 447 |
-
const crawlOpts:
|
| 448 |
proxyUrl: ctx.req.get('x-proxy-url'),
|
| 449 |
cookies,
|
| 450 |
-
favorScreenshot: customMode === 'screenshot'
|
|
|
|
|
|
|
| 451 |
};
|
| 452 |
|
| 453 |
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
|
@@ -484,7 +517,7 @@ ${this.content}
|
|
| 484 |
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
| 485 |
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
|
| 486 |
lastScrapped = scrapped;
|
| 487 |
-
if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
|
| 488 |
continue;
|
| 489 |
}
|
| 490 |
|
|
@@ -506,7 +539,7 @@ ${this.content}
|
|
| 506 |
|
| 507 |
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
|
| 508 |
lastScrapped = scrapped;
|
| 509 |
-
if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
|
| 510 |
continue;
|
| 511 |
}
|
| 512 |
|
|
@@ -642,24 +675,32 @@ ${this.content}
|
|
| 642 |
return r;
|
| 643 |
}
|
| 644 |
|
| 645 |
-
async *cachedScrap(urlToCrawl: URL, crawlOpts?:
|
| 646 |
let cache;
|
| 647 |
if (cacheTolerance && !crawlOpts?.cookies?.length) {
|
| 648 |
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
| 649 |
}
|
| 650 |
|
| 651 |
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
|
| 652 |
-
yield cache.snapshot;
|
| 653 |
|
| 654 |
return;
|
| 655 |
}
|
| 656 |
|
| 657 |
try {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 658 |
yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
|
| 659 |
} catch (err: any) {
|
| 660 |
if (cache) {
|
| 661 |
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
|
| 662 |
-
yield cache.snapshot;
|
| 663 |
return;
|
| 664 |
}
|
| 665 |
throw err;
|
|
|
|
| 24 |
|
| 25 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 26 |
|
| 27 |
+
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 28 |
+
targetSelector?: string;
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
@singleton()
|
| 32 |
export class CrawlerHost extends RPCHost {
|
| 33 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
|
|
|
| 35 |
turnDownPlugins = [require('turndown-plugin-gfm').tables];
|
| 36 |
|
| 37 |
cacheRetentionMs = 1000 * 3600 * 24 * 7;
|
| 38 |
+
cacheValidMs = 1000 * 3600;
|
| 39 |
urlValidMs = 1000 * 3600 * 4;
|
| 40 |
|
| 41 |
indexText = `[Usage1] https://r.jina.ai/YOUR_URL
|
|
|
|
| 303 |
in: 'header',
|
| 304 |
schema: { type: 'string' }
|
| 305 |
},
|
| 306 |
+
'X-Cache-Tolerance': {
|
| 307 |
+
description: `Sets internal cache tolerance in seconds if this header is specified with a integer.`,
|
| 308 |
+
in: 'header',
|
| 309 |
+
schema: { type: 'string' }
|
| 310 |
+
},
|
| 311 |
'X-No-Cache': {
|
| 312 |
+
description: `Ignores internal cache if this header is specified with a value.\n\nEquivalent to X-Cache-Tolerance: 0`,
|
| 313 |
in: 'header',
|
| 314 |
schema: { type: 'string' }
|
| 315 |
},
|
|
|
|
| 324 |
in: 'header',
|
| 325 |
schema: { type: 'string' }
|
| 326 |
},
|
| 327 |
+
'X-Wait-For-Selector': {
|
| 328 |
+
description: `Specifies a CSS selector to wait for the appearance of such an element before returning. \n\n` +
|
| 329 |
+
'Example: `X-Wait-For-Selector: .content-block`\n'
|
| 330 |
+
,
|
| 331 |
+
in: 'header',
|
| 332 |
+
schema: { type: 'string' }
|
| 333 |
+
},
|
| 334 |
+
'X-Target-Selector': {
|
| 335 |
+
description: `Specifies a CSS selector for return target instead of the full html. \n\n` +
|
| 336 |
+
'Implies `X-Wait-For-Selector: (same selector)`'
|
| 337 |
+
,
|
| 338 |
+
in: 'header',
|
| 339 |
+
schema: { type: 'string' }
|
| 340 |
+
},
|
| 341 |
'X-Proxy-Url': {
|
| 342 |
description: `Specifies your custom proxy if you prefer to use one. \n\n` +
|
| 343 |
`Supported protocols:\n` +
|
|
|
|
| 449 |
const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
|
| 450 |
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
|
| 451 |
const noCache = Boolean(ctx.req.get('x-no-cache'));
|
| 452 |
+
let cacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000;
|
| 453 |
+
if (isNaN(cacheTolerance)) {
|
| 454 |
+
cacheTolerance = this.cacheValidMs;
|
| 455 |
+
if (noCache) {
|
| 456 |
+
cacheTolerance = 0;
|
| 457 |
+
}
|
| 458 |
+
}
|
| 459 |
+
const targetSelector = ctx.req.get('x-target-selector') || undefined;
|
| 460 |
+
const waitForSelector = ctx.req.get('x-wait-for-selector') || targetSelector;
|
| 461 |
const cookies: CookieParam[] = [];
|
| 462 |
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
| 463 |
if (Array.isArray(setCookieHeaders)) {
|
|
|
|
| 475 |
}
|
| 476 |
this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
|
| 477 |
|
| 478 |
+
const crawlOpts: ExtraScrappingOptions = {
|
| 479 |
proxyUrl: ctx.req.get('x-proxy-url'),
|
| 480 |
cookies,
|
| 481 |
+
favorScreenshot: customMode === 'screenshot',
|
| 482 |
+
waitForSelector,
|
| 483 |
+
targetSelector,
|
| 484 |
};
|
| 485 |
|
| 486 |
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
|
|
|
| 517 |
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
| 518 |
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
|
| 519 |
lastScrapped = scrapped;
|
| 520 |
+
if (waitForSelector || !scrapped?.parsed?.content || !(scrapped.title?.trim())) {
|
| 521 |
continue;
|
| 522 |
}
|
| 523 |
|
|
|
|
| 539 |
|
| 540 |
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
|
| 541 |
lastScrapped = scrapped;
|
| 542 |
+
if (waitForSelector || !scrapped?.parsed?.content || !(scrapped.title?.trim())) {
|
| 543 |
continue;
|
| 544 |
}
|
| 545 |
|
|
|
|
| 675 |
return r;
|
| 676 |
}
|
| 677 |
|
| 678 |
+
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
|
| 679 |
let cache;
|
| 680 |
if (cacheTolerance && !crawlOpts?.cookies?.length) {
|
| 681 |
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
| 682 |
}
|
| 683 |
|
| 684 |
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
|
| 685 |
+
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts?.targetSelector);
|
| 686 |
|
| 687 |
return;
|
| 688 |
}
|
| 689 |
|
| 690 |
try {
|
| 691 |
+
if (crawlOpts?.targetSelector) {
|
| 692 |
+
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
|
| 693 |
+
yield this.puppeteerControl.narrowSnapshot(x, crawlOpts.targetSelector);
|
| 694 |
+
}
|
| 695 |
+
|
| 696 |
+
return;
|
| 697 |
+
}
|
| 698 |
+
|
| 699 |
yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
|
| 700 |
} catch (err: any) {
|
| 701 |
if (cache) {
|
| 702 |
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
|
| 703 |
+
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts?.targetSelector);
|
| 704 |
return;
|
| 705 |
}
|
| 706 |
throw err;
|
backend/functions/src/cloud-functions/searcher.ts
CHANGED
|
@@ -181,7 +181,13 @@ export class SearcherHost extends RPCHost {
|
|
| 181 |
const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
|
| 182 |
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
|
| 183 |
const noCache = Boolean(ctx.req.get('x-no-cache'));
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
const cookies: CookieParam[] = [];
|
| 186 |
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
| 187 |
if (Array.isArray(setCookieHeaders)) {
|
|
|
|
| 181 |
const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
|
| 182 |
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
|
| 183 |
const noCache = Boolean(ctx.req.get('x-no-cache'));
|
| 184 |
+
let pageCacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000;
|
| 185 |
+
if (isNaN(pageCacheTolerance)) {
|
| 186 |
+
pageCacheTolerance = this.pageCacheToleranceMs;
|
| 187 |
+
if (noCache) {
|
| 188 |
+
pageCacheTolerance = 0;
|
| 189 |
+
}
|
| 190 |
+
}
|
| 191 |
const cookies: CookieParam[] = [];
|
| 192 |
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
| 193 |
if (Array.isArray(setCookieHeaders)) {
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -4,6 +4,7 @@ import { container, singleton } from 'tsyringe';
|
|
| 4 |
import genericPool from 'generic-pool';
|
| 5 |
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
|
| 6 |
import { Logger } from '../shared/services/logger';
|
|
|
|
| 7 |
|
| 8 |
import type { Browser, CookieParam, Page } from 'puppeteer';
|
| 9 |
import puppeteer from 'puppeteer-extra';
|
|
@@ -11,7 +12,7 @@ import puppeteer from 'puppeteer-extra';
|
|
| 11 |
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
| 12 |
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
| 13 |
import { ServiceCrashedError } from '../shared/lib/errors';
|
| 14 |
-
|
| 15 |
|
| 16 |
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
| 17 |
|
|
@@ -52,6 +53,7 @@ export interface ScrappingOptions {
|
|
| 52 |
proxyUrl?: string;
|
| 53 |
cookies?: CookieParam[];
|
| 54 |
favorScreenshot?: boolean;
|
|
|
|
| 55 |
}
|
| 56 |
|
| 57 |
|
|
@@ -142,7 +144,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 142 |
this.browser.once('disconnected', () => {
|
| 143 |
this.logger.warn(`Browser disconnected`);
|
| 144 |
this.emit('crippled');
|
| 145 |
-
process.nextTick(()=> this.serviceReady());
|
| 146 |
});
|
| 147 |
this.logger.info(`Browser launched: ${this.browser.process()?.pid}`);
|
| 148 |
|
|
@@ -344,6 +346,18 @@ document.addEventListener('load', handlePageLoad);
|
|
| 344 |
{ ...options, url: parsedUrl }
|
| 345 |
);
|
| 346 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 347 |
|
| 348 |
try {
|
| 349 |
let lastHTML = snapshot?.html;
|
|
@@ -394,6 +408,49 @@ document.addEventListener('load', handlePageLoad);
|
|
| 394 |
|
| 395 |
return true;
|
| 396 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
}
|
| 398 |
|
| 399 |
const puppeteerControl = container.resolve(PuppeteerControl);
|
|
|
|
| 4 |
import genericPool from 'generic-pool';
|
| 5 |
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
|
| 6 |
import { Logger } from '../shared/services/logger';
|
| 7 |
+
import { JSDOM } from 'jsdom';
|
| 8 |
|
| 9 |
import type { Browser, CookieParam, Page } from 'puppeteer';
|
| 10 |
import puppeteer from 'puppeteer-extra';
|
|
|
|
| 12 |
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
| 13 |
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
| 14 |
import { ServiceCrashedError } from '../shared/lib/errors';
|
| 15 |
+
import { Readability } from '@mozilla/readability';
|
| 16 |
|
| 17 |
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
| 18 |
|
|
|
|
| 53 |
proxyUrl?: string;
|
| 54 |
cookies?: CookieParam[];
|
| 55 |
favorScreenshot?: boolean;
|
| 56 |
+
waitForSelector?: string;
|
| 57 |
}
|
| 58 |
|
| 59 |
|
|
|
|
| 144 |
this.browser.once('disconnected', () => {
|
| 145 |
this.logger.warn(`Browser disconnected`);
|
| 146 |
this.emit('crippled');
|
| 147 |
+
process.nextTick(() => this.serviceReady());
|
| 148 |
});
|
| 149 |
this.logger.info(`Browser launched: ${this.browser.process()?.pid}`);
|
| 150 |
|
|
|
|
| 346 |
{ ...options, url: parsedUrl }
|
| 347 |
);
|
| 348 |
});
|
| 349 |
+
if (options?.waitForSelector) {
|
| 350 |
+
page.waitForSelector(options.waitForSelector)
|
| 351 |
+
.then(async () => {
|
| 352 |
+
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
| 353 |
+
screenshot = await page.screenshot();
|
| 354 |
+
finalized = true;
|
| 355 |
+
nextSnapshotDeferred.resolve(snapshot);
|
| 356 |
+
})
|
| 357 |
+
.catch((err) => {
|
| 358 |
+
this.logger.warn(`Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) });
|
| 359 |
+
});
|
| 360 |
+
}
|
| 361 |
|
| 362 |
try {
|
| 363 |
let lastHTML = snapshot?.html;
|
|
|
|
| 408 |
|
| 409 |
return true;
|
| 410 |
}
|
| 411 |
+
|
| 412 |
+
/**
 * Narrow a page snapshot down to the first element matching a CSS selector.
 *
 * The snapshot is returned untouched when no selector is given, when the
 * snapshot carries no HTML, when the selector is not valid CSS, or when no
 * element matches. Otherwise a shallow copy is returned whose `html`, `text`,
 * `parsed` and `imgs` fields describe only the matched element; every other
 * field is carried over from the input snapshot.
 *
 * @param snapshot - Snapshot to narrow; may be undefined.
 * @param targetSelect - CSS selector of the element to keep.
 * @returns The narrowed snapshot, or the input snapshot when narrowing is not possible.
 */
narrowSnapshot(snapshot: PageSnapshot | undefined, targetSelect?: string): PageSnapshot | undefined {
    if (!targetSelect || !snapshot?.html) {
        return snapshot;
    }

    const jsdom = new JSDOM(snapshot.html, { url: snapshot.href });

    let elem;
    try {
        elem = jsdom.window.document.querySelector(targetSelect);
    } catch (err: any) {
        // querySelector throws on a malformed selector. Degrade to the full
        // snapshot, the same outcome as a selector that matches nothing.
        this.logger.warn(`Invalid target selector ${targetSelect}`, { err: marshalErrorLike(err) });

        return snapshot;
    }

    if (!elem) {
        return snapshot;
    }

    // Re-parse the element on its own so Readability only sees the selected subtree.
    const selectedJsDom = new JSDOM(elem.outerHTML, { url: snapshot.href });
    let parsed;
    try {
        parsed = new Readability(selectedJsDom.window.document).parse();
    } catch (err: any) {
        // Best effort: a Readability failure leaves `parsed` undefined.
        this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
    }

    // No innerText in jsdom
    // https://github.com/jsdom/jsdom/issues/1245
    const textContent = elem.textContent;
    const cleanedText = textContent
        ?.split('\n')
        .map((line) => line.trimEnd())
        .join('\n')
        .replace(/\n{3,}/g, '\n\n');

    // Collect image URLs referenced inside the element. Attribute values are
    // kept verbatim and also resolved against the page URL.
    // NOTE(review): snapshot.imgs[].src presumably holds browser-resolved
    // absolute URLs, which a relative attribute value would never equal —
    // confirm against the snapshot producer.
    const imageSet = new Set<string>();
    for (const img of Array.from(elem.querySelectorAll('img[src],img[data-src]'))) {
        for (const attr of ['src', 'data-src']) {
            const raw = img.getAttribute(attr);
            if (!raw) {
                continue;
            }
            imageSet.add(raw);
            try {
                imageSet.add(new URL(raw, snapshot.href).toString());
            } catch {
                // Unresolvable URL (or missing base): only the raw value is usable.
            }
        }
    }

    return {
        ...snapshot,
        parsed,
        html: elem.outerHTML,
        text: cleanedText,
        imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
    } as PageSnapshot;
}
|
| 454 |
}
|
| 455 |
|
| 456 |
const puppeteerControl = container.resolve(PuppeteerControl);
|
backend/functions/src/types.d.ts
CHANGED
|
@@ -7,3 +7,10 @@ declare module 'langdetect' {
|
|
| 7 |
export function detect(text: string): DetectionResult[];
|
| 8 |
export function detectOne(text: string): string | null;
|
| 9 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
export function detect(text: string): DetectionResult[];
|
| 8 |
export function detectOne(text: string): string | null;
|
| 9 |
}
|
| 10 |
+
|
| 11 |
+
/**
 * Minimal ambient typing for the `jsdom` package, covering only the
 * constructor and the `window` property.
 */
declare module 'jsdom' {
    export class JSDOM {
        /** Global object of the parsed document, typed like the browser `window`. */
        window: typeof window;

        /**
         * @param html - Markup to parse.
         * @param options - Constructor options; left untyped in this declaration.
         */
        constructor(html: string, options?: any);
    }
}
|