nomagick committed on
Commit
6f65083
·
unverified ·
1 Parent(s): 029f568

feat: control cache tolerance and select target using headers

Browse files
README.md CHANGED
@@ -59,7 +59,11 @@ As you have already seen above, one can control the behavior of the Reader API u
59
  - `x-respond-with: text` returns `document.body.innerText`
60
  - `x-respond-with: screenshot` returns the URL of the webpage's screenshot
61
  - You can specify a proxy server via the `x-proxy-url` header.
62
- - You can bypass the cached page (lifetime 300s) via the `x-no-cache` header.
 
 
 
 
63
 
64
 
65
  ### Streaming mode
 
59
  - `x-respond-with: text` returns `document.body.innerText`
60
  - `x-respond-with: screenshot` returns the URL of the webpage's screenshot
61
  - You can specify a proxy server via the `x-proxy-url` header.
62
+ - You can customize cache tolerance via the `x-cache-tolerance` header (integer in seconds).
63
+ - You can bypass the cached page (lifetime 3600s) via the `x-no-cache: true` header (equivalent to `x-cache-tolerance: 0`).
64
+ - If you already know the HTML structure of your target page, you may specify `x-target-selector` or `x-wait-for-selector` to direct the Reader API to focus on a specific part of the page.
65
+ - By setting the `x-target-selector` header to a CSS selector, the Reader API returns the content within the matched element instead of the full HTML. Setting this header is useful when the automatic content extraction fails to capture the desired content and you can manually select the correct target.
66
+ - By setting the `x-wait-for-selector` header to a CSS selector, the Reader API will wait until the matched element is rendered before returning the content. If you already specified `x-target-selector`, this header can be omitted if you plan to wait for the same element.
67
 
68
 
69
  ### Streaming mode
backend/functions/package-lock.json CHANGED
@@ -23,6 +23,7 @@
23
  "generic-pool": "^3.9.0",
24
  "htmlparser2": "^9.0.0",
25
  "jose": "^5.1.0",
 
26
  "langdetect": "^0.2.1",
27
  "maxmind": "^4.3.18",
28
  "minio": "^7.1.3",
@@ -4036,6 +4037,17 @@
4036
  "node": ">= 8"
4037
  }
4038
  },
 
 
 
 
 
 
 
 
 
 
 
4039
  "node_modules/data-uri-to-buffer": {
4040
  "version": "6.0.2",
4041
  "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
@@ -4044,6 +4056,41 @@
4044
  "node": ">= 14"
4045
  }
4046
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4047
  "node_modules/data-view-buffer": {
4048
  "version": "1.0.1",
4049
  "resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.1.tgz",
@@ -4116,6 +4163,11 @@
4116
  }
4117
  }
4118
  },
 
 
 
 
 
4119
  "node_modules/decode-uri-component": {
4120
  "version": "0.2.2",
4121
  "resolved": "https://registry.npmjs.org/decode-uri-component/-/decode-uri-component-0.2.2.tgz",
@@ -6119,6 +6171,17 @@
6119
  "node": ">= 0.4"
6120
  }
6121
  },
 
 
 
 
 
 
 
 
 
 
 
6122
  "node_modules/html-escaper": {
6123
  "version": "2.0.2",
6124
  "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz",
@@ -6307,7 +6370,6 @@
6307
  "version": "0.6.3",
6308
  "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
6309
  "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
6310
- "optional": true,
6311
  "dependencies": {
6312
  "safer-buffer": ">= 2.1.2 < 3.0.0"
6313
  },
@@ -6705,6 +6767,11 @@
6705
  "node": ">=0.10.0"
6706
  }
6707
  },
 
 
 
 
 
6708
  "node_modules/is-regex": {
6709
  "version": "1.1.4",
6710
  "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz",
@@ -7573,6 +7640,91 @@
7573
  "node": ">=0.1.90"
7574
  }
7575
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7576
  "node_modules/jsesc": {
7577
  "version": "2.5.2",
7578
  "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-2.5.2.tgz",
@@ -8671,6 +8823,11 @@
8671
  "set-blocking": "^2.0.0"
8672
  }
8673
  },
 
 
 
 
 
8674
  "node_modules/object-assign": {
8675
  "version": "4.1.1",
8676
  "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
@@ -8985,6 +9142,17 @@
8985
  "url": "https://github.com/sponsors/sindresorhus"
8986
  }
8987
  },
 
 
 
 
 
 
 
 
 
 
 
8988
  "node_modules/parseurl": {
8989
  "version": "1.3.3",
8990
  "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
@@ -10185,6 +10353,11 @@
10185
  "url": "https://github.com/sponsors/isaacs"
10186
  }
10187
  },
 
 
 
 
 
10188
  "node_modules/run-parallel": {
10189
  "version": "1.2.0",
10190
  "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz",
@@ -10281,6 +10454,17 @@
10281
  "resolved": "https://registry.npmjs.org/sax/-/sax-1.3.0.tgz",
10282
  "integrity": "sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA=="
10283
  },
 
 
 
 
 
 
 
 
 
 
 
10284
  "node_modules/semver": {
10285
  "version": "7.6.0",
10286
  "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.0.tgz",
@@ -10941,6 +11125,11 @@
10941
  "url": "https://github.com/sponsors/ljharb"
10942
  }
10943
  },
 
 
 
 
 
10944
  "node_modules/tar": {
10945
  "version": "6.2.1",
10946
  "resolved": "https://registry.npmjs.org/tar/-/tar-6.2.1.tgz",
@@ -11589,6 +11778,17 @@
11589
  "node": ">= 0.8"
11590
  }
11591
  },
 
 
 
 
 
 
 
 
 
 
 
11592
  "node_modules/walker": {
11593
  "version": "1.0.8",
11594
  "resolved": "https://registry.npmjs.org/walker/-/walker-1.0.8.tgz",
@@ -11622,7 +11822,6 @@
11622
  "version": "7.0.0",
11623
  "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz",
11624
  "integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==",
11625
- "peer": true,
11626
  "engines": {
11627
  "node": ">=12"
11628
  }
@@ -11648,6 +11847,25 @@
11648
  "node": ">=0.8.0"
11649
  }
11650
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11651
  "node_modules/whatwg-url": {
11652
  "version": "11.0.0",
11653
  "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-11.0.0.tgz",
@@ -11778,6 +11996,14 @@
11778
  "resolved": "https://registry.npmjs.org/xml/-/xml-1.0.1.tgz",
11779
  "integrity": "sha512-huCv9IH9Tcf95zuYCsQraZtWnJvBtLVE0QHMOs8bWyZAFZNDcYjsPq1nEx8jKA9y+Beo9v+7OBPRisQTjinQMw=="
11780
  },
 
 
 
 
 
 
 
 
11781
  "node_modules/xml2js": {
11782
  "version": "0.5.0",
11783
  "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz",
@@ -11798,6 +12024,11 @@
11798
  "node": ">=4.0"
11799
  }
11800
  },
 
 
 
 
 
11801
  "node_modules/y18n": {
11802
  "version": "5.0.8",
11803
  "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
 
23
  "generic-pool": "^3.9.0",
24
  "htmlparser2": "^9.0.0",
25
  "jose": "^5.1.0",
26
+ "jsdom": "^24.0.0",
27
  "langdetect": "^0.2.1",
28
  "maxmind": "^4.3.18",
29
  "minio": "^7.1.3",
 
4037
  "node": ">= 8"
4038
  }
4039
  },
4040
+ "node_modules/cssstyle": {
4041
+ "version": "4.0.1",
4042
+ "resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-4.0.1.tgz",
4043
+ "integrity": "sha512-8ZYiJ3A/3OkDd093CBT/0UKDWry7ak4BdPTFP2+QEP7cmhouyq/Up709ASSj2cK02BbZiMgk7kYjZNS4QP5qrQ==",
4044
+ "dependencies": {
4045
+ "rrweb-cssom": "^0.6.0"
4046
+ },
4047
+ "engines": {
4048
+ "node": ">=18"
4049
+ }
4050
+ },
4051
  "node_modules/data-uri-to-buffer": {
4052
  "version": "6.0.2",
4053
  "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
 
4056
  "node": ">= 14"
4057
  }
4058
  },
4059
+ "node_modules/data-urls": {
4060
+ "version": "5.0.0",
4061
+ "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-5.0.0.tgz",
4062
+ "integrity": "sha512-ZYP5VBHshaDAiVZxjbRVcFJpc+4xGgT0bK3vzy1HLN8jTO975HEbuYzZJcHoQEY5K1a0z8YayJkyVETa08eNTg==",
4063
+ "dependencies": {
4064
+ "whatwg-mimetype": "^4.0.0",
4065
+ "whatwg-url": "^14.0.0"
4066
+ },
4067
+ "engines": {
4068
+ "node": ">=18"
4069
+ }
4070
+ },
4071
+ "node_modules/data-urls/node_modules/tr46": {
4072
+ "version": "5.0.0",
4073
+ "resolved": "https://registry.npmjs.org/tr46/-/tr46-5.0.0.tgz",
4074
+ "integrity": "sha512-tk2G5R2KRwBd+ZN0zaEXpmzdKyOYksXwywulIX95MBODjSzMIuQnQ3m8JxgbhnL1LeVo7lqQKsYa1O3Htl7K5g==",
4075
+ "dependencies": {
4076
+ "punycode": "^2.3.1"
4077
+ },
4078
+ "engines": {
4079
+ "node": ">=18"
4080
+ }
4081
+ },
4082
+ "node_modules/data-urls/node_modules/whatwg-url": {
4083
+ "version": "14.0.0",
4084
+ "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.0.0.tgz",
4085
+ "integrity": "sha512-1lfMEm2IEr7RIV+f4lUNPOqfFL+pO+Xw3fJSqmjX9AbXcXcYOkCe1P6+9VBZB6n94af16NfZf+sSk0JCBZC9aw==",
4086
+ "dependencies": {
4087
+ "tr46": "^5.0.0",
4088
+ "webidl-conversions": "^7.0.0"
4089
+ },
4090
+ "engines": {
4091
+ "node": ">=18"
4092
+ }
4093
+ },
4094
  "node_modules/data-view-buffer": {
4095
  "version": "1.0.1",
4096
  "resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.1.tgz",
 
4163
  }
4164
  }
4165
  },
4166
+ "node_modules/decimal.js": {
4167
+ "version": "10.4.3",
4168
+ "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.4.3.tgz",
4169
+ "integrity": "sha512-VBBaLc1MgL5XpzgIP7ny5Z6Nx3UrRkIViUkPUdtl9aya5amy3De1gsUUSB1g3+3sExYNjCAsAznmukyxCb1GRA=="
4170
+ },
4171
  "node_modules/decode-uri-component": {
4172
  "version": "0.2.2",
4173
  "resolved": "https://registry.npmjs.org/decode-uri-component/-/decode-uri-component-0.2.2.tgz",
 
6171
  "node": ">= 0.4"
6172
  }
6173
  },
6174
+ "node_modules/html-encoding-sniffer": {
6175
+ "version": "4.0.0",
6176
+ "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-4.0.0.tgz",
6177
+ "integrity": "sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==",
6178
+ "dependencies": {
6179
+ "whatwg-encoding": "^3.1.1"
6180
+ },
6181
+ "engines": {
6182
+ "node": ">=18"
6183
+ }
6184
+ },
6185
  "node_modules/html-escaper": {
6186
  "version": "2.0.2",
6187
  "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz",
 
6370
  "version": "0.6.3",
6371
  "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
6372
  "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
 
6373
  "dependencies": {
6374
  "safer-buffer": ">= 2.1.2 < 3.0.0"
6375
  },
 
6767
  "node": ">=0.10.0"
6768
  }
6769
  },
6770
+ "node_modules/is-potential-custom-element-name": {
6771
+ "version": "1.0.1",
6772
+ "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz",
6773
+ "integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ=="
6774
+ },
6775
  "node_modules/is-regex": {
6776
  "version": "1.1.4",
6777
  "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz",
 
7640
  "node": ">=0.1.90"
7641
  }
7642
  },
7643
+ "node_modules/jsdom": {
7644
+ "version": "24.0.0",
7645
+ "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-24.0.0.tgz",
7646
+ "integrity": "sha512-UDS2NayCvmXSXVP6mpTj+73JnNQadZlr9N68189xib2tx5Mls7swlTNao26IoHv46BZJFvXygyRtyXd1feAk1A==",
7647
+ "dependencies": {
7648
+ "cssstyle": "^4.0.1",
7649
+ "data-urls": "^5.0.0",
7650
+ "decimal.js": "^10.4.3",
7651
+ "form-data": "^4.0.0",
7652
+ "html-encoding-sniffer": "^4.0.0",
7653
+ "http-proxy-agent": "^7.0.0",
7654
+ "https-proxy-agent": "^7.0.2",
7655
+ "is-potential-custom-element-name": "^1.0.1",
7656
+ "nwsapi": "^2.2.7",
7657
+ "parse5": "^7.1.2",
7658
+ "rrweb-cssom": "^0.6.0",
7659
+ "saxes": "^6.0.0",
7660
+ "symbol-tree": "^3.2.4",
7661
+ "tough-cookie": "^4.1.3",
7662
+ "w3c-xmlserializer": "^5.0.0",
7663
+ "webidl-conversions": "^7.0.0",
7664
+ "whatwg-encoding": "^3.1.1",
7665
+ "whatwg-mimetype": "^4.0.0",
7666
+ "whatwg-url": "^14.0.0",
7667
+ "ws": "^8.16.0",
7668
+ "xml-name-validator": "^5.0.0"
7669
+ },
7670
+ "engines": {
7671
+ "node": ">=18"
7672
+ },
7673
+ "peerDependencies": {
7674
+ "canvas": "^2.11.2"
7675
+ },
7676
+ "peerDependenciesMeta": {
7677
+ "canvas": {
7678
+ "optional": true
7679
+ }
7680
+ }
7681
+ },
7682
+ "node_modules/jsdom/node_modules/agent-base": {
7683
+ "version": "7.1.1",
7684
+ "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.1.tgz",
7685
+ "integrity": "sha512-H0TSyFNDMomMNJQBn8wFV5YC/2eJ+VXECwOadZJT554xP6cODZHPX3H9QMQECxvrgiSOP1pHjy1sMWQVYJOUOA==",
7686
+ "dependencies": {
7687
+ "debug": "^4.3.4"
7688
+ },
7689
+ "engines": {
7690
+ "node": ">= 14"
7691
+ }
7692
+ },
7693
+ "node_modules/jsdom/node_modules/https-proxy-agent": {
7694
+ "version": "7.0.4",
7695
+ "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.4.tgz",
7696
+ "integrity": "sha512-wlwpilI7YdjSkWaQ/7omYBMTliDcmCN8OLihO6I9B86g06lMyAoqgoDpV0XqoaPOKj+0DIdAvnsWfyAAhmimcg==",
7697
+ "dependencies": {
7698
+ "agent-base": "^7.0.2",
7699
+ "debug": "4"
7700
+ },
7701
+ "engines": {
7702
+ "node": ">= 14"
7703
+ }
7704
+ },
7705
+ "node_modules/jsdom/node_modules/tr46": {
7706
+ "version": "5.0.0",
7707
+ "resolved": "https://registry.npmjs.org/tr46/-/tr46-5.0.0.tgz",
7708
+ "integrity": "sha512-tk2G5R2KRwBd+ZN0zaEXpmzdKyOYksXwywulIX95MBODjSzMIuQnQ3m8JxgbhnL1LeVo7lqQKsYa1O3Htl7K5g==",
7709
+ "dependencies": {
7710
+ "punycode": "^2.3.1"
7711
+ },
7712
+ "engines": {
7713
+ "node": ">=18"
7714
+ }
7715
+ },
7716
+ "node_modules/jsdom/node_modules/whatwg-url": {
7717
+ "version": "14.0.0",
7718
+ "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.0.0.tgz",
7719
+ "integrity": "sha512-1lfMEm2IEr7RIV+f4lUNPOqfFL+pO+Xw3fJSqmjX9AbXcXcYOkCe1P6+9VBZB6n94af16NfZf+sSk0JCBZC9aw==",
7720
+ "dependencies": {
7721
+ "tr46": "^5.0.0",
7722
+ "webidl-conversions": "^7.0.0"
7723
+ },
7724
+ "engines": {
7725
+ "node": ">=18"
7726
+ }
7727
+ },
7728
  "node_modules/jsesc": {
7729
  "version": "2.5.2",
7730
  "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-2.5.2.tgz",
 
8823
  "set-blocking": "^2.0.0"
8824
  }
8825
  },
8826
+ "node_modules/nwsapi": {
8827
+ "version": "2.2.10",
8828
+ "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.10.tgz",
8829
+ "integrity": "sha512-QK0sRs7MKv0tKe1+5uZIQk/C8XGza4DAnztJG8iD+TpJIORARrCxczA738awHrZoHeTjSSoHqao2teO0dC/gFQ=="
8830
+ },
8831
  "node_modules/object-assign": {
8832
  "version": "4.1.1",
8833
  "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
 
9142
  "url": "https://github.com/sponsors/sindresorhus"
9143
  }
9144
  },
9145
+ "node_modules/parse5": {
9146
+ "version": "7.1.2",
9147
+ "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz",
9148
+ "integrity": "sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==",
9149
+ "dependencies": {
9150
+ "entities": "^4.4.0"
9151
+ },
9152
+ "funding": {
9153
+ "url": "https://github.com/inikulin/parse5?sponsor=1"
9154
+ }
9155
+ },
9156
  "node_modules/parseurl": {
9157
  "version": "1.3.3",
9158
  "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
 
10353
  "url": "https://github.com/sponsors/isaacs"
10354
  }
10355
  },
10356
+ "node_modules/rrweb-cssom": {
10357
+ "version": "0.6.0",
10358
+ "resolved": "https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.6.0.tgz",
10359
+ "integrity": "sha512-APM0Gt1KoXBz0iIkkdB/kfvGOwC4UuJFeG/c+yV7wSc7q96cG/kJ0HiYCnzivD9SB53cLV1MlHFNfOuPaadYSw=="
10360
+ },
10361
  "node_modules/run-parallel": {
10362
  "version": "1.2.0",
10363
  "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz",
 
10454
  "resolved": "https://registry.npmjs.org/sax/-/sax-1.3.0.tgz",
10455
  "integrity": "sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA=="
10456
  },
10457
+ "node_modules/saxes": {
10458
+ "version": "6.0.0",
10459
+ "resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz",
10460
+ "integrity": "sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==",
10461
+ "dependencies": {
10462
+ "xmlchars": "^2.2.0"
10463
+ },
10464
+ "engines": {
10465
+ "node": ">=v12.22.7"
10466
+ }
10467
+ },
10468
  "node_modules/semver": {
10469
  "version": "7.6.0",
10470
  "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.0.tgz",
 
11125
  "url": "https://github.com/sponsors/ljharb"
11126
  }
11127
  },
11128
+ "node_modules/symbol-tree": {
11129
+ "version": "3.2.4",
11130
+ "resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz",
11131
+ "integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw=="
11132
+ },
11133
  "node_modules/tar": {
11134
  "version": "6.2.1",
11135
  "resolved": "https://registry.npmjs.org/tar/-/tar-6.2.1.tgz",
 
11778
  "node": ">= 0.8"
11779
  }
11780
  },
11781
+ "node_modules/w3c-xmlserializer": {
11782
+ "version": "5.0.0",
11783
+ "resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-5.0.0.tgz",
11784
+ "integrity": "sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA==",
11785
+ "dependencies": {
11786
+ "xml-name-validator": "^5.0.0"
11787
+ },
11788
+ "engines": {
11789
+ "node": ">=18"
11790
+ }
11791
+ },
11792
  "node_modules/walker": {
11793
  "version": "1.0.8",
11794
  "resolved": "https://registry.npmjs.org/walker/-/walker-1.0.8.tgz",
 
11822
  "version": "7.0.0",
11823
  "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz",
11824
  "integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==",
 
11825
  "engines": {
11826
  "node": ">=12"
11827
  }
 
11847
  "node": ">=0.8.0"
11848
  }
11849
  },
11850
+ "node_modules/whatwg-encoding": {
11851
+ "version": "3.1.1",
11852
+ "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz",
11853
+ "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==",
11854
+ "dependencies": {
11855
+ "iconv-lite": "0.6.3"
11856
+ },
11857
+ "engines": {
11858
+ "node": ">=18"
11859
+ }
11860
+ },
11861
+ "node_modules/whatwg-mimetype": {
11862
+ "version": "4.0.0",
11863
+ "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz",
11864
+ "integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==",
11865
+ "engines": {
11866
+ "node": ">=18"
11867
+ }
11868
+ },
11869
  "node_modules/whatwg-url": {
11870
  "version": "11.0.0",
11871
  "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-11.0.0.tgz",
 
11996
  "resolved": "https://registry.npmjs.org/xml/-/xml-1.0.1.tgz",
11997
  "integrity": "sha512-huCv9IH9Tcf95zuYCsQraZtWnJvBtLVE0QHMOs8bWyZAFZNDcYjsPq1nEx8jKA9y+Beo9v+7OBPRisQTjinQMw=="
11998
  },
11999
+ "node_modules/xml-name-validator": {
12000
+ "version": "5.0.0",
12001
+ "resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-5.0.0.tgz",
12002
+ "integrity": "sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==",
12003
+ "engines": {
12004
+ "node": ">=18"
12005
+ }
12006
+ },
12007
  "node_modules/xml2js": {
12008
  "version": "0.5.0",
12009
  "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz",
 
12024
  "node": ">=4.0"
12025
  }
12026
  },
12027
+ "node_modules/xmlchars": {
12028
+ "version": "2.2.0",
12029
+ "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz",
12030
+ "integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw=="
12031
+ },
12032
  "node_modules/y18n": {
12033
  "version": "5.0.8",
12034
  "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
backend/functions/package.json CHANGED
@@ -43,6 +43,7 @@
43
  "generic-pool": "^3.9.0",
44
  "htmlparser2": "^9.0.0",
45
  "jose": "^5.1.0",
 
46
  "langdetect": "^0.2.1",
47
  "maxmind": "^4.3.18",
48
  "minio": "^7.1.3",
 
43
  "generic-pool": "^3.9.0",
44
  "htmlparser2": "^9.0.0",
45
  "jose": "^5.1.0",
46
+ "jsdom": "^24.0.0",
47
  "langdetect": "^0.2.1",
48
  "maxmind": "^4.3.18",
49
  "minio": "^7.1.3",
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -24,6 +24,10 @@ import { countGPTToken as estimateToken } from '../shared/utils/openai';
24
 
25
  const md5Hasher = new HashManager('md5', 'hex');
26
 
 
 
 
 
27
  @singleton()
28
  export class CrawlerHost extends RPCHost {
29
  logger = this.globalLogger.child({ service: this.constructor.name });
@@ -31,7 +35,7 @@ export class CrawlerHost extends RPCHost {
31
  turnDownPlugins = [require('turndown-plugin-gfm').tables];
32
 
33
  cacheRetentionMs = 1000 * 3600 * 24 * 7;
34
- cacheValidMs = 1000 * 300;
35
  urlValidMs = 1000 * 3600 * 4;
36
 
37
  indexText = `[Usage1] https://r.jina.ai/YOUR_URL
@@ -299,8 +303,13 @@ ${this.content}
299
  in: 'header',
300
  schema: { type: 'string' }
301
  },
 
 
 
 
 
302
  'X-No-Cache': {
303
- description: `Ignores internal cache if this header is specified with a value.`,
304
  in: 'header',
305
  schema: { type: 'string' }
306
  },
@@ -315,6 +324,20 @@ ${this.content}
315
  in: 'header',
316
  schema: { type: 'string' }
317
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  'X-Proxy-Url': {
319
  description: `Specifies your custom proxy if you prefer to use one. \n\n` +
320
  `Supported protocols:\n` +
@@ -426,7 +449,15 @@ ${this.content}
426
  const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
427
  const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
428
  const noCache = Boolean(ctx.req.get('x-no-cache'));
429
- const cacheTolerance = noCache ? 0 : this.cacheValidMs;
 
 
 
 
 
 
 
 
430
  const cookies: CookieParam[] = [];
431
  const setCookieHeaders = ctx.req.headers['x-set-cookie'];
432
  if (Array.isArray(setCookieHeaders)) {
@@ -444,10 +475,12 @@ ${this.content}
444
  }
445
  this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
446
 
447
- const crawlOpts: ScrappingOptions = {
448
  proxyUrl: ctx.req.get('x-proxy-url'),
449
  cookies,
450
- favorScreenshot: customMode === 'screenshot'
 
 
451
  };
452
 
453
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
@@ -484,7 +517,7 @@ ${this.content}
484
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
485
  for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
486
  lastScrapped = scrapped;
487
- if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
488
  continue;
489
  }
490
 
@@ -506,7 +539,7 @@ ${this.content}
506
 
507
  for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
508
  lastScrapped = scrapped;
509
- if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
510
  continue;
511
  }
512
 
@@ -642,24 +675,32 @@ ${this.content}
642
  return r;
643
  }
644
 
645
- async *cachedScrap(urlToCrawl: URL, crawlOpts?: ScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
646
  let cache;
647
  if (cacheTolerance && !crawlOpts?.cookies?.length) {
648
  cache = await this.queryCache(urlToCrawl, cacheTolerance);
649
  }
650
 
651
  if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
652
- yield cache.snapshot;
653
 
654
  return;
655
  }
656
 
657
  try {
 
 
 
 
 
 
 
 
658
  yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
659
  } catch (err: any) {
660
  if (cache) {
661
  this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
662
- yield cache.snapshot;
663
  return;
664
  }
665
  throw err;
 
24
 
25
  const md5Hasher = new HashManager('md5', 'hex');
26
 
27
/**
 * Scrapping options as accepted by the crawler host: everything
 * `ScrappingOptions` carries, plus an optional selector used to narrow the
 * returned snapshot.
 */
export interface ExtraScrappingOptions extends ScrappingOptions {
    /**
     * CSS selector of the element the snapshot should be narrowed to.
     * When omitted, snapshots are passed through unchanged.
     */
    targetSelector?: string;
}
30
+
31
  @singleton()
32
  export class CrawlerHost extends RPCHost {
33
  logger = this.globalLogger.child({ service: this.constructor.name });
 
35
  turnDownPlugins = [require('turndown-plugin-gfm').tables];
36
 
37
  cacheRetentionMs = 1000 * 3600 * 24 * 7;
38
+ cacheValidMs = 1000 * 3600;
39
  urlValidMs = 1000 * 3600 * 4;
40
 
41
  indexText = `[Usage1] https://r.jina.ai/YOUR_URL
 
303
  in: 'header',
304
  schema: { type: 'string' }
305
  },
306
+ 'X-Cache-Tolerance': {
307
+ description: `Sets internal cache tolerance in seconds if this header is specified with a integer.`,
308
+ in: 'header',
309
+ schema: { type: 'string' }
310
+ },
311
  'X-No-Cache': {
312
+ description: `Ignores internal cache if this header is specified with a value.\n\nEquivalent to X-Cache-Tolerance: 0`,
313
  in: 'header',
314
  schema: { type: 'string' }
315
  },
 
324
  in: 'header',
325
  schema: { type: 'string' }
326
  },
327
+ 'X-Wait-For-Selector': {
328
+ description: `Specifies a CSS selector to wait for the appearance of such an element before returning. \n\n` +
329
+ 'Example: `X-Wait-For-Selector: .content-block`\n'
330
+ ,
331
+ in: 'header',
332
+ schema: { type: 'string' }
333
+ },
334
+ 'X-Target-Selector': {
335
+ description: `Specifies a CSS selector for return target instead of the full html. \n\n` +
336
+ 'Implies `X-Wait-For-Selector: (same selector)`'
337
+ ,
338
+ in: 'header',
339
+ schema: { type: 'string' }
340
+ },
341
  'X-Proxy-Url': {
342
  description: `Specifies your custom proxy if you prefer to use one. \n\n` +
343
  `Supported protocols:\n` +
 
449
  const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
450
  const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
451
  const noCache = Boolean(ctx.req.get('x-no-cache'));
452
+ let cacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000;
453
+ if (isNaN(cacheTolerance)) {
454
+ cacheTolerance = this.cacheValidMs;
455
+ if (noCache) {
456
+ cacheTolerance = 0;
457
+ }
458
+ }
459
+ const targetSelector = ctx.req.get('x-target-selector') || undefined;
460
+ const waitForSelector = ctx.req.get('x-wait-for-selector') || targetSelector;
461
  const cookies: CookieParam[] = [];
462
  const setCookieHeaders = ctx.req.headers['x-set-cookie'];
463
  if (Array.isArray(setCookieHeaders)) {
 
475
  }
476
  this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
477
 
478
+ const crawlOpts: ExtraScrappingOptions = {
479
  proxyUrl: ctx.req.get('x-proxy-url'),
480
  cookies,
481
+ favorScreenshot: customMode === 'screenshot',
482
+ waitForSelector,
483
+ targetSelector,
484
  };
485
 
486
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
 
517
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
518
  for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
519
  lastScrapped = scrapped;
520
+ if (waitForSelector || !scrapped?.parsed?.content || !(scrapped.title?.trim())) {
521
  continue;
522
  }
523
 
 
539
 
540
  for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, cacheTolerance)) {
541
  lastScrapped = scrapped;
542
+ if (waitForSelector || !scrapped?.parsed?.content || !(scrapped.title?.trim())) {
543
  continue;
544
  }
545
 
 
675
  return r;
676
  }
677
 
678
+ async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
679
  let cache;
680
  if (cacheTolerance && !crawlOpts?.cookies?.length) {
681
  cache = await this.queryCache(urlToCrawl, cacheTolerance);
682
  }
683
 
684
  if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
685
+ yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts?.targetSelector);
686
 
687
  return;
688
  }
689
 
690
  try {
691
+ if (crawlOpts?.targetSelector) {
692
+ for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
693
+ yield this.puppeteerControl.narrowSnapshot(x, crawlOpts.targetSelector);
694
+ }
695
+
696
+ return;
697
+ }
698
+
699
  yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
700
  } catch (err: any) {
701
  if (cache) {
702
  this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
703
+ yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts?.targetSelector);
704
  return;
705
  }
706
  throw err;
backend/functions/src/cloud-functions/searcher.ts CHANGED
@@ -181,7 +181,13 @@ export class SearcherHost extends RPCHost {
181
  const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
182
  const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
183
  const noCache = Boolean(ctx.req.get('x-no-cache'));
184
- const pageCacheTolerance = noCache ? 0 : this.pageCacheToleranceMs;
 
 
 
 
 
 
185
  const cookies: CookieParam[] = [];
186
  const setCookieHeaders = ctx.req.headers['x-set-cookie'];
187
  if (Array.isArray(setCookieHeaders)) {
 
181
  const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
182
  const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
183
  const noCache = Boolean(ctx.req.get('x-no-cache'));
184
+ let pageCacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000;
185
+ if (isNaN(pageCacheTolerance)) {
186
+ pageCacheTolerance = this.pageCacheToleranceMs;
187
+ if (noCache) {
188
+ pageCacheTolerance = 0;
189
+ }
190
+ }
191
  const cookies: CookieParam[] = [];
192
  const setCookieHeaders = ctx.req.headers['x-set-cookie'];
193
  if (Array.isArray(setCookieHeaders)) {
backend/functions/src/services/puppeteer.ts CHANGED
@@ -4,6 +4,7 @@ import { container, singleton } from 'tsyringe';
4
  import genericPool from 'generic-pool';
5
  import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
6
  import { Logger } from '../shared/services/logger';
 
7
 
8
  import type { Browser, CookieParam, Page } from 'puppeteer';
9
  import puppeteer from 'puppeteer-extra';
@@ -11,7 +12,7 @@ import puppeteer from 'puppeteer-extra';
11
  import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
12
  import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
13
  import { ServiceCrashedError } from '../shared/lib/errors';
14
-
15
 
16
  const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
17
 
@@ -52,6 +53,7 @@ export interface ScrappingOptions {
52
  proxyUrl?: string;
53
  cookies?: CookieParam[];
54
  favorScreenshot?: boolean;
 
55
  }
56
 
57
 
@@ -142,7 +144,7 @@ export class PuppeteerControl extends AsyncService {
142
  this.browser.once('disconnected', () => {
143
  this.logger.warn(`Browser disconnected`);
144
  this.emit('crippled');
145
- process.nextTick(()=> this.serviceReady());
146
  });
147
  this.logger.info(`Browser launched: ${this.browser.process()?.pid}`);
148
 
@@ -344,6 +346,18 @@ document.addEventListener('load', handlePageLoad);
344
  { ...options, url: parsedUrl }
345
  );
346
  });
 
 
 
 
 
 
 
 
 
 
 
 
347
 
348
  try {
349
  let lastHTML = snapshot?.html;
@@ -394,6 +408,49 @@ document.addEventListener('load', handlePageLoad);
394
 
395
  return true;
396
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  }
398
 
399
  const puppeteerControl = container.resolve(PuppeteerControl);
 
4
  import genericPool from 'generic-pool';
5
  import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
6
  import { Logger } from '../shared/services/logger';
7
+ import { JSDOM } from 'jsdom';
8
 
9
  import type { Browser, CookieParam, Page } from 'puppeteer';
10
  import puppeteer from 'puppeteer-extra';
 
12
  import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
13
  import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
14
  import { ServiceCrashedError } from '../shared/lib/errors';
15
+ import { Readability } from '@mozilla/readability';
16
 
17
  const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
18
 
 
53
  proxyUrl?: string;
54
  cookies?: CookieParam[];
55
  favorScreenshot?: boolean;
56
+ waitForSelector?: string;
57
  }
58
 
59
 
 
144
  this.browser.once('disconnected', () => {
145
  this.logger.warn(`Browser disconnected`);
146
  this.emit('crippled');
147
+ process.nextTick(() => this.serviceReady());
148
  });
149
  this.logger.info(`Browser launched: ${this.browser.process()?.pid}`);
150
 
 
346
  { ...options, url: parsedUrl }
347
  );
348
  });
349
+ if (options?.waitForSelector) {
350
+ page.waitForSelector(options.waitForSelector)
351
+ .then(async () => {
352
+ snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
353
+ screenshot = await page.screenshot();
354
+ finalized = true;
355
+ nextSnapshotDeferred.resolve(snapshot);
356
+ })
357
+ .catch((err) => {
358
+ this.logger.warn(`Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) });
359
+ });
360
+ }
361
 
362
  try {
363
  let lastHTML = snapshot?.html;
 
408
 
409
  return true;
410
  }
411
+
412
/**
 * Restricts a page snapshot to the first element matching a CSS selector.
 *
 * The snapshot HTML is re-parsed with jsdom, the matched element's markup is
 * run through Readability on its own, and a copy of the snapshot is returned
 * whose `parsed`, `html`, `text` and `imgs` fields describe only that element.
 *
 * The snapshot is returned unchanged when no selector is given, when the
 * snapshot has no HTML, when the selector matches nothing, or when the
 * selector is not valid CSS (a warning is logged in the last case).
 *
 * @param snapshot - Snapshot to narrow; may be undefined.
 * @param targetSelect - CSS selector of the element to keep.
 * @returns The narrowed copy, or the original snapshot in the fallback cases above.
 */
narrowSnapshot(snapshot: PageSnapshot | undefined, targetSelect?: string): PageSnapshot | undefined {
    if (!targetSelect || !snapshot?.html) {
        return snapshot;
    }

    const jsdom = new JSDOM(snapshot.html, { url: snapshot.href });
    try {
        let elem: Element | null;
        try {
            elem = jsdom.window.document.querySelector(targetSelect);
        } catch (err: any) {
            // The selector comes from a request header; querySelector throws on
            // malformed selectors. Treat that like "no match" instead of failing the request.
            this.logger.warn(`Invalid target selector ${targetSelect}`, { err: marshalErrorLike(err) });

            return snapshot;
        }

        if (!elem) {
            return snapshot;
        }

        const selectedHtml = elem.outerHTML;

        let parsed;
        const selectedJsDom = new JSDOM(selectedHtml, { url: snapshot.href });
        try {
            parsed = new Readability(selectedJsDom.window.document).parse();
        } catch (err: any) {
            this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
        } finally {
            // Release the temporary window; only plain data from `parsed` is kept.
            selectedJsDom.window.close();
        }

        // No innerText in jsdom
        // https://github.com/jsdom/jsdom/issues/1245
        const cleanedText = elem.textContent
            ?.split('\n')
            .map((line) => line.trimEnd())
            .join('\n')
            .replace(/\n{3,}/g, '\n\n');

        // Collect image URLs referenced inside the selected element. Both the raw
        // attribute value and its absolute form (resolved against the page URL) are
        // recorded. NOTE(review): the format of `snapshot.imgs[].src` is not visible
        // here; if it holds absolute URLs, raw relative attributes alone would never match.
        const imageSet = new Set<string>();
        for (const img of Array.from(elem.querySelectorAll('img[src],img[data-src]'))) {
            for (const attrName of ['src', 'data-src']) {
                const raw = img.getAttribute(attrName);
                if (!raw) {
                    continue;
                }
                imageSet.add(raw);
                try {
                    imageSet.add(new URL(raw, snapshot.href).href);
                } catch {
                    // Not resolvable as a URL; the raw value above is still kept.
                }
            }
        }

        return {
            ...snapshot,
            parsed,
            html: selectedHtml,
            text: cleanedText,
            imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
        } as PageSnapshot;
    } finally {
        // Everything needed has been copied out as strings; free the parsed document.
        jsdom.window.close();
    }
}
454
  }
455
 
456
  const puppeteerControl = container.resolve(PuppeteerControl);
backend/functions/src/types.d.ts CHANGED
@@ -7,3 +7,10 @@ declare module 'langdetect' {
7
  export function detect(text: string): DetectionResult[];
8
  export function detectOne(text: string): string | null;
9
  }
 
 
 
 
 
 
 
 
7
  export function detect(text: string): DetectionResult[];
8
  export function detectOne(text: string): string | null;
9
  }
10
+
11
/**
 * Minimal ambient typing for the `jsdom` package: only the `JSDOM` class,
 * with a constructor and a `window` property.
 */
declare module 'jsdom' {
    export class JSDOM {
        /** Typed with the type of the global `window`. */
        window: typeof window;

        /**
         * @param html - Markup to parse.
         * @param options - Untyped options passed through to jsdom.
         */
        constructor(html: string, options?: any);
    }
}
+ }