Spaces:
Build error
restructure: no longer a firebase application (#1160)
Browse files
* fix: fine allow redefining Function.prototype.toString
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* fix: contentType encoding
* wip
* fix: error throwing
* wip
* fix
* wip
* fix
* fix
* fix: jsdom
* wip
* wip
* fix: links summary uniqueness
* wip
* wip
* robots-txt catch no robots.txt
* deps: remove puppeteer-extra-plugin-stealth
* fix: don't change warning type
* fix: curl
* fix: replace firebase-roundtrip-check with blackhole-detector
* fix: black hole detection
* searcher: black hole detection
* fix: no h2c for searcher
* fix: bhd
* fix: search and crawl conflict
* fix: bhd
* fix
* fix: server script
* canvas: fixed avif issue
* logging: move some to debug
* fix
* fix: pptr declare ready only when page can be created without issues
* fix: bhd
* cd: cloud run deploy-health-check cannot complete pptr newPage
* cd: fix
* fix: curl body can be null
* fix
* fix
* fix: major fix regarding TC pdfs
* fix
* fix
* deps: fix civkit trie router issue
* fix
* boom: total restructure
* cd: fix docker ctx
* fix
* fix: switch to h2c
* cd: ensure http2
- .github/workflows/cd.yml +5 -7
- .gitignore +77 -2
- .vscode/launch.json +31 -28
- .vscode/tasks.json +3 -129
- backend/functions/Dockerfile → Dockerfile +0 -0
- README.md +0 -4
- backend/.firebaserc +0 -5
- backend/.gitignore +0 -79
- backend/firebase.json +0 -43
- backend/firestore.indexes.json +0 -19
- backend/firestore.rules +0 -32
- backend/functions/.dockerignore +0 -1
- backend/functions/.editorconfig +0 -36
- backend/functions/.env.example +0 -0
- backend/functions/.puppeteerrc.cjs +0 -9
- backend/functions/package.json +0 -93
- backend/functions/src/services/curl.ts +0 -218
- backend/functions/src/shared +0 -1
- backend/storage.rules +0 -8
- backend/functions/integrity-check.cjs → integrity-check.cjs +0 -0
- backend/functions/package-lock.json → package-lock.json +155 -83
- package.json +83 -14
- {backend/functions/public → public}/favicon.ico +0 -0
- {backend/functions/src/cloud-functions → src/api}/crawler.ts +284 -97
- {backend/functions/src/cloud-functions → src/api}/searcher-serper.ts +56 -56
- {backend/functions/src/cloud-functions → src/api}/searcher.ts +45 -47
- {backend/functions/src → src}/cloud-functions/adaptive-crawler.ts +1 -1
- {backend/functions/src → src}/cloud-functions/data-crunching.ts +1 -1
- {backend/functions/src → src}/db/adaptive-crawl-task.ts +0 -0
- {backend/functions/src → src}/db/crawled.ts +0 -0
- {backend/functions/src → src}/db/domain-blockade.ts +0 -0
- {backend/functions/src → src}/db/domain-profile.ts +1 -1
- {backend/functions/src → src}/db/img-alt.ts +0 -0
- {backend/functions/src → src}/db/pdf.ts +0 -0
- {backend/functions/src → src}/db/searched.ts +0 -0
- {backend/functions/src → src}/dto/adaptive-crawler-options.ts +0 -0
- backend/functions/src/dto/scrapping-options.ts → src/dto/crawler-options.ts +61 -46
- src/dto/jina-embeddings-auth.ts +216 -0
- {backend/functions/src → src}/fetch.d.ts +0 -0
- {backend/functions/src → src}/index.ts +0 -0
- src/lib/transform-server-event-stream.ts +169 -0
- {backend/functions/src → src}/services/alt-text.ts +0 -0
- src/services/async-context.ts +10 -0
- src/services/blackhole-detector.ts +72 -0
- {backend/functions/src → src}/services/brave-search.ts +3 -0
- src/services/cf-browser-rendering.ts +38 -0
- src/services/curl.ts +387 -0
- src/services/errors.ts +70 -0
- src/services/finalizer.ts +24 -0
- {backend/functions/src → src}/services/geoip.ts +0 -0
|
@@ -14,9 +14,6 @@ jobs:
|
|
| 14 |
concurrency:
|
| 15 |
group: ${{ github.ref_type == 'branch' && github.ref }}
|
| 16 |
cancel-in-progress: true
|
| 17 |
-
defaults:
|
| 18 |
-
run:
|
| 19 |
-
working-directory: backend/functions
|
| 20 |
permissions:
|
| 21 |
contents: read
|
| 22 |
steps:
|
|
@@ -30,6 +27,8 @@ jobs:
|
|
| 30 |
credentials_json: '${{ secrets.GCLOUD_SERVICE_ACCOUNT_SECRET_JSON }}'
|
| 31 |
- name: 'Set up Cloud SDK'
|
| 32 |
uses: 'google-github-actions/setup-gcloud@v2'
|
|
|
|
|
|
|
| 33 |
- name: "Docker auth"
|
| 34 |
run: |-
|
| 35 |
gcloud auth configure-docker us-docker.pkg.dev --quiet
|
|
@@ -40,7 +39,6 @@ jobs:
|
|
| 40 |
with:
|
| 41 |
node-version: 22.12.0
|
| 42 |
cache: npm
|
| 43 |
-
cache-dependency-path: backend/functions/package-lock.json
|
| 44 |
|
| 45 |
- name: npm install
|
| 46 |
run: npm ci
|
|
@@ -65,13 +63,13 @@ jobs:
|
|
| 65 |
id: container
|
| 66 |
uses: docker/build-push-action@v6
|
| 67 |
with:
|
| 68 |
-
context:
|
| 69 |
push: true
|
| 70 |
tags: ${{ steps.meta.outputs.tags }}
|
| 71 |
labels: ${{ steps.meta.outputs.labels }}
|
| 72 |
- name: Deploy CRAWL with Tag
|
| 73 |
run: |
|
| 74 |
-
gcloud run deploy crawl --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region us-central1 --async --min-instances 0
|
| 75 |
- name: Deploy SEARCH with Tag
|
| 76 |
run: |
|
| 77 |
-
gcloud run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0
|
|
|
|
| 14 |
concurrency:
|
| 15 |
group: ${{ github.ref_type == 'branch' && github.ref }}
|
| 16 |
cancel-in-progress: true
|
|
|
|
|
|
|
|
|
|
| 17 |
permissions:
|
| 18 |
contents: read
|
| 19 |
steps:
|
|
|
|
| 27 |
credentials_json: '${{ secrets.GCLOUD_SERVICE_ACCOUNT_SECRET_JSON }}'
|
| 28 |
- name: 'Set up Cloud SDK'
|
| 29 |
uses: 'google-github-actions/setup-gcloud@v2'
|
| 30 |
+
with:
|
| 31 |
+
install_components: beta
|
| 32 |
- name: "Docker auth"
|
| 33 |
run: |-
|
| 34 |
gcloud auth configure-docker us-docker.pkg.dev --quiet
|
|
|
|
| 39 |
with:
|
| 40 |
node-version: 22.12.0
|
| 41 |
cache: npm
|
|
|
|
| 42 |
|
| 43 |
- name: npm install
|
| 44 |
run: npm ci
|
|
|
|
| 63 |
id: container
|
| 64 |
uses: docker/build-push-action@v6
|
| 65 |
with:
|
| 66 |
+
context: .
|
| 67 |
push: true
|
| 68 |
tags: ${{ steps.meta.outputs.tags }}
|
| 69 |
labels: ${{ steps.meta.outputs.labels }}
|
| 70 |
- name: Deploy CRAWL with Tag
|
| 71 |
run: |
|
| 72 |
+
gcloud beta run deploy crawl --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2
|
| 73 |
- name: Deploy SEARCH with Tag
|
| 74 |
run: |
|
| 75 |
+
gcloud beta run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2
|
|
@@ -1,4 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
node_modules/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
.DS_Store
|
| 3 |
-
/
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Logs
|
| 2 |
+
logs
|
| 3 |
+
*.log
|
| 4 |
+
npm-debug.log*
|
| 5 |
+
yarn-debug.log*
|
| 6 |
+
yarn-error.log*
|
| 7 |
+
firebase-debug.log*
|
| 8 |
+
firebase-debug.*.log*
|
| 9 |
+
|
| 10 |
+
# Firebase cache
|
| 11 |
+
.firebase/
|
| 12 |
+
|
| 13 |
+
# Firebase config
|
| 14 |
+
|
| 15 |
+
# Uncomment this if you'd like others to create their own Firebase project.
|
| 16 |
+
# For a team working on the same Firebase project(s), it is recommended to leave
|
| 17 |
+
# it commented so all members can deploy to the same project(s) in .firebaserc.
|
| 18 |
+
# .firebaserc
|
| 19 |
+
|
| 20 |
+
# Runtime data
|
| 21 |
+
pids
|
| 22 |
+
*.pid
|
| 23 |
+
*.seed
|
| 24 |
+
*.pid.lock
|
| 25 |
+
|
| 26 |
+
# Directory for instrumented libs generated by jscoverage/JSCover
|
| 27 |
+
lib-cov
|
| 28 |
+
|
| 29 |
+
# Coverage directory used by tools like istanbul
|
| 30 |
+
coverage
|
| 31 |
+
|
| 32 |
+
# nyc test coverage
|
| 33 |
+
.nyc_output
|
| 34 |
+
|
| 35 |
+
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
|
| 36 |
+
.grunt
|
| 37 |
+
|
| 38 |
+
# Bower dependency directory (https://bower.io/)
|
| 39 |
+
bower_components
|
| 40 |
+
|
| 41 |
+
# node-waf configuration
|
| 42 |
+
.lock-wscript
|
| 43 |
+
|
| 44 |
+
# Compiled binary addons (http://nodejs.org/api/addons.html)
|
| 45 |
+
build/Release
|
| 46 |
+
|
| 47 |
+
# Dependency directories
|
| 48 |
node_modules/
|
| 49 |
+
|
| 50 |
+
# Optional npm cache directory
|
| 51 |
+
.npm
|
| 52 |
+
|
| 53 |
+
# Optional eslint cache
|
| 54 |
+
.eslintcache
|
| 55 |
+
|
| 56 |
+
# Optional REPL history
|
| 57 |
+
.node_repl_history
|
| 58 |
+
|
| 59 |
+
# Output of 'npm pack'
|
| 60 |
+
*.tgz
|
| 61 |
+
|
| 62 |
+
# Yarn Integrity file
|
| 63 |
+
.yarn-integrity
|
| 64 |
+
|
| 65 |
+
# dotenv environment variables file
|
| 66 |
+
.env
|
| 67 |
+
.secret.local
|
| 68 |
+
|
| 69 |
+
toy*.ts
|
| 70 |
+
|
| 71 |
.DS_Store
|
| 72 |
+
build/
|
| 73 |
+
.firebase-emu/
|
| 74 |
+
*.log
|
| 75 |
+
.DS_Store
|
| 76 |
+
|
| 77 |
+
*.local
|
| 78 |
+
.secret.*
|
| 79 |
+
licensed/
|
|
@@ -1,26 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"version": "0.2.0",
|
| 3 |
"configurations": [
|
| 4 |
-
{
|
| 5 |
-
"name": "Debug Fullstack: attach",
|
| 6 |
-
"request": "attach",
|
| 7 |
-
"cwd": "${workspaceFolder}/backend/functions",
|
| 8 |
-
"skipFiles": [
|
| 9 |
-
"<node_internals>/**"
|
| 10 |
-
],
|
| 11 |
-
"type": "node",
|
| 12 |
-
"preLaunchTask": "Fullstack:debug"
|
| 13 |
-
},
|
| 14 |
-
{
|
| 15 |
-
"name": "Debug Fullstack: attach: with proxy",
|
| 16 |
-
"request": "attach",
|
| 17 |
-
"cwd": "${workspaceFolder}/backend/functions",
|
| 18 |
-
"skipFiles": [
|
| 19 |
-
"<node_internals>/**"
|
| 20 |
-
],
|
| 21 |
-
"type": "node",
|
| 22 |
-
"preLaunchTask": "Fullstack:debug:with-proxy"
|
| 23 |
-
},
|
| 24 |
{
|
| 25 |
"name": "Attach",
|
| 26 |
"port": 9229,
|
|
@@ -40,21 +20,44 @@
|
|
| 40 |
"type": "node"
|
| 41 |
},
|
| 42 |
{
|
| 43 |
-
"name": "Debug
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
"request": "launch",
|
| 45 |
"runtimeArgs": [
|
| 46 |
-
"
|
| 47 |
-
"--import=../.firebase-emu",
|
| 48 |
-
"--export-on-exit=../.firebase-emu",
|
| 49 |
],
|
| 50 |
-
"
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
"skipFiles": [
|
| 53 |
"<node_internals>/**"
|
| 54 |
],
|
| 55 |
"type": "node",
|
| 56 |
-
"
|
| 57 |
-
"
|
|
|
|
| 58 |
},
|
| 59 |
]
|
| 60 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"version": "0.2.0",
|
| 3 |
"configurations": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
{
|
| 5 |
"name": "Attach",
|
| 6 |
"port": 9229,
|
|
|
|
| 20 |
"type": "node"
|
| 21 |
},
|
| 22 |
{
|
| 23 |
+
"name": "Debug Stand Alone Crawl",
|
| 24 |
+
"request": "launch",
|
| 25 |
+
"runtimeArgs": [
|
| 26 |
+
"--env-file=.secret.local",
|
| 27 |
+
],
|
| 28 |
+
"env": {
|
| 29 |
+
"GCLOUD_PROJECT": "reader-6b7dc",
|
| 30 |
+
"LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
|
| 31 |
+
},
|
| 32 |
+
"cwd": "${workspaceFolder}",
|
| 33 |
+
"program": "build/stand-alone/crawl.js",
|
| 34 |
+
"skipFiles": [
|
| 35 |
+
"<node_internals>/**"
|
| 36 |
+
],
|
| 37 |
+
"type": "node",
|
| 38 |
+
"outputCapture": "std",
|
| 39 |
+
"preLaunchTask": "Backend:build:watch",
|
| 40 |
+
"killBehavior": "forceful"
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"name": "Debug Stand Alone Search",
|
| 44 |
"request": "launch",
|
| 45 |
"runtimeArgs": [
|
| 46 |
+
"--env-file=.secret.local",
|
|
|
|
|
|
|
| 47 |
],
|
| 48 |
+
"env": {
|
| 49 |
+
"GCLOUD_PROJECT": "reader-6b7dc",
|
| 50 |
+
"LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
|
| 51 |
+
},
|
| 52 |
+
"cwd": "${workspaceFolder}",
|
| 53 |
+
"program": "build/stand-alone/search.js",
|
| 54 |
"skipFiles": [
|
| 55 |
"<node_internals>/**"
|
| 56 |
],
|
| 57 |
"type": "node",
|
| 58 |
+
"outputCapture": "std",
|
| 59 |
+
"preLaunchTask": "Backend:build:watch",
|
| 60 |
+
"killBehavior": "forceful"
|
| 61 |
},
|
| 62 |
]
|
| 63 |
}
|
|
@@ -6,29 +6,18 @@
|
|
| 6 |
"script": "build",
|
| 7 |
"group": "build",
|
| 8 |
"options": {
|
| 9 |
-
"cwd": "${workspaceFolder}
|
| 10 |
},
|
| 11 |
"problemMatcher": [],
|
| 12 |
"label": "Backend:rebuild",
|
| 13 |
"detail": "Backend:rebuild"
|
| 14 |
},
|
| 15 |
-
{
|
| 16 |
-
"type": "npm",
|
| 17 |
-
"script": "emu:reset",
|
| 18 |
-
"group": "build",
|
| 19 |
-
"options": {
|
| 20 |
-
"cwd": "${workspaceFolder}/backend/functions"
|
| 21 |
-
},
|
| 22 |
-
"problemMatcher": [],
|
| 23 |
-
"label": "Backend:reset-emulator",
|
| 24 |
-
"detail": "Backend:reset-emulator"
|
| 25 |
-
},
|
| 26 |
{
|
| 27 |
"type": "typescript",
|
| 28 |
"options": {
|
| 29 |
-
"cwd": "${workspaceFolder}
|
| 30 |
},
|
| 31 |
-
"tsconfig": "
|
| 32 |
"option": "watch",
|
| 33 |
"isBackground": true,
|
| 34 |
"problemMatcher": [
|
|
@@ -36,121 +25,6 @@
|
|
| 36 |
],
|
| 37 |
"group": "build",
|
| 38 |
"label": "Backend:build:watch"
|
| 39 |
-
},
|
| 40 |
-
{
|
| 41 |
-
"type": "npm",
|
| 42 |
-
"script": "emu:debug",
|
| 43 |
-
"group": "none",
|
| 44 |
-
"options": {
|
| 45 |
-
"cwd": "${workspaceFolder}/backend/functions"
|
| 46 |
-
},
|
| 47 |
-
"problemMatcher": [
|
| 48 |
-
{
|
| 49 |
-
"base": "$tsc",
|
| 50 |
-
"background": {
|
| 51 |
-
"activeOnStart": false,
|
| 52 |
-
"beginsPattern": "shutdown requested|Starting emulators",
|
| 53 |
-
"endsPattern": "Debugger listening"
|
| 54 |
-
}
|
| 55 |
-
}
|
| 56 |
-
],
|
| 57 |
-
"label": "Backend:start-emulator-debug",
|
| 58 |
-
"detail": "Backend:start-emulator-debug",
|
| 59 |
-
"dependsOn": [
|
| 60 |
-
"Backend:build:watch"
|
| 61 |
-
],
|
| 62 |
-
"isBackground": true,
|
| 63 |
-
},
|
| 64 |
-
{
|
| 65 |
-
"type": "npm",
|
| 66 |
-
"script": "dev",
|
| 67 |
-
"options": {
|
| 68 |
-
"cwd": "${workspaceFolder}/webapp",
|
| 69 |
-
},
|
| 70 |
-
"group": "build",
|
| 71 |
-
"label": "Frontend:start:dev",
|
| 72 |
-
"detail": "Frontend:start:dev",
|
| 73 |
-
"isBackground": true,
|
| 74 |
-
"problemMatcher": {
|
| 75 |
-
"base": "$vite",
|
| 76 |
-
"background": {
|
| 77 |
-
"activeOnStart": true,
|
| 78 |
-
"endsPattern": "OK",
|
| 79 |
-
"beginsPattern": "vite"
|
| 80 |
-
}
|
| 81 |
-
},
|
| 82 |
-
},
|
| 83 |
-
{
|
| 84 |
-
"type": "npm",
|
| 85 |
-
"script": "dev",
|
| 86 |
-
"options": {
|
| 87 |
-
"cwd": "${workspaceFolder}/webapp",
|
| 88 |
-
"env": {
|
| 89 |
-
"FIREBASE_EMULATE": "true",
|
| 90 |
-
}
|
| 91 |
-
},
|
| 92 |
-
"group": "build",
|
| 93 |
-
"label": "Frontend:start:emu",
|
| 94 |
-
"detail": "Frontend:start:emu",
|
| 95 |
-
"isBackground": true,
|
| 96 |
-
"problemMatcher": {
|
| 97 |
-
"base": "$vite",
|
| 98 |
-
"background": {
|
| 99 |
-
"activeOnStart": true,
|
| 100 |
-
"endsPattern": "OK",
|
| 101 |
-
"beginsPattern": "vite"
|
| 102 |
-
}
|
| 103 |
-
},
|
| 104 |
-
},
|
| 105 |
-
{
|
| 106 |
-
"type": "npm",
|
| 107 |
-
"script": "emu:debug2",
|
| 108 |
-
"group": "none",
|
| 109 |
-
"options": {
|
| 110 |
-
"cwd": "${workspaceFolder}/backend/functions",
|
| 111 |
-
"env": {
|
| 112 |
-
"https_proxy": "http://127.0.0.1:7890",
|
| 113 |
-
"http_proxy": "http://127.0.0.1:7890",
|
| 114 |
-
"all_proxy": "socks5://127.0.0.1:7890"
|
| 115 |
-
}
|
| 116 |
-
},
|
| 117 |
-
"problemMatcher": [
|
| 118 |
-
{
|
| 119 |
-
"base": "$tsc",
|
| 120 |
-
"background": {
|
| 121 |
-
"activeOnStart": false,
|
| 122 |
-
"beginsPattern": "shutdown requested|Starting emulators",
|
| 123 |
-
"endsPattern": "Debugger listening"
|
| 124 |
-
}
|
| 125 |
-
}
|
| 126 |
-
],
|
| 127 |
-
"label": "Backend:start-emulator-debug:with-proxy",
|
| 128 |
-
"detail": "Backend:start-emulator-debug:with-proxy",
|
| 129 |
-
"dependsOn": [
|
| 130 |
-
"Backend:build:watch"
|
| 131 |
-
],
|
| 132 |
-
"isBackground": true,
|
| 133 |
-
},
|
| 134 |
-
{
|
| 135 |
-
"label": "Fullstack:prepare",
|
| 136 |
-
"dependsOn": [
|
| 137 |
-
"Frontend:start:emu",
|
| 138 |
-
"Backend:build:watch",
|
| 139 |
-
],
|
| 140 |
-
},
|
| 141 |
-
{
|
| 142 |
-
"label": "Fullstack:debug",
|
| 143 |
-
"dependsOn": [
|
| 144 |
-
// "Frontend:start:emu",
|
| 145 |
-
"Backend:start-emulator-debug",
|
| 146 |
-
],
|
| 147 |
-
},
|
| 148 |
-
{
|
| 149 |
-
"label": "Fullstack:debug:with-proxy",
|
| 150 |
-
"dependsOn": [
|
| 151 |
-
"Frontend:start:emu",
|
| 152 |
-
"Backend:start-emulator-debug:with-proxy",
|
| 153 |
-
],
|
| 154 |
}
|
| 155 |
]
|
| 156 |
}
|
|
|
|
| 6 |
"script": "build",
|
| 7 |
"group": "build",
|
| 8 |
"options": {
|
| 9 |
+
"cwd": "${workspaceFolder}"
|
| 10 |
},
|
| 11 |
"problemMatcher": [],
|
| 12 |
"label": "Backend:rebuild",
|
| 13 |
"detail": "Backend:rebuild"
|
| 14 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
{
|
| 16 |
"type": "typescript",
|
| 17 |
"options": {
|
| 18 |
+
"cwd": "${workspaceFolder}"
|
| 19 |
},
|
| 20 |
+
"tsconfig": "tsconfig.json",
|
| 21 |
"option": "watch",
|
| 22 |
"isBackground": true,
|
| 23 |
"problemMatcher": [
|
|
|
|
| 25 |
],
|
| 26 |
"group": "build",
|
| 27 |
"label": "Backend:build:watch"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
}
|
| 29 |
]
|
| 30 |
}
|
|
File without changes
|
|
@@ -158,13 +158,9 @@ curl -H "X-With-Generated-Alt: true" https://r.jina.ai/https://en.m.wikipedia.or
|
|
| 158 |
|
| 159 |
You will need the following tools to run the project:
|
| 160 |
- Node v18 (The build fails for Node version >18)
|
| 161 |
-
- Firebase CLI (`npm install -g firebase-tools`)
|
| 162 |
-
|
| 163 |
-
For backend, go to the `backend/functions` directory and install the npm dependencies.
|
| 164 |
|
| 165 |
```bash
|
| 166 |
git clone git@github.com:jina-ai/reader.git
|
| 167 |
-
cd backend/functions
|
| 168 |
npm install
|
| 169 |
```
|
| 170 |
|
|
|
|
| 158 |
|
| 159 |
You will need the following tools to run the project:
|
| 160 |
- Node v18 (The build fails for Node version >18)
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
```bash
|
| 163 |
git clone git@github.com:jina-ai/reader.git
|
|
|
|
| 164 |
npm install
|
| 165 |
```
|
| 166 |
|
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"projects": {
|
| 3 |
-
"default": "reader-6b7dc"
|
| 4 |
-
}
|
| 5 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1,79 +0,0 @@
|
|
| 1 |
-
# Logs
|
| 2 |
-
logs
|
| 3 |
-
*.log
|
| 4 |
-
npm-debug.log*
|
| 5 |
-
yarn-debug.log*
|
| 6 |
-
yarn-error.log*
|
| 7 |
-
firebase-debug.log*
|
| 8 |
-
firebase-debug.*.log*
|
| 9 |
-
|
| 10 |
-
# Firebase cache
|
| 11 |
-
.firebase/
|
| 12 |
-
|
| 13 |
-
# Firebase config
|
| 14 |
-
|
| 15 |
-
# Uncomment this if you'd like others to create their own Firebase project.
|
| 16 |
-
# For a team working on the same Firebase project(s), it is recommended to leave
|
| 17 |
-
# it commented so all members can deploy to the same project(s) in .firebaserc.
|
| 18 |
-
# .firebaserc
|
| 19 |
-
|
| 20 |
-
# Runtime data
|
| 21 |
-
pids
|
| 22 |
-
*.pid
|
| 23 |
-
*.seed
|
| 24 |
-
*.pid.lock
|
| 25 |
-
|
| 26 |
-
# Directory for instrumented libs generated by jscoverage/JSCover
|
| 27 |
-
lib-cov
|
| 28 |
-
|
| 29 |
-
# Coverage directory used by tools like istanbul
|
| 30 |
-
coverage
|
| 31 |
-
|
| 32 |
-
# nyc test coverage
|
| 33 |
-
.nyc_output
|
| 34 |
-
|
| 35 |
-
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
|
| 36 |
-
.grunt
|
| 37 |
-
|
| 38 |
-
# Bower dependency directory (https://bower.io/)
|
| 39 |
-
bower_components
|
| 40 |
-
|
| 41 |
-
# node-waf configuration
|
| 42 |
-
.lock-wscript
|
| 43 |
-
|
| 44 |
-
# Compiled binary addons (http://nodejs.org/api/addons.html)
|
| 45 |
-
build/Release
|
| 46 |
-
|
| 47 |
-
# Dependency directories
|
| 48 |
-
node_modules/
|
| 49 |
-
|
| 50 |
-
# Optional npm cache directory
|
| 51 |
-
.npm
|
| 52 |
-
|
| 53 |
-
# Optional eslint cache
|
| 54 |
-
.eslintcache
|
| 55 |
-
|
| 56 |
-
# Optional REPL history
|
| 57 |
-
.node_repl_history
|
| 58 |
-
|
| 59 |
-
# Output of 'npm pack'
|
| 60 |
-
*.tgz
|
| 61 |
-
|
| 62 |
-
# Yarn Integrity file
|
| 63 |
-
.yarn-integrity
|
| 64 |
-
|
| 65 |
-
# dotenv environment variables file
|
| 66 |
-
.env
|
| 67 |
-
.secret.local
|
| 68 |
-
|
| 69 |
-
toy*.ts
|
| 70 |
-
|
| 71 |
-
.DS_Store
|
| 72 |
-
build/
|
| 73 |
-
.firebase-emu/
|
| 74 |
-
*.log
|
| 75 |
-
.DS_Store
|
| 76 |
-
|
| 77 |
-
*.local
|
| 78 |
-
.secret.*
|
| 79 |
-
licensed/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1,43 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"firestore": {
|
| 3 |
-
"rules": "firestore.rules",
|
| 4 |
-
"indexes": "firestore.indexes.json"
|
| 5 |
-
},
|
| 6 |
-
"functions": [
|
| 7 |
-
{
|
| 8 |
-
"source": "functions",
|
| 9 |
-
"codebase": "default",
|
| 10 |
-
"ignore": [
|
| 11 |
-
"node_modules",
|
| 12 |
-
"src",
|
| 13 |
-
".git",
|
| 14 |
-
"*.log",
|
| 15 |
-
"*.local",
|
| 16 |
-
".secret.*",
|
| 17 |
-
".firebase-emu"
|
| 18 |
-
],
|
| 19 |
-
"predeploy": [
|
| 20 |
-
"npm --prefix \"$RESOURCE_DIR\" run build:clean",
|
| 21 |
-
"npm --prefix \"$RESOURCE_DIR\" run build"
|
| 22 |
-
]
|
| 23 |
-
}
|
| 24 |
-
],
|
| 25 |
-
"storage": {
|
| 26 |
-
"rules": "storage.rules"
|
| 27 |
-
},
|
| 28 |
-
"emulators": {
|
| 29 |
-
"ui": {
|
| 30 |
-
"enabled": true
|
| 31 |
-
},
|
| 32 |
-
"singleProjectMode": true,
|
| 33 |
-
"functions": {
|
| 34 |
-
"port": 5001
|
| 35 |
-
},
|
| 36 |
-
"firestore": {
|
| 37 |
-
"port": 9098
|
| 38 |
-
},
|
| 39 |
-
"storage": {
|
| 40 |
-
"port": 9097
|
| 41 |
-
}
|
| 42 |
-
}
|
| 43 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1,19 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"indexes": [
|
| 3 |
-
{
|
| 4 |
-
"collectionGroup": "prompts",
|
| 5 |
-
"queryScope": "COLLECTION_GROUP",
|
| 6 |
-
"fields": [
|
| 7 |
-
{
|
| 8 |
-
"fieldPath": "id",
|
| 9 |
-
"order": "ASCENDING"
|
| 10 |
-
},
|
| 11 |
-
{
|
| 12 |
-
"fieldPath": "isPublic",
|
| 13 |
-
"order": "ASCENDING"
|
| 14 |
-
}
|
| 15 |
-
]
|
| 16 |
-
}
|
| 17 |
-
],
|
| 18 |
-
"fieldOverrides": []
|
| 19 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1,32 +0,0 @@
|
|
| 1 |
-
rules_version = '2';
|
| 2 |
-
service cloud.firestore {
|
| 3 |
-
match /databases/{database}/documents {
|
| 4 |
-
// match /questions/{document=**} {
|
| 5 |
-
// allow read: if request.auth != null
|
| 6 |
-
// }
|
| 7 |
-
|
| 8 |
-
// match /answers/{userId}/profiles/default {
|
| 9 |
-
// allow read, write: if request.auth != null && request.auth.uid == userId
|
| 10 |
-
// }
|
| 11 |
-
|
| 12 |
-
match /credits/{userId}/{document=**} {
|
| 13 |
-
allow read: if request.auth != null && request.auth.uid == userId
|
| 14 |
-
}
|
| 15 |
-
|
| 16 |
-
match /users/{userId}/prompts/{document=**} {
|
| 17 |
-
allow read: if request.auth != null && request.auth.uid == userId
|
| 18 |
-
}
|
| 19 |
-
|
| 20 |
-
// match /users/{userId}/profiles/{document=**} {
|
| 21 |
-
// allow read: if request.auth != null && request.auth.uid == userId
|
| 22 |
-
// }
|
| 23 |
-
|
| 24 |
-
match /users/{userId}/creditHistory/{document=**} {
|
| 25 |
-
allow read: if request.auth != null && request.auth.uid == userId
|
| 26 |
-
}
|
| 27 |
-
|
| 28 |
-
match /{document=**} {
|
| 29 |
-
allow read, write: if false;
|
| 30 |
-
}
|
| 31 |
-
}
|
| 32 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1 +0,0 @@
|
|
| 1 |
-
node_modules/
|
|
|
|
|
|
|
@@ -1,36 +0,0 @@
|
|
| 1 |
-
root = true
|
| 2 |
-
|
| 3 |
-
[*]
|
| 4 |
-
end_of_line = lf
|
| 5 |
-
charset = utf-8
|
| 6 |
-
indent_style = space
|
| 7 |
-
insert_final_newline = true
|
| 8 |
-
trim_trailing_whitespace = true
|
| 9 |
-
indent_size = 4
|
| 10 |
-
quote_type = single
|
| 11 |
-
max_line_length = 120
|
| 12 |
-
|
| 13 |
-
[*.py]
|
| 14 |
-
indent_size = 4
|
| 15 |
-
|
| 16 |
-
[*.ts]
|
| 17 |
-
indent_size = 4
|
| 18 |
-
|
| 19 |
-
[*.js]
|
| 20 |
-
indent_size = 2
|
| 21 |
-
|
| 22 |
-
[*.vue]
|
| 23 |
-
indent_size = 2
|
| 24 |
-
|
| 25 |
-
[*.*sx]
|
| 26 |
-
indent_size = 2
|
| 27 |
-
|
| 28 |
-
[*.*ml]
|
| 29 |
-
indent_size = 2
|
| 30 |
-
|
| 31 |
-
[*.json]
|
| 32 |
-
indent_size = 2
|
| 33 |
-
|
| 34 |
-
[*.md]
|
| 35 |
-
indent_size = 2
|
| 36 |
-
trim_trailing_whitespace = false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
File without changes
|
|
@@ -1,9 +0,0 @@
|
|
| 1 |
-
const { join } = require('path');
|
| 2 |
-
|
| 3 |
-
/**
|
| 4 |
-
* @type {import("puppeteer").Configuration}
|
| 5 |
-
*/
|
| 6 |
-
module.exports = {
|
| 7 |
-
// Changes the cache location for Puppeteer.
|
| 8 |
-
cacheDirectory: join(__dirname, 'node_modules', 'puppeteer', 'walk-around-lame-gcp-build'),
|
| 9 |
-
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1,93 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"name": "reader",
|
| 3 |
-
"scripts": {
|
| 4 |
-
"lint": "eslint --ext .js,.ts .",
|
| 5 |
-
"build": "node ./integrity-check.cjs && tsc -p .",
|
| 6 |
-
"build:watch": "tsc --watch",
|
| 7 |
-
"build:clean": "rm -rf ./build",
|
| 8 |
-
"shell": "npm run build && firebase functions:shell",
|
| 9 |
-
"emu:stage": "cd .. && tar -czvf firebase-emu-preset.tgz .firebase-emu",
|
| 10 |
-
"emu:reset": "rm -rf ../.firebase-emu && tar -xzf ../firebase-emu-preset.tgz --directory ../",
|
| 11 |
-
"emu:start": "firebase emulators:start --import ../.firebase-emu --export-on-exit",
|
| 12 |
-
"emu:debug": "firebase emulators:start --import ../.firebase-emu --export-on-exit --inspect-functions",
|
| 13 |
-
"emu:debug2": "firebase emulators:start --import ../.firebase-emu --export-on-exit --inspect-functions",
|
| 14 |
-
"emu:kill": "killall java",
|
| 15 |
-
"serve": "npm run build && npm run emu:start",
|
| 16 |
-
"debug": "npm run build && npm run emu:start -- --inspect-functions",
|
| 17 |
-
"from-scratch": "npm run build && rm -rf ../.firebase-emu && firebase emulators:start --export-on-exit",
|
| 18 |
-
"from-preset": "npm run build && npm run emu:reset && npm run emu:start",
|
| 19 |
-
"start": "npm run shell",
|
| 20 |
-
"deploy": "firebase deploy --only functions",
|
| 21 |
-
"logs": "firebase functions:log",
|
| 22 |
-
"gcp-build": "node node_modules/puppeteer/install.mjs"
|
| 23 |
-
},
|
| 24 |
-
"engines": {
|
| 25 |
-
"node": "20"
|
| 26 |
-
},
|
| 27 |
-
"main": "build/index.js",
|
| 28 |
-
"dependencies": {
|
| 29 |
-
"@esm2cjs/normalize-url": "^8.0.0",
|
| 30 |
-
"@google-cloud/translate": "^8.2.0",
|
| 31 |
-
"@mozilla/readability": "^0.5.0",
|
| 32 |
-
"@napi-rs/canvas": "^0.1.67",
|
| 33 |
-
"@types/turndown": "^5.0.4",
|
| 34 |
-
"@xmldom/xmldom": "^0.9.3",
|
| 35 |
-
"archiver": "^6.0.1",
|
| 36 |
-
"axios": "^1.3.3",
|
| 37 |
-
"bcrypt": "^5.1.0",
|
| 38 |
-
"busboy": "^1.6.0",
|
| 39 |
-
"civkit": "^0.8.3-3e69606",
|
| 40 |
-
"core-js": "^3.37.1",
|
| 41 |
-
"cors": "^2.8.5",
|
| 42 |
-
"dayjs": "^1.11.9",
|
| 43 |
-
"express": "^4.19.2",
|
| 44 |
-
"firebase-admin": "^12.1.0",
|
| 45 |
-
"firebase-functions": "^6.1.1",
|
| 46 |
-
"htmlparser2": "^9.0.0",
|
| 47 |
-
"jose": "^5.1.0",
|
| 48 |
-
"langdetect": "^0.2.1",
|
| 49 |
-
"linkedom": "^0.18.4",
|
| 50 |
-
"maxmind": "^4.3.18",
|
| 51 |
-
"minio": "^7.1.3",
|
| 52 |
-
"node-libcurl": "^4.1.0",
|
| 53 |
-
"openai": "^4.20.0",
|
| 54 |
-
"pdfjs-dist": "^4.2.67",
|
| 55 |
-
"puppeteer": "^23.3.0",
|
| 56 |
-
"puppeteer-extra": "^3.3.6",
|
| 57 |
-
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
| 58 |
-
"puppeteer-extra-plugin-page-proxy": "^1.3.1",
|
| 59 |
-
"puppeteer-page-proxy": "^1.3.0",
|
| 60 |
-
"robots-parser": "^3.0.1",
|
| 61 |
-
"set-cookie-parser": "^2.6.0",
|
| 62 |
-
"simple-zstd": "^1.4.2",
|
| 63 |
-
"stripe": "^11.11.0",
|
| 64 |
-
"tiktoken": "^1.0.16",
|
| 65 |
-
"tld-extract": "^2.1.0",
|
| 66 |
-
"turndown": "^7.1.3",
|
| 67 |
-
"turndown-plugin-gfm": "^1.0.2",
|
| 68 |
-
"undici": "^5.24.0"
|
| 69 |
-
},
|
| 70 |
-
"devDependencies": {
|
| 71 |
-
"@types/archiver": "^5.3.4",
|
| 72 |
-
"@types/bcrypt": "^5.0.0",
|
| 73 |
-
"@types/busboy": "^1.5.4",
|
| 74 |
-
"@types/cors": "^2.8.17",
|
| 75 |
-
"@types/generic-pool": "^3.8.1",
|
| 76 |
-
"@types/node": "^20.14.13",
|
| 77 |
-
"@types/set-cookie-parser": "^2.4.7",
|
| 78 |
-
"@types/xmldom": "^0.1.34",
|
| 79 |
-
"@typescript-eslint/eslint-plugin": "^5.12.0",
|
| 80 |
-
"@typescript-eslint/parser": "^5.12.0",
|
| 81 |
-
"eslint": "^8.9.0",
|
| 82 |
-
"eslint-config-google": "^0.14.0",
|
| 83 |
-
"eslint-plugin-import": "^2.25.4",
|
| 84 |
-
"firebase-functions-test": "^3.0.0",
|
| 85 |
-
"pino-pretty": "^13.0.0",
|
| 86 |
-
"replicate": "^0.16.1",
|
| 87 |
-
"typescript": "^5.5.4"
|
| 88 |
-
},
|
| 89 |
-
"private": true,
|
| 90 |
-
"exports": {
|
| 91 |
-
".": "./build/index.js"
|
| 92 |
-
}
|
| 93 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1,218 +0,0 @@
|
|
| 1 |
-
import { marshalErrorLike } from 'civkit/lang';
|
| 2 |
-
import { AsyncService } from 'civkit/async-service';
|
| 3 |
-
import { singleton } from 'tsyringe';
|
| 4 |
-
|
| 5 |
-
import { Curl, CurlFeature, HeaderInfo } from 'node-libcurl';
|
| 6 |
-
import { PageSnapshot, ScrappingOptions } from './puppeteer';
|
| 7 |
-
import { Logger } from '../shared/services/logger';
|
| 8 |
-
import { JSDomControl } from './jsdom';
|
| 9 |
-
import { AssertionFailureError, FancyFile } from 'civkit';
|
| 10 |
-
import { TempFileManager } from '../shared';
|
| 11 |
-
import { readFile } from 'fs/promises';
|
| 12 |
-
import { pathToFileURL } from 'url';
|
| 13 |
-
import { createBrotliDecompress, createInflate, createGunzip } from 'zlib';
|
| 14 |
-
import { ZSTDDecompress } from 'simple-zstd';
|
| 15 |
-
|
| 16 |
-
@singleton()
|
| 17 |
-
export class CurlControl extends AsyncService {
|
| 18 |
-
|
| 19 |
-
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 20 |
-
|
| 21 |
-
constructor(
|
| 22 |
-
protected globalLogger: Logger,
|
| 23 |
-
protected jsdomControl: JSDomControl,
|
| 24 |
-
protected tempFileManager: TempFileManager,
|
| 25 |
-
) {
|
| 26 |
-
super(...arguments);
|
| 27 |
-
}
|
| 28 |
-
|
| 29 |
-
override async init() {
|
| 30 |
-
await this.dependencyReady();
|
| 31 |
-
|
| 32 |
-
this.emit('ready');
|
| 33 |
-
}
|
| 34 |
-
|
| 35 |
-
curlImpersonateHeader(curl: Curl, headers?: object, chromeVersion: number = 132) {
|
| 36 |
-
const mixinHeaders = {
|
| 37 |
-
'sch-ch-ua': `Not A(Brand";v="8", "Chromium";v="${chromeVersion}", "Google Chrome";v="${chromeVersion}"`,
|
| 38 |
-
'sec-ch-ua-mobile': '?0',
|
| 39 |
-
'sec-ch-ua-platform': 'Windows',
|
| 40 |
-
'Upgrade-Insecure-Requests': '1',
|
| 41 |
-
'User-Agent': `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${chromeVersion}.0.0.0 Safari/537.36`,
|
| 42 |
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
| 43 |
-
'Sec-Fetch-Site': 'none',
|
| 44 |
-
'Sec-Fetch-Mode': 'navigate',
|
| 45 |
-
'Sec-Fetch-User': '?1',
|
| 46 |
-
'Sec-Fetch-Dest': 'document',
|
| 47 |
-
'Accept-Encoding': 'gzip, deflate, br, zstd',
|
| 48 |
-
'Accept-Language': 'en-US,en;q=0.9',
|
| 49 |
-
};
|
| 50 |
-
|
| 51 |
-
curl.setOpt(Curl.option.HTTPHEADER, Object.entries({ ...mixinHeaders, ...headers }).map(([k, v]) => `${k}: ${v}`));
|
| 52 |
-
|
| 53 |
-
return curl;
|
| 54 |
-
}
|
| 55 |
-
|
| 56 |
-
async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions, throwOnNon200 = false): Promise<PageSnapshot> {
|
| 57 |
-
const snapshot = {
|
| 58 |
-
href: urlToCrawl.toString(),
|
| 59 |
-
html: '',
|
| 60 |
-
title: '',
|
| 61 |
-
text: '',
|
| 62 |
-
} as PageSnapshot;
|
| 63 |
-
|
| 64 |
-
let contentType = '';
|
| 65 |
-
const result = await new Promise<{
|
| 66 |
-
statusCode: number,
|
| 67 |
-
data?: FancyFile,
|
| 68 |
-
headers: Buffer | HeaderInfo[],
|
| 69 |
-
}>((resolve, reject) => {
|
| 70 |
-
const curl = new Curl();
|
| 71 |
-
curl.enable(CurlFeature.StreamResponse);
|
| 72 |
-
curl.setOpt('URL', urlToCrawl.toString());
|
| 73 |
-
curl.setOpt(Curl.option.FOLLOWLOCATION, true);
|
| 74 |
-
|
| 75 |
-
curl.setOpt(Curl.option.TIMEOUT_MS, Math.min(10_000, crawlOpts?.timeoutMs || 10_000));
|
| 76 |
-
|
| 77 |
-
if (crawlOpts?.overrideUserAgent) {
|
| 78 |
-
curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent);
|
| 79 |
-
}
|
| 80 |
-
|
| 81 |
-
this.curlImpersonateHeader(curl, crawlOpts?.extraHeaders);
|
| 82 |
-
// if (crawlOpts?.extraHeaders) {
|
| 83 |
-
// curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`));
|
| 84 |
-
// }
|
| 85 |
-
if (crawlOpts?.proxyUrl) {
|
| 86 |
-
curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl);
|
| 87 |
-
}
|
| 88 |
-
if (crawlOpts?.cookies?.length) {
|
| 89 |
-
const cookieChunks = crawlOpts.cookies.map((cookie) => `${cookie.name}=${cookie.value}`);
|
| 90 |
-
curl.setOpt(Curl.option.COOKIE, cookieChunks.join('; '));
|
| 91 |
-
}
|
| 92 |
-
if (crawlOpts?.referer) {
|
| 93 |
-
curl.setOpt(Curl.option.REFERER, crawlOpts.referer);
|
| 94 |
-
}
|
| 95 |
-
|
| 96 |
-
curl.on('end', (statusCode, _data, headers) => {
|
| 97 |
-
this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl}`, { statusCode, headers });
|
| 98 |
-
curl.close();
|
| 99 |
-
});
|
| 100 |
-
|
| 101 |
-
curl.on('error', (err) => {
|
| 102 |
-
curl.close();
|
| 103 |
-
this.logger.warn(`Curl ${urlToCrawl}: ${err} (Not necessarily an error)`, { err: marshalErrorLike(err) });
|
| 104 |
-
reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: ${err.message}`));
|
| 105 |
-
});
|
| 106 |
-
curl.setOpt(Curl.option.MAXFILESIZE, 1024 * 1024 * 1024); // 1GB
|
| 107 |
-
let status = -1;
|
| 108 |
-
let contentEncoding = '';
|
| 109 |
-
curl.on('stream', (stream, statusCode, headers) => {
|
| 110 |
-
status = statusCode;
|
| 111 |
-
const lastResHeaders = headers[headers.length - 1];
|
| 112 |
-
for (const [k, v] of Object.entries(lastResHeaders)) {
|
| 113 |
-
const kl = k.toLowerCase();
|
| 114 |
-
if (kl === 'content-type') {
|
| 115 |
-
contentType = v.toLowerCase();
|
| 116 |
-
}
|
| 117 |
-
if (kl === 'content-encoding') {
|
| 118 |
-
contentEncoding = v.toLowerCase();
|
| 119 |
-
}
|
| 120 |
-
if (contentType && contentEncoding) {
|
| 121 |
-
break;
|
| 122 |
-
}
|
| 123 |
-
}
|
| 124 |
-
|
| 125 |
-
if (!contentType) {
|
| 126 |
-
reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: no content-type`));
|
| 127 |
-
stream.destroy();
|
| 128 |
-
return;
|
| 129 |
-
}
|
| 130 |
-
if (contentType.startsWith('image/')) {
|
| 131 |
-
snapshot.html = `<html style="height: 100%;"><head><meta name="viewport" content="width=device-width, minimum-scale=0.1"><title>${urlToCrawl.origin}${urlToCrawl.pathname}</title></head><body style="margin: 0px; height: 100%; background-color: rgb(14, 14, 14);"><img style="display: block;-webkit-user-select: none;margin: auto;background-color: hsl(0, 0%, 90%);transition: background-color 300ms;" src="${urlToCrawl.href}"></body></html>`;
|
| 132 |
-
stream.destroy();
|
| 133 |
-
resolve({
|
| 134 |
-
statusCode: status,
|
| 135 |
-
headers,
|
| 136 |
-
});
|
| 137 |
-
return;
|
| 138 |
-
}
|
| 139 |
-
|
| 140 |
-
switch (contentEncoding) {
|
| 141 |
-
case 'gzip': {
|
| 142 |
-
const decompressed = createGunzip();
|
| 143 |
-
stream.pipe(decompressed);
|
| 144 |
-
stream = decompressed;
|
| 145 |
-
break;
|
| 146 |
-
}
|
| 147 |
-
case 'deflate': {
|
| 148 |
-
const decompressed = createInflate();
|
| 149 |
-
stream.pipe(decompressed);
|
| 150 |
-
stream = decompressed;
|
| 151 |
-
break;
|
| 152 |
-
}
|
| 153 |
-
case 'br': {
|
| 154 |
-
const decompressed = createBrotliDecompress();
|
| 155 |
-
stream.pipe(decompressed);
|
| 156 |
-
stream = decompressed;
|
| 157 |
-
break;
|
| 158 |
-
}
|
| 159 |
-
case 'zstd': {
|
| 160 |
-
const decompressed = ZSTDDecompress();
|
| 161 |
-
stream.pipe(decompressed);
|
| 162 |
-
stream = decompressed;
|
| 163 |
-
break;
|
| 164 |
-
}
|
| 165 |
-
default: {
|
| 166 |
-
break;
|
| 167 |
-
}
|
| 168 |
-
}
|
| 169 |
-
|
| 170 |
-
const fpath = this.tempFileManager.alloc();
|
| 171 |
-
const fancyFile = FancyFile.auto(stream, fpath);
|
| 172 |
-
this.tempFileManager.bindPathTo(fancyFile, fpath);
|
| 173 |
-
resolve({
|
| 174 |
-
statusCode: status,
|
| 175 |
-
data: fancyFile,
|
| 176 |
-
headers,
|
| 177 |
-
});
|
| 178 |
-
});
|
| 179 |
-
|
| 180 |
-
curl.perform();
|
| 181 |
-
});
|
| 182 |
-
|
| 183 |
-
if (throwOnNon200 && result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
|
| 184 |
-
throw new AssertionFailureError(`Failed to access ${urlToCrawl}: HTTP ${result.statusCode}`);
|
| 185 |
-
}
|
| 186 |
-
|
| 187 |
-
if (contentType === 'application/octet-stream') {
|
| 188 |
-
// Content declared as binary is same as unknown.
|
| 189 |
-
contentType = '';
|
| 190 |
-
}
|
| 191 |
-
|
| 192 |
-
if (result.data) {
|
| 193 |
-
const mimeType: string = contentType || await result.data.mimeType;
|
| 194 |
-
if (mimeType.startsWith('text/html')) {
|
| 195 |
-
if ((await result.data.size) > 1024 * 1024 * 32) {
|
| 196 |
-
throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
|
| 197 |
-
}
|
| 198 |
-
snapshot.html = await readFile(await result.data.filePath, { encoding: 'utf-8' });
|
| 199 |
-
} else if (mimeType.startsWith('text/') || mimeType.startsWith('application/json')) {
|
| 200 |
-
if ((await result.data.size) > 1024 * 1024 * 32) {
|
| 201 |
-
throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
|
| 202 |
-
}
|
| 203 |
-
snapshot.text = await readFile(await result.data.filePath, { encoding: 'utf-8' });
|
| 204 |
-
snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
|
| 205 |
-
} else if (mimeType.startsWith('application/pdf')) {
|
| 206 |
-
snapshot.pdfs = [pathToFileURL(await result.data.filePath).href];
|
| 207 |
-
} else {
|
| 208 |
-
throw new AssertionFailureError(`Failed to access ${urlToCrawl}: unexpected type ${mimeType}`);
|
| 209 |
-
}
|
| 210 |
-
}
|
| 211 |
-
|
| 212 |
-
const curlSnapshot = await this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
|
| 213 |
-
|
| 214 |
-
return curlSnapshot!;
|
| 215 |
-
}
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1 +0,0 @@
|
|
| 1 |
-
../../../thinapps-shared/backend
|
|
|
|
|
|
|
@@ -1,8 +0,0 @@
|
|
| 1 |
-
rules_version = '2';
|
| 2 |
-
service firebase.storage {
|
| 3 |
-
match /b/{bucket}/o {
|
| 4 |
-
match /{allPaths=**} {
|
| 5 |
-
allow read, write: if false;
|
| 6 |
-
}
|
| 7 |
-
}
|
| 8 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
File without changes
|
|
@@ -8,15 +8,16 @@
|
|
| 8 |
"dependencies": {
|
| 9 |
"@esm2cjs/normalize-url": "^8.0.0",
|
| 10 |
"@google-cloud/translate": "^8.2.0",
|
|
|
|
| 11 |
"@mozilla/readability": "^0.5.0",
|
| 12 |
-
"@napi-rs/canvas": "^0.1.
|
| 13 |
"@types/turndown": "^5.0.4",
|
| 14 |
"@xmldom/xmldom": "^0.9.3",
|
| 15 |
"archiver": "^6.0.1",
|
| 16 |
"axios": "^1.3.3",
|
| 17 |
"bcrypt": "^5.1.0",
|
| 18 |
"busboy": "^1.6.0",
|
| 19 |
-
"civkit": "^0.8.
|
| 20 |
"core-js": "^3.37.1",
|
| 21 |
"cors": "^2.8.5",
|
| 22 |
"dayjs": "^1.11.9",
|
|
@@ -31,7 +32,7 @@
|
|
| 31 |
"minio": "^7.1.3",
|
| 32 |
"node-libcurl": "^4.1.0",
|
| 33 |
"openai": "^4.20.0",
|
| 34 |
-
"pdfjs-dist": "^4.
|
| 35 |
"puppeteer": "^23.3.0",
|
| 36 |
"puppeteer-extra": "^3.3.6",
|
| 37 |
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
|
@@ -53,6 +54,7 @@
|
|
| 53 |
"@types/busboy": "^1.5.4",
|
| 54 |
"@types/cors": "^2.8.17",
|
| 55 |
"@types/generic-pool": "^3.8.1",
|
|
|
|
| 56 |
"@types/node": "^20.14.13",
|
| 57 |
"@types/set-cookie-parser": "^2.4.7",
|
| 58 |
"@types/xmldom": "^0.1.34",
|
|
@@ -62,6 +64,7 @@
|
|
| 62 |
"eslint-config-google": "^0.14.0",
|
| 63 |
"eslint-plugin-import": "^2.25.4",
|
| 64 |
"firebase-functions-test": "^3.0.0",
|
|
|
|
| 65 |
"pino-pretty": "^13.0.0",
|
| 66 |
"replicate": "^0.16.1",
|
| 67 |
"typescript": "^5.5.4"
|
|
@@ -1626,6 +1629,23 @@
|
|
| 1626 |
"url": "https://opencollective.com/js-sdsl"
|
| 1627 |
}
|
| 1628 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1629 |
"node_modules/@koa/router": {
|
| 1630 |
"version": "12.0.1",
|
| 1631 |
"resolved": "https://registry.npmjs.org/@koa/router/-/router-12.0.1.tgz",
|
|
@@ -1679,30 +1699,30 @@
|
|
| 1679 |
}
|
| 1680 |
},
|
| 1681 |
"node_modules/@napi-rs/canvas": {
|
| 1682 |
-
"version": "0.1.
|
| 1683 |
-
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.
|
| 1684 |
-
"integrity": "sha512-
|
| 1685 |
"license": "MIT",
|
| 1686 |
"engines": {
|
| 1687 |
"node": ">= 10"
|
| 1688 |
},
|
| 1689 |
"optionalDependencies": {
|
| 1690 |
-
"@napi-rs/canvas-android-arm64": "0.1.
|
| 1691 |
-
"@napi-rs/canvas-darwin-arm64": "0.1.
|
| 1692 |
-
"@napi-rs/canvas-darwin-x64": "0.1.
|
| 1693 |
-
"@napi-rs/canvas-linux-arm-gnueabihf": "0.1.
|
| 1694 |
-
"@napi-rs/canvas-linux-arm64-gnu": "0.1.
|
| 1695 |
-
"@napi-rs/canvas-linux-arm64-musl": "0.1.
|
| 1696 |
-
"@napi-rs/canvas-linux-riscv64-gnu": "0.1.
|
| 1697 |
-
"@napi-rs/canvas-linux-x64-gnu": "0.1.
|
| 1698 |
-
"@napi-rs/canvas-linux-x64-musl": "0.1.
|
| 1699 |
-
"@napi-rs/canvas-win32-x64-msvc": "0.1.
|
| 1700 |
}
|
| 1701 |
},
|
| 1702 |
"node_modules/@napi-rs/canvas-android-arm64": {
|
| 1703 |
-
"version": "0.1.
|
| 1704 |
-
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.
|
| 1705 |
-
"integrity": "sha512-
|
| 1706 |
"cpu": [
|
| 1707 |
"arm64"
|
| 1708 |
],
|
|
@@ -1716,9 +1736,9 @@
|
|
| 1716 |
}
|
| 1717 |
},
|
| 1718 |
"node_modules/@napi-rs/canvas-darwin-arm64": {
|
| 1719 |
-
"version": "0.1.
|
| 1720 |
-
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.
|
| 1721 |
-
"integrity": "sha512-
|
| 1722 |
"cpu": [
|
| 1723 |
"arm64"
|
| 1724 |
],
|
|
@@ -1732,9 +1752,9 @@
|
|
| 1732 |
}
|
| 1733 |
},
|
| 1734 |
"node_modules/@napi-rs/canvas-darwin-x64": {
|
| 1735 |
-
"version": "0.1.
|
| 1736 |
-
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.
|
| 1737 |
-
"integrity": "sha512-
|
| 1738 |
"cpu": [
|
| 1739 |
"x64"
|
| 1740 |
],
|
|
@@ -1748,9 +1768,9 @@
|
|
| 1748 |
}
|
| 1749 |
},
|
| 1750 |
"node_modules/@napi-rs/canvas-linux-arm-gnueabihf": {
|
| 1751 |
-
"version": "0.1.
|
| 1752 |
-
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.
|
| 1753 |
-
"integrity": "sha512-
|
| 1754 |
"cpu": [
|
| 1755 |
"arm"
|
| 1756 |
],
|
|
@@ -1764,9 +1784,9 @@
|
|
| 1764 |
}
|
| 1765 |
},
|
| 1766 |
"node_modules/@napi-rs/canvas-linux-arm64-gnu": {
|
| 1767 |
-
"version": "0.1.
|
| 1768 |
-
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.
|
| 1769 |
-
"integrity": "sha512-
|
| 1770 |
"cpu": [
|
| 1771 |
"arm64"
|
| 1772 |
],
|
|
@@ -1780,9 +1800,9 @@
|
|
| 1780 |
}
|
| 1781 |
},
|
| 1782 |
"node_modules/@napi-rs/canvas-linux-arm64-musl": {
|
| 1783 |
-
"version": "0.1.
|
| 1784 |
-
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.
|
| 1785 |
-
"integrity": "sha512-
|
| 1786 |
"cpu": [
|
| 1787 |
"arm64"
|
| 1788 |
],
|
|
@@ -1796,9 +1816,9 @@
|
|
| 1796 |
}
|
| 1797 |
},
|
| 1798 |
"node_modules/@napi-rs/canvas-linux-riscv64-gnu": {
|
| 1799 |
-
"version": "0.1.
|
| 1800 |
-
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.
|
| 1801 |
-
"integrity": "sha512-
|
| 1802 |
"cpu": [
|
| 1803 |
"riscv64"
|
| 1804 |
],
|
|
@@ -1812,9 +1832,9 @@
|
|
| 1812 |
}
|
| 1813 |
},
|
| 1814 |
"node_modules/@napi-rs/canvas-linux-x64-gnu": {
|
| 1815 |
-
"version": "0.1.
|
| 1816 |
-
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.
|
| 1817 |
-
"integrity": "sha512-
|
| 1818 |
"cpu": [
|
| 1819 |
"x64"
|
| 1820 |
],
|
|
@@ -1828,9 +1848,9 @@
|
|
| 1828 |
}
|
| 1829 |
},
|
| 1830 |
"node_modules/@napi-rs/canvas-linux-x64-musl": {
|
| 1831 |
-
"version": "0.1.
|
| 1832 |
-
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.
|
| 1833 |
-
"integrity": "sha512-
|
| 1834 |
"cpu": [
|
| 1835 |
"x64"
|
| 1836 |
],
|
|
@@ -1844,9 +1864,9 @@
|
|
| 1844 |
}
|
| 1845 |
},
|
| 1846 |
"node_modules/@napi-rs/canvas-win32-x64-msvc": {
|
| 1847 |
-
"version": "0.1.
|
| 1848 |
-
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.
|
| 1849 |
-
"integrity": "sha512-
|
| 1850 |
"cpu": [
|
| 1851 |
"x64"
|
| 1852 |
],
|
|
@@ -2238,6 +2258,16 @@
|
|
| 2238 |
"resolved": "https://registry.npmjs.org/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz",
|
| 2239 |
"integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA=="
|
| 2240 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2241 |
"node_modules/@types/archiver": {
|
| 2242 |
"version": "5.3.4",
|
| 2243 |
"resolved": "https://registry.npmjs.org/@types/archiver/-/archiver-5.3.4.tgz",
|
|
@@ -2344,6 +2374,26 @@
|
|
| 2344 |
"@types/node": "*"
|
| 2345 |
}
|
| 2346 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2347 |
"node_modules/@types/cors": {
|
| 2348 |
"version": "2.8.17",
|
| 2349 |
"resolved": "https://registry.npmjs.org/@types/cors/-/cors-2.8.17.tgz",
|
|
@@ -2403,6 +2453,13 @@
|
|
| 2403 |
"@types/node": "*"
|
| 2404 |
}
|
| 2405 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2406 |
"node_modules/@types/http-cache-semantics": {
|
| 2407 |
"version": "4.0.4",
|
| 2408 |
"resolved": "https://registry.npmjs.org/@types/http-cache-semantics/-/http-cache-semantics-4.0.4.tgz",
|
|
@@ -2460,6 +2517,13 @@
|
|
| 2460 |
"@types/node": "*"
|
| 2461 |
}
|
| 2462 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2463 |
"node_modules/@types/keyv": {
|
| 2464 |
"version": "3.1.4",
|
| 2465 |
"resolved": "https://registry.npmjs.org/@types/keyv/-/keyv-3.1.4.tgz",
|
|
@@ -2468,6 +2532,33 @@
|
|
| 2468 |
"@types/node": "*"
|
| 2469 |
}
|
| 2470 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2471 |
"node_modules/@types/lodash": {
|
| 2472 |
"version": "4.17.0",
|
| 2473 |
"resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.0.tgz",
|
|
@@ -3836,7 +3927,6 @@
|
|
| 3836 |
"version": "1.0.1",
|
| 3837 |
"resolved": "https://registry.npmjs.org/cache-content-type/-/cache-content-type-1.0.1.tgz",
|
| 3838 |
"integrity": "sha512-IKufZ1o4Ut42YUrZSo8+qnMTrFuKkvyoLXUywKz9GJ5BrhOFGhLdkx9sG4KAnVvbY6kEcSFjLQul+DVmBm2bgA==",
|
| 3839 |
-
"optional": true,
|
| 3840 |
"dependencies": {
|
| 3841 |
"mime-types": "^2.1.18",
|
| 3842 |
"ylru": "^1.2.0"
|
|
@@ -4005,9 +4095,10 @@
|
|
| 4005 |
}
|
| 4006 |
},
|
| 4007 |
"node_modules/civkit": {
|
| 4008 |
-
"version": "0.8.
|
| 4009 |
-
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.
|
| 4010 |
-
"integrity": "sha512-
|
|
|
|
| 4011 |
"dependencies": {
|
| 4012 |
"lodash": "^4.17.21",
|
| 4013 |
"tslib": "^2.5.0"
|
|
@@ -4138,7 +4229,6 @@
|
|
| 4138 |
"version": "4.6.0",
|
| 4139 |
"resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
|
| 4140 |
"integrity": "sha512-QVb0dM5HvG+uaxitm8wONl7jltx8dqhfU33DcqtOZcLSVIKSDDLDi7+0LbAKiyI8hD9u42m2YxXSkMGWThaecQ==",
|
| 4141 |
-
"devOptional": true,
|
| 4142 |
"engines": {
|
| 4143 |
"iojs": ">= 1.0.0",
|
| 4144 |
"node": ">= 0.12.0"
|
|
@@ -4148,7 +4238,6 @@
|
|
| 4148 |
"version": "6.1.0",
|
| 4149 |
"resolved": "https://registry.npmjs.org/co-body/-/co-body-6.1.0.tgz",
|
| 4150 |
"integrity": "sha512-m7pOT6CdLN7FuXUcpuz/8lfQ/L77x8SchHCF4G0RBTJO20Wzmhn5Sp4/5WsKy8OSpifBSUrmg83qEqaDHdyFuQ==",
|
| 4151 |
-
"optional": true,
|
| 4152 |
"dependencies": {
|
| 4153 |
"inflation": "^2.0.0",
|
| 4154 |
"qs": "^6.5.2",
|
|
@@ -4273,7 +4362,6 @@
|
|
| 4273 |
"version": "0.9.1",
|
| 4274 |
"resolved": "https://registry.npmjs.org/cookies/-/cookies-0.9.1.tgz",
|
| 4275 |
"integrity": "sha512-TG2hpqe4ELx54QER/S3HQ9SRVnQnGBtKUz5bLQWtYAQ+o6GpgMs6sYUvaiJjVxb+UXwhRhAEP3m7LbsIZ77Hmw==",
|
| 4276 |
-
"optional": true,
|
| 4277 |
"dependencies": {
|
| 4278 |
"depd": "~2.0.0",
|
| 4279 |
"keygrip": "~1.1.0"
|
|
@@ -4582,8 +4670,7 @@
|
|
| 4582 |
"node_modules/deep-equal": {
|
| 4583 |
"version": "1.0.1",
|
| 4584 |
"resolved": "https://registry.npmjs.org/deep-equal/-/deep-equal-1.0.1.tgz",
|
| 4585 |
-
"integrity": "sha512-bHtC0iYvWhyaTzvV3CZgPeZQqCOBGyGsVV7v4eevpdkLHfiSrXUdBG+qAuSz4RI70sszvjQ1QSZ98An1yNwpSw=="
|
| 4586 |
-
"optional": true
|
| 4587 |
},
|
| 4588 |
"node_modules/deep-extend": {
|
| 4589 |
"version": "0.6.0",
|
|
@@ -6701,7 +6788,6 @@
|
|
| 6701 |
"version": "1.5.0",
|
| 6702 |
"resolved": "https://registry.npmjs.org/http-assert/-/http-assert-1.5.0.tgz",
|
| 6703 |
"integrity": "sha512-uPpH7OKX4H25hBmU6G1jWNaqJGpTXxey+YOUizJUAgu0AjLUeC8D73hTrhvDS5D+GJN1DN1+hhc/eF/wpxtp0w==",
|
| 6704 |
-
"optional": true,
|
| 6705 |
"dependencies": {
|
| 6706 |
"deep-equal": "~1.0.1",
|
| 6707 |
"http-errors": "~1.8.0"
|
|
@@ -6714,7 +6800,6 @@
|
|
| 6714 |
"version": "1.1.2",
|
| 6715 |
"resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz",
|
| 6716 |
"integrity": "sha512-7emPTl6Dpo6JRXOXjLRxck+FlLRX5847cLKEn00PLAgc3g2hTZZgr+e4c2v6QpSmLeFP3n5yUo7ft6avBK/5jQ==",
|
| 6717 |
-
"optional": true,
|
| 6718 |
"engines": {
|
| 6719 |
"node": ">= 0.6"
|
| 6720 |
}
|
|
@@ -6723,7 +6808,6 @@
|
|
| 6723 |
"version": "1.8.1",
|
| 6724 |
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.8.1.tgz",
|
| 6725 |
"integrity": "sha512-Kpk9Sm7NmI+RHhnj6OIWDI1d6fIoFAtFt9RLaTMRlg/8w49juAStsrBgp0Dp4OdxdVbRIeKhtCUvoi/RuAhO4g==",
|
| 6726 |
-
"optional": true,
|
| 6727 |
"dependencies": {
|
| 6728 |
"depd": "~1.1.2",
|
| 6729 |
"inherits": "2.0.4",
|
|
@@ -6739,7 +6823,6 @@
|
|
| 6739 |
"version": "1.5.0",
|
| 6740 |
"resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz",
|
| 6741 |
"integrity": "sha512-OpZ3zP+jT1PI7I8nemJX4AKmAX070ZkYPVWV/AaKTJl+tXCTGyVdC1a4SL8RUQYEwk/f34ZX8UTykN68FwrqAA==",
|
| 6742 |
-
"optional": true,
|
| 6743 |
"engines": {
|
| 6744 |
"node": ">= 0.6"
|
| 6745 |
}
|
|
@@ -6940,7 +7023,6 @@
|
|
| 6940 |
"version": "2.1.0",
|
| 6941 |
"resolved": "https://registry.npmjs.org/inflation/-/inflation-2.1.0.tgz",
|
| 6942 |
"integrity": "sha512-t54PPJHG1Pp7VQvxyVCJ9mBbjG3Hqryges9bXoOO6GExCPa+//i/d5GSuFtpx3ALLd7lgIAur6zrIlBQyJuMlQ==",
|
| 6943 |
-
"optional": true,
|
| 6944 |
"engines": {
|
| 6945 |
"node": ">= 0.8.0"
|
| 6946 |
}
|
|
@@ -8316,7 +8398,6 @@
|
|
| 8316 |
"version": "1.1.0",
|
| 8317 |
"resolved": "https://registry.npmjs.org/keygrip/-/keygrip-1.1.0.tgz",
|
| 8318 |
"integrity": "sha512-iYSchDJ+liQ8iwbSI2QqsQOvqv58eJCEanyJPJi+Khyu8smkcKSFUCbPwzFcL7YVtZ6eONjqRX/38caJ7QjRAQ==",
|
| 8319 |
-
"optional": true,
|
| 8320 |
"dependencies": {
|
| 8321 |
"tsscmp": "1.0.6"
|
| 8322 |
},
|
|
@@ -8354,10 +8435,10 @@
|
|
| 8354 |
}
|
| 8355 |
},
|
| 8356 |
"node_modules/koa": {
|
| 8357 |
-
"version": "2.
|
| 8358 |
-
"resolved": "https://registry.npmjs.org/koa/-/koa-2.
|
| 8359 |
-
"integrity": "sha512-
|
| 8360 |
-
"
|
| 8361 |
"dependencies": {
|
| 8362 |
"accepts": "^1.3.5",
|
| 8363 |
"cache-content-type": "^1.0.0",
|
|
@@ -8404,14 +8485,12 @@
|
|
| 8404 |
"node_modules/koa-compose": {
|
| 8405 |
"version": "4.1.0",
|
| 8406 |
"resolved": "https://registry.npmjs.org/koa-compose/-/koa-compose-4.1.0.tgz",
|
| 8407 |
-
"integrity": "sha512-8ODW8TrDuMYvXRwra/Kh7/rJo9BtOfPc6qO8eAfC80CnCvSjSl0bkRM24X6/XBBEyj0v1nRUQ1LyOy3dbqOWXw=="
|
| 8408 |
-
"optional": true
|
| 8409 |
},
|
| 8410 |
"node_modules/koa-convert": {
|
| 8411 |
"version": "2.0.0",
|
| 8412 |
"resolved": "https://registry.npmjs.org/koa-convert/-/koa-convert-2.0.0.tgz",
|
| 8413 |
"integrity": "sha512-asOvN6bFlSnxewce2e/DK3p4tltyfC4VM7ZwuTuepI7dEQVcvpyFuBcEARu1+Hxg8DIwytce2n7jrZtRlPrARA==",
|
| 8414 |
-
"optional": true,
|
| 8415 |
"dependencies": {
|
| 8416 |
"co": "^4.6.0",
|
| 8417 |
"koa-compose": "^4.1.0"
|
|
@@ -8424,7 +8503,6 @@
|
|
| 8424 |
"version": "1.8.1",
|
| 8425 |
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.8.1.tgz",
|
| 8426 |
"integrity": "sha512-Kpk9Sm7NmI+RHhnj6OIWDI1d6fIoFAtFt9RLaTMRlg/8w49juAStsrBgp0Dp4OdxdVbRIeKhtCUvoi/RuAhO4g==",
|
| 8427 |
-
"optional": true,
|
| 8428 |
"dependencies": {
|
| 8429 |
"depd": "~1.1.2",
|
| 8430 |
"inherits": "2.0.4",
|
|
@@ -8440,7 +8518,6 @@
|
|
| 8440 |
"version": "1.1.2",
|
| 8441 |
"resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz",
|
| 8442 |
"integrity": "sha512-7emPTl6Dpo6JRXOXjLRxck+FlLRX5847cLKEn00PLAgc3g2hTZZgr+e4c2v6QpSmLeFP3n5yUo7ft6avBK/5jQ==",
|
| 8443 |
-
"optional": true,
|
| 8444 |
"engines": {
|
| 8445 |
"node": ">= 0.6"
|
| 8446 |
}
|
|
@@ -8449,7 +8526,6 @@
|
|
| 8449 |
"version": "1.5.0",
|
| 8450 |
"resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz",
|
| 8451 |
"integrity": "sha512-OpZ3zP+jT1PI7I8nemJX4AKmAX070ZkYPVWV/AaKTJl+tXCTGyVdC1a4SL8RUQYEwk/f34ZX8UTykN68FwrqAA==",
|
| 8452 |
-
"optional": true,
|
| 8453 |
"engines": {
|
| 8454 |
"node": ">= 0.6"
|
| 8455 |
}
|
|
@@ -8644,8 +8720,7 @@
|
|
| 8644 |
"node_modules/lodash.merge": {
|
| 8645 |
"version": "4.6.2",
|
| 8646 |
"resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz",
|
| 8647 |
-
"integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ=="
|
| 8648 |
-
"dev": true
|
| 8649 |
},
|
| 8650 |
"node_modules/lodash.once": {
|
| 8651 |
"version": "4.1.1",
|
|
@@ -9853,8 +9928,7 @@
|
|
| 9853 |
"node_modules/only": {
|
| 9854 |
"version": "0.0.2",
|
| 9855 |
"resolved": "https://registry.npmjs.org/only/-/only-0.0.2.tgz",
|
| 9856 |
-
"integrity": "sha512-Fvw+Jemq5fjjyWz6CpKx6w9s7xxqo3+JCyM0WXWeCSOboZ8ABkyvP8ID4CZuChA/wxSx+XSJmdOm8rGVyJ1hdQ=="
|
| 9857 |
-
"optional": true
|
| 9858 |
},
|
| 9859 |
"node_modules/openai": {
|
| 9860 |
"version": "4.33.0",
|
|
@@ -10118,15 +10192,15 @@
|
|
| 10118 |
}
|
| 10119 |
},
|
| 10120 |
"node_modules/pdfjs-dist": {
|
| 10121 |
-
"version": "4.
|
| 10122 |
-
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-4.
|
| 10123 |
-
"integrity": "sha512-
|
|
|
|
| 10124 |
"engines": {
|
| 10125 |
-
"node": ">=
|
| 10126 |
},
|
| 10127 |
"optionalDependencies": {
|
| 10128 |
-
"canvas": "^
|
| 10129 |
-
"path2d": "^0.2.0"
|
| 10130 |
}
|
| 10131 |
},
|
| 10132 |
"node_modules/peek-stream": {
|
|
@@ -12443,7 +12517,6 @@
|
|
| 12443 |
"version": "1.0.6",
|
| 12444 |
"resolved": "https://registry.npmjs.org/tsscmp/-/tsscmp-1.0.6.tgz",
|
| 12445 |
"integrity": "sha512-LxhtAkPDTkVCMQjt2h6eBVY28KCjikZqZfMcC15YBeNjkgUpdCfBu5HoiOTDu86v6smE8yOjyEktJ8hlbANHQA==",
|
| 12446 |
-
"optional": true,
|
| 12447 |
"engines": {
|
| 12448 |
"node": ">=0.6.x"
|
| 12449 |
}
|
|
@@ -13136,7 +13209,6 @@
|
|
| 13136 |
"version": "1.4.0",
|
| 13137 |
"resolved": "https://registry.npmjs.org/ylru/-/ylru-1.4.0.tgz",
|
| 13138 |
"integrity": "sha512-2OQsPNEmBCvXuFlIni/a+Rn+R2pHW9INm0BxXJ4hVDA8TirqMj+J/Rp9ItLatT/5pZqWwefVrTQcHpixsxnVlA==",
|
| 13139 |
-
"optional": true,
|
| 13140 |
"engines": {
|
| 13141 |
"node": ">= 4.0.0"
|
| 13142 |
}
|
|
|
|
| 8 |
"dependencies": {
|
| 9 |
"@esm2cjs/normalize-url": "^8.0.0",
|
| 10 |
"@google-cloud/translate": "^8.2.0",
|
| 11 |
+
"@koa/bodyparser": "^5.1.1",
|
| 12 |
"@mozilla/readability": "^0.5.0",
|
| 13 |
+
"@napi-rs/canvas": "^0.1.68",
|
| 14 |
"@types/turndown": "^5.0.4",
|
| 15 |
"@xmldom/xmldom": "^0.9.3",
|
| 16 |
"archiver": "^6.0.1",
|
| 17 |
"axios": "^1.3.3",
|
| 18 |
"bcrypt": "^5.1.0",
|
| 19 |
"busboy": "^1.6.0",
|
| 20 |
+
"civkit": "^0.8.4-32482a3",
|
| 21 |
"core-js": "^3.37.1",
|
| 22 |
"cors": "^2.8.5",
|
| 23 |
"dayjs": "^1.11.9",
|
|
|
|
| 32 |
"minio": "^7.1.3",
|
| 33 |
"node-libcurl": "^4.1.0",
|
| 34 |
"openai": "^4.20.0",
|
| 35 |
+
"pdfjs-dist": "^4.10.38",
|
| 36 |
"puppeteer": "^23.3.0",
|
| 37 |
"puppeteer-extra": "^3.3.6",
|
| 38 |
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
|
|
|
| 54 |
"@types/busboy": "^1.5.4",
|
| 55 |
"@types/cors": "^2.8.17",
|
| 56 |
"@types/generic-pool": "^3.8.1",
|
| 57 |
+
"@types/koa": "^2.15.0",
|
| 58 |
"@types/node": "^20.14.13",
|
| 59 |
"@types/set-cookie-parser": "^2.4.7",
|
| 60 |
"@types/xmldom": "^0.1.34",
|
|
|
|
| 64 |
"eslint-config-google": "^0.14.0",
|
| 65 |
"eslint-plugin-import": "^2.25.4",
|
| 66 |
"firebase-functions-test": "^3.0.0",
|
| 67 |
+
"koa": "^2.16.0",
|
| 68 |
"pino-pretty": "^13.0.0",
|
| 69 |
"replicate": "^0.16.1",
|
| 70 |
"typescript": "^5.5.4"
|
|
|
|
| 1629 |
"url": "https://opencollective.com/js-sdsl"
|
| 1630 |
}
|
| 1631 |
},
|
| 1632 |
+
"node_modules/@koa/bodyparser": {
|
| 1633 |
+
"version": "5.1.1",
|
| 1634 |
+
"resolved": "https://registry.npmjs.org/@koa/bodyparser/-/bodyparser-5.1.1.tgz",
|
| 1635 |
+
"integrity": "sha512-ZBF49xqNVxnmJ+8iXegq+fXPQm9RSX8giNl/aXS5rW1VpNct92wnFbGR/47vfoRJVLARGQ4HVL4WaQ0u8IJVoA==",
|
| 1636 |
+
"license": "MIT",
|
| 1637 |
+
"dependencies": {
|
| 1638 |
+
"co-body": "^6.1.0",
|
| 1639 |
+
"lodash.merge": "^4.6.2",
|
| 1640 |
+
"type-is": "^1.6.18"
|
| 1641 |
+
},
|
| 1642 |
+
"engines": {
|
| 1643 |
+
"node": ">= 16"
|
| 1644 |
+
},
|
| 1645 |
+
"peerDependencies": {
|
| 1646 |
+
"koa": "^2.14.1"
|
| 1647 |
+
}
|
| 1648 |
+
},
|
| 1649 |
"node_modules/@koa/router": {
|
| 1650 |
"version": "12.0.1",
|
| 1651 |
"resolved": "https://registry.npmjs.org/@koa/router/-/router-12.0.1.tgz",
|
|
|
|
| 1699 |
}
|
| 1700 |
},
|
| 1701 |
"node_modules/@napi-rs/canvas": {
|
| 1702 |
+
"version": "0.1.68",
|
| 1703 |
+
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.68.tgz",
|
| 1704 |
+
"integrity": "sha512-LQESrePLEBLvhuFkXx9jjBXRC2ClYsO5mqQ1m/puth5z9SOuM3N/B3vDuqnC3RJFktDktyK9khGvo7dTkqO9uQ==",
|
| 1705 |
"license": "MIT",
|
| 1706 |
"engines": {
|
| 1707 |
"node": ">= 10"
|
| 1708 |
},
|
| 1709 |
"optionalDependencies": {
|
| 1710 |
+
"@napi-rs/canvas-android-arm64": "0.1.68",
|
| 1711 |
+
"@napi-rs/canvas-darwin-arm64": "0.1.68",
|
| 1712 |
+
"@napi-rs/canvas-darwin-x64": "0.1.68",
|
| 1713 |
+
"@napi-rs/canvas-linux-arm-gnueabihf": "0.1.68",
|
| 1714 |
+
"@napi-rs/canvas-linux-arm64-gnu": "0.1.68",
|
| 1715 |
+
"@napi-rs/canvas-linux-arm64-musl": "0.1.68",
|
| 1716 |
+
"@napi-rs/canvas-linux-riscv64-gnu": "0.1.68",
|
| 1717 |
+
"@napi-rs/canvas-linux-x64-gnu": "0.1.68",
|
| 1718 |
+
"@napi-rs/canvas-linux-x64-musl": "0.1.68",
|
| 1719 |
+
"@napi-rs/canvas-win32-x64-msvc": "0.1.68"
|
| 1720 |
}
|
| 1721 |
},
|
| 1722 |
"node_modules/@napi-rs/canvas-android-arm64": {
|
| 1723 |
+
"version": "0.1.68",
|
| 1724 |
+
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.68.tgz",
|
| 1725 |
+
"integrity": "sha512-h1KcSR4LKLfRfzeBH65xMxbWOGa1OtMFQbCMVlxPCkN1Zr+2gK+70pXO5ktojIYcUrP6KDcOwoc8clho5ccM/w==",
|
| 1726 |
"cpu": [
|
| 1727 |
"arm64"
|
| 1728 |
],
|
|
|
|
| 1736 |
}
|
| 1737 |
},
|
| 1738 |
"node_modules/@napi-rs/canvas-darwin-arm64": {
|
| 1739 |
+
"version": "0.1.68",
|
| 1740 |
+
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.68.tgz",
|
| 1741 |
+
"integrity": "sha512-/VURlrAD4gDoxW1GT/b0nP3fRz/fhxmHI/xznTq2FTwkQLPOlLkDLCvTmQ7v6LtGKdc2Ed6rvYpRan+JXThInQ==",
|
| 1742 |
"cpu": [
|
| 1743 |
"arm64"
|
| 1744 |
],
|
|
|
|
| 1752 |
}
|
| 1753 |
},
|
| 1754 |
"node_modules/@napi-rs/canvas-darwin-x64": {
|
| 1755 |
+
"version": "0.1.68",
|
| 1756 |
+
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.68.tgz",
|
| 1757 |
+
"integrity": "sha512-tEpvGR6vCLTo1Tx9wmDnoOKROpw57wiCWwCpDOuVlj/7rqEJOUYr9ixW4aRJgmeGBrZHgevI0EURys2ER6whmg==",
|
| 1758 |
"cpu": [
|
| 1759 |
"x64"
|
| 1760 |
],
|
|
|
|
| 1768 |
}
|
| 1769 |
},
|
| 1770 |
"node_modules/@napi-rs/canvas-linux-arm-gnueabihf": {
|
| 1771 |
+
"version": "0.1.68",
|
| 1772 |
+
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.68.tgz",
|
| 1773 |
+
"integrity": "sha512-U9xbJsumPOiAYeAFZMlHf62b9dGs2HJ6Q5xt7xTB0uEyPeurwhgYBWGgabdsEidyj38YuzI/c3LGBbSQB3vagw==",
|
| 1774 |
"cpu": [
|
| 1775 |
"arm"
|
| 1776 |
],
|
|
|
|
| 1784 |
}
|
| 1785 |
},
|
| 1786 |
"node_modules/@napi-rs/canvas-linux-arm64-gnu": {
|
| 1787 |
+
"version": "0.1.68",
|
| 1788 |
+
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.68.tgz",
|
| 1789 |
+
"integrity": "sha512-KFkn8wEm3mPnWD4l8+OUUkxylSJuN5q9PnJRZJgv15RtCA1bgxIwTkBhI/+xuyVMcHqON9sXq7cDkEJtHm35dg==",
|
| 1790 |
"cpu": [
|
| 1791 |
"arm64"
|
| 1792 |
],
|
|
|
|
| 1800 |
}
|
| 1801 |
},
|
| 1802 |
"node_modules/@napi-rs/canvas-linux-arm64-musl": {
|
| 1803 |
+
"version": "0.1.68",
|
| 1804 |
+
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.68.tgz",
|
| 1805 |
+
"integrity": "sha512-IQzts91rCdOALXBWQxLZRCEDrfFTGDtNRJMNu+2SKZ1uT8cmPQkPwVk5rycvFpvgAcmiFiOSCp1aRrlfU8KPpQ==",
|
| 1806 |
"cpu": [
|
| 1807 |
"arm64"
|
| 1808 |
],
|
|
|
|
| 1816 |
}
|
| 1817 |
},
|
| 1818 |
"node_modules/@napi-rs/canvas-linux-riscv64-gnu": {
|
| 1819 |
+
"version": "0.1.68",
|
| 1820 |
+
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.68.tgz",
|
| 1821 |
+
"integrity": "sha512-e9AS5UttoIKqXSmBzKZdd3NErSVyOEYzJfNOCGtafGk1//gibTwQXGlSXmAKuErqMp09pyk9aqQRSYzm1AQfBw==",
|
| 1822 |
"cpu": [
|
| 1823 |
"riscv64"
|
| 1824 |
],
|
|
|
|
| 1832 |
}
|
| 1833 |
},
|
| 1834 |
"node_modules/@napi-rs/canvas-linux-x64-gnu": {
|
| 1835 |
+
"version": "0.1.68",
|
| 1836 |
+
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.68.tgz",
|
| 1837 |
+
"integrity": "sha512-Pa/I36VE3j57I3Obhrr+J48KGFfkZk2cJN/2NmW/vCgmoF7kCP6aTVq5n+cGdGWLd/cN9CJ9JvNwEoMRDghu0g==",
|
| 1838 |
"cpu": [
|
| 1839 |
"x64"
|
| 1840 |
],
|
|
|
|
| 1848 |
}
|
| 1849 |
},
|
| 1850 |
"node_modules/@napi-rs/canvas-linux-x64-musl": {
|
| 1851 |
+
"version": "0.1.68",
|
| 1852 |
+
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.68.tgz",
|
| 1853 |
+
"integrity": "sha512-9c6rkc5195wNxuUHJdf4/mmnq433OQey9TNvQ9LspJazvHbfSkTij8wtKjASVQsJyPDva4fkWOeV/OQ7cLw0GQ==",
|
| 1854 |
"cpu": [
|
| 1855 |
"x64"
|
| 1856 |
],
|
|
|
|
| 1864 |
}
|
| 1865 |
},
|
| 1866 |
"node_modules/@napi-rs/canvas-win32-x64-msvc": {
|
| 1867 |
+
"version": "0.1.68",
|
| 1868 |
+
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.68.tgz",
|
| 1869 |
+
"integrity": "sha512-Fc5Dez23u0FoSATurT6/w1oMytiRnKWEinHivdMvXpge6nG4YvhrASrtqMk8dGJMVQpHr8QJYF45rOrx2YU2Aw==",
|
| 1870 |
"cpu": [
|
| 1871 |
"x64"
|
| 1872 |
],
|
|
|
|
| 2258 |
"resolved": "https://registry.npmjs.org/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz",
|
| 2259 |
"integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA=="
|
| 2260 |
},
|
| 2261 |
+
"node_modules/@types/accepts": {
|
| 2262 |
+
"version": "1.3.7",
|
| 2263 |
+
"resolved": "https://registry.npmjs.org/@types/accepts/-/accepts-1.3.7.tgz",
|
| 2264 |
+
"integrity": "sha512-Pay9fq2lM2wXPWbteBsRAGiWH2hig4ZE2asK+mm7kUzlxRTfL961rj89I6zV/E3PcIkDqyuBEcMxFT7rccugeQ==",
|
| 2265 |
+
"dev": true,
|
| 2266 |
+
"license": "MIT",
|
| 2267 |
+
"dependencies": {
|
| 2268 |
+
"@types/node": "*"
|
| 2269 |
+
}
|
| 2270 |
+
},
|
| 2271 |
"node_modules/@types/archiver": {
|
| 2272 |
"version": "5.3.4",
|
| 2273 |
"resolved": "https://registry.npmjs.org/@types/archiver/-/archiver-5.3.4.tgz",
|
|
|
|
| 2374 |
"@types/node": "*"
|
| 2375 |
}
|
| 2376 |
},
|
| 2377 |
+
"node_modules/@types/content-disposition": {
|
| 2378 |
+
"version": "0.5.8",
|
| 2379 |
+
"resolved": "https://registry.npmjs.org/@types/content-disposition/-/content-disposition-0.5.8.tgz",
|
| 2380 |
+
"integrity": "sha512-QVSSvno3dE0MgO76pJhmv4Qyi/j0Yk9pBp0Y7TJ2Tlj+KCgJWY6qX7nnxCOLkZ3VYRSIk1WTxCvwUSdx6CCLdg==",
|
| 2381 |
+
"dev": true,
|
| 2382 |
+
"license": "MIT"
|
| 2383 |
+
},
|
| 2384 |
+
"node_modules/@types/cookies": {
|
| 2385 |
+
"version": "0.9.0",
|
| 2386 |
+
"resolved": "https://registry.npmjs.org/@types/cookies/-/cookies-0.9.0.tgz",
|
| 2387 |
+
"integrity": "sha512-40Zk8qR147RABiQ7NQnBzWzDcjKzNrntB5BAmeGCb2p/MIyOE+4BVvc17wumsUqUw00bJYqoXFHYygQnEFh4/Q==",
|
| 2388 |
+
"dev": true,
|
| 2389 |
+
"license": "MIT",
|
| 2390 |
+
"dependencies": {
|
| 2391 |
+
"@types/connect": "*",
|
| 2392 |
+
"@types/express": "*",
|
| 2393 |
+
"@types/keygrip": "*",
|
| 2394 |
+
"@types/node": "*"
|
| 2395 |
+
}
|
| 2396 |
+
},
|
| 2397 |
"node_modules/@types/cors": {
|
| 2398 |
"version": "2.8.17",
|
| 2399 |
"resolved": "https://registry.npmjs.org/@types/cors/-/cors-2.8.17.tgz",
|
|
|
|
| 2453 |
"@types/node": "*"
|
| 2454 |
}
|
| 2455 |
},
|
| 2456 |
+
"node_modules/@types/http-assert": {
|
| 2457 |
+
"version": "1.5.6",
|
| 2458 |
+
"resolved": "https://registry.npmjs.org/@types/http-assert/-/http-assert-1.5.6.tgz",
|
| 2459 |
+
"integrity": "sha512-TTEwmtjgVbYAzZYWyeHPrrtWnfVkm8tQkP8P21uQifPgMRgjrow3XDEYqucuC8SKZJT7pUnhU/JymvjggxO9vw==",
|
| 2460 |
+
"dev": true,
|
| 2461 |
+
"license": "MIT"
|
| 2462 |
+
},
|
| 2463 |
"node_modules/@types/http-cache-semantics": {
|
| 2464 |
"version": "4.0.4",
|
| 2465 |
"resolved": "https://registry.npmjs.org/@types/http-cache-semantics/-/http-cache-semantics-4.0.4.tgz",
|
|
|
|
| 2517 |
"@types/node": "*"
|
| 2518 |
}
|
| 2519 |
},
|
| 2520 |
+
"node_modules/@types/keygrip": {
|
| 2521 |
+
"version": "1.0.6",
|
| 2522 |
+
"resolved": "https://registry.npmjs.org/@types/keygrip/-/keygrip-1.0.6.tgz",
|
| 2523 |
+
"integrity": "sha512-lZuNAY9xeJt7Bx4t4dx0rYCDqGPW8RXhQZK1td7d4H6E9zYbLoOtjBvfwdTKpsyxQI/2jv+armjX/RW+ZNpXOQ==",
|
| 2524 |
+
"dev": true,
|
| 2525 |
+
"license": "MIT"
|
| 2526 |
+
},
|
| 2527 |
"node_modules/@types/keyv": {
|
| 2528 |
"version": "3.1.4",
|
| 2529 |
"resolved": "https://registry.npmjs.org/@types/keyv/-/keyv-3.1.4.tgz",
|
|
|
|
| 2532 |
"@types/node": "*"
|
| 2533 |
}
|
| 2534 |
},
|
| 2535 |
+
"node_modules/@types/koa": {
|
| 2536 |
+
"version": "2.15.0",
|
| 2537 |
+
"resolved": "https://registry.npmjs.org/@types/koa/-/koa-2.15.0.tgz",
|
| 2538 |
+
"integrity": "sha512-7QFsywoE5URbuVnG3loe03QXuGajrnotr3gQkXcEBShORai23MePfFYdhz90FEtBBpkyIYQbVD+evKtloCgX3g==",
|
| 2539 |
+
"dev": true,
|
| 2540 |
+
"license": "MIT",
|
| 2541 |
+
"dependencies": {
|
| 2542 |
+
"@types/accepts": "*",
|
| 2543 |
+
"@types/content-disposition": "*",
|
| 2544 |
+
"@types/cookies": "*",
|
| 2545 |
+
"@types/http-assert": "*",
|
| 2546 |
+
"@types/http-errors": "*",
|
| 2547 |
+
"@types/keygrip": "*",
|
| 2548 |
+
"@types/koa-compose": "*",
|
| 2549 |
+
"@types/node": "*"
|
| 2550 |
+
}
|
| 2551 |
+
},
|
| 2552 |
+
"node_modules/@types/koa-compose": {
|
| 2553 |
+
"version": "3.2.8",
|
| 2554 |
+
"resolved": "https://registry.npmjs.org/@types/koa-compose/-/koa-compose-3.2.8.tgz",
|
| 2555 |
+
"integrity": "sha512-4Olc63RY+MKvxMwVknCUDhRQX1pFQoBZ/lXcRLP69PQkEpze/0cr8LNqJQe5NFb/b19DWi2a5bTi2VAlQzhJuA==",
|
| 2556 |
+
"dev": true,
|
| 2557 |
+
"license": "MIT",
|
| 2558 |
+
"dependencies": {
|
| 2559 |
+
"@types/koa": "*"
|
| 2560 |
+
}
|
| 2561 |
+
},
|
| 2562 |
"node_modules/@types/lodash": {
|
| 2563 |
"version": "4.17.0",
|
| 2564 |
"resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.0.tgz",
|
|
|
|
| 3927 |
"version": "1.0.1",
|
| 3928 |
"resolved": "https://registry.npmjs.org/cache-content-type/-/cache-content-type-1.0.1.tgz",
|
| 3929 |
"integrity": "sha512-IKufZ1o4Ut42YUrZSo8+qnMTrFuKkvyoLXUywKz9GJ5BrhOFGhLdkx9sG4KAnVvbY6kEcSFjLQul+DVmBm2bgA==",
|
|
|
|
| 3930 |
"dependencies": {
|
| 3931 |
"mime-types": "^2.1.18",
|
| 3932 |
"ylru": "^1.2.0"
|
|
|
|
| 4095 |
}
|
| 4096 |
},
|
| 4097 |
"node_modules/civkit": {
|
| 4098 |
+
"version": "0.8.4-32482a3",
|
| 4099 |
+
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-32482a3.tgz",
|
| 4100 |
+
"integrity": "sha512-VQwRreeVKYEoSMlhwYrPGpAA5na6lrIavGKmYNrhsHVJEvSfgkWKEete/btZzer4+WBxnNRw+PpRPrq6xjt13Q==",
|
| 4101 |
+
"license": "AGPL",
|
| 4102 |
"dependencies": {
|
| 4103 |
"lodash": "^4.17.21",
|
| 4104 |
"tslib": "^2.5.0"
|
|
|
|
| 4229 |
"version": "4.6.0",
|
| 4230 |
"resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
|
| 4231 |
"integrity": "sha512-QVb0dM5HvG+uaxitm8wONl7jltx8dqhfU33DcqtOZcLSVIKSDDLDi7+0LbAKiyI8hD9u42m2YxXSkMGWThaecQ==",
|
|
|
|
| 4232 |
"engines": {
|
| 4233 |
"iojs": ">= 1.0.0",
|
| 4234 |
"node": ">= 0.12.0"
|
|
|
|
| 4238 |
"version": "6.1.0",
|
| 4239 |
"resolved": "https://registry.npmjs.org/co-body/-/co-body-6.1.0.tgz",
|
| 4240 |
"integrity": "sha512-m7pOT6CdLN7FuXUcpuz/8lfQ/L77x8SchHCF4G0RBTJO20Wzmhn5Sp4/5WsKy8OSpifBSUrmg83qEqaDHdyFuQ==",
|
|
|
|
| 4241 |
"dependencies": {
|
| 4242 |
"inflation": "^2.0.0",
|
| 4243 |
"qs": "^6.5.2",
|
|
|
|
| 4362 |
"version": "0.9.1",
|
| 4363 |
"resolved": "https://registry.npmjs.org/cookies/-/cookies-0.9.1.tgz",
|
| 4364 |
"integrity": "sha512-TG2hpqe4ELx54QER/S3HQ9SRVnQnGBtKUz5bLQWtYAQ+o6GpgMs6sYUvaiJjVxb+UXwhRhAEP3m7LbsIZ77Hmw==",
|
|
|
|
| 4365 |
"dependencies": {
|
| 4366 |
"depd": "~2.0.0",
|
| 4367 |
"keygrip": "~1.1.0"
|
|
|
|
| 4670 |
"node_modules/deep-equal": {
|
| 4671 |
"version": "1.0.1",
|
| 4672 |
"resolved": "https://registry.npmjs.org/deep-equal/-/deep-equal-1.0.1.tgz",
|
| 4673 |
+
"integrity": "sha512-bHtC0iYvWhyaTzvV3CZgPeZQqCOBGyGsVV7v4eevpdkLHfiSrXUdBG+qAuSz4RI70sszvjQ1QSZ98An1yNwpSw=="
|
|
|
|
| 4674 |
},
|
| 4675 |
"node_modules/deep-extend": {
|
| 4676 |
"version": "0.6.0",
|
|
|
|
| 6788 |
"version": "1.5.0",
|
| 6789 |
"resolved": "https://registry.npmjs.org/http-assert/-/http-assert-1.5.0.tgz",
|
| 6790 |
"integrity": "sha512-uPpH7OKX4H25hBmU6G1jWNaqJGpTXxey+YOUizJUAgu0AjLUeC8D73hTrhvDS5D+GJN1DN1+hhc/eF/wpxtp0w==",
|
|
|
|
| 6791 |
"dependencies": {
|
| 6792 |
"deep-equal": "~1.0.1",
|
| 6793 |
"http-errors": "~1.8.0"
|
|
|
|
| 6800 |
"version": "1.1.2",
|
| 6801 |
"resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz",
|
| 6802 |
"integrity": "sha512-7emPTl6Dpo6JRXOXjLRxck+FlLRX5847cLKEn00PLAgc3g2hTZZgr+e4c2v6QpSmLeFP3n5yUo7ft6avBK/5jQ==",
|
|
|
|
| 6803 |
"engines": {
|
| 6804 |
"node": ">= 0.6"
|
| 6805 |
}
|
|
|
|
| 6808 |
"version": "1.8.1",
|
| 6809 |
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.8.1.tgz",
|
| 6810 |
"integrity": "sha512-Kpk9Sm7NmI+RHhnj6OIWDI1d6fIoFAtFt9RLaTMRlg/8w49juAStsrBgp0Dp4OdxdVbRIeKhtCUvoi/RuAhO4g==",
|
|
|
|
| 6811 |
"dependencies": {
|
| 6812 |
"depd": "~1.1.2",
|
| 6813 |
"inherits": "2.0.4",
|
|
|
|
| 6823 |
"version": "1.5.0",
|
| 6824 |
"resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz",
|
| 6825 |
"integrity": "sha512-OpZ3zP+jT1PI7I8nemJX4AKmAX070ZkYPVWV/AaKTJl+tXCTGyVdC1a4SL8RUQYEwk/f34ZX8UTykN68FwrqAA==",
|
|
|
|
| 6826 |
"engines": {
|
| 6827 |
"node": ">= 0.6"
|
| 6828 |
}
|
|
|
|
| 7023 |
"version": "2.1.0",
|
| 7024 |
"resolved": "https://registry.npmjs.org/inflation/-/inflation-2.1.0.tgz",
|
| 7025 |
"integrity": "sha512-t54PPJHG1Pp7VQvxyVCJ9mBbjG3Hqryges9bXoOO6GExCPa+//i/d5GSuFtpx3ALLd7lgIAur6zrIlBQyJuMlQ==",
|
|
|
|
| 7026 |
"engines": {
|
| 7027 |
"node": ">= 0.8.0"
|
| 7028 |
}
|
|
|
|
| 8398 |
"version": "1.1.0",
|
| 8399 |
"resolved": "https://registry.npmjs.org/keygrip/-/keygrip-1.1.0.tgz",
|
| 8400 |
"integrity": "sha512-iYSchDJ+liQ8iwbSI2QqsQOvqv58eJCEanyJPJi+Khyu8smkcKSFUCbPwzFcL7YVtZ6eONjqRX/38caJ7QjRAQ==",
|
|
|
|
| 8401 |
"dependencies": {
|
| 8402 |
"tsscmp": "1.0.6"
|
| 8403 |
},
|
|
|
|
| 8435 |
}
|
| 8436 |
},
|
| 8437 |
"node_modules/koa": {
|
| 8438 |
+
"version": "2.16.0",
|
| 8439 |
+
"resolved": "https://registry.npmjs.org/koa/-/koa-2.16.0.tgz",
|
| 8440 |
+
"integrity": "sha512-Afhqq0Vq3W7C+/rW6IqHVBDLzqObwZ07JaUNUEF8yCQ6afiyFE3RAy+i7V0E46XOWlH7vPWn/x0vsZwNy6PWxw==",
|
| 8441 |
+
"license": "MIT",
|
| 8442 |
"dependencies": {
|
| 8443 |
"accepts": "^1.3.5",
|
| 8444 |
"cache-content-type": "^1.0.0",
|
|
|
|
| 8485 |
"node_modules/koa-compose": {
|
| 8486 |
"version": "4.1.0",
|
| 8487 |
"resolved": "https://registry.npmjs.org/koa-compose/-/koa-compose-4.1.0.tgz",
|
| 8488 |
+
"integrity": "sha512-8ODW8TrDuMYvXRwra/Kh7/rJo9BtOfPc6qO8eAfC80CnCvSjSl0bkRM24X6/XBBEyj0v1nRUQ1LyOy3dbqOWXw=="
|
|
|
|
| 8489 |
},
|
| 8490 |
"node_modules/koa-convert": {
|
| 8491 |
"version": "2.0.0",
|
| 8492 |
"resolved": "https://registry.npmjs.org/koa-convert/-/koa-convert-2.0.0.tgz",
|
| 8493 |
"integrity": "sha512-asOvN6bFlSnxewce2e/DK3p4tltyfC4VM7ZwuTuepI7dEQVcvpyFuBcEARu1+Hxg8DIwytce2n7jrZtRlPrARA==",
|
|
|
|
| 8494 |
"dependencies": {
|
| 8495 |
"co": "^4.6.0",
|
| 8496 |
"koa-compose": "^4.1.0"
|
|
|
|
| 8503 |
"version": "1.8.1",
|
| 8504 |
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.8.1.tgz",
|
| 8505 |
"integrity": "sha512-Kpk9Sm7NmI+RHhnj6OIWDI1d6fIoFAtFt9RLaTMRlg/8w49juAStsrBgp0Dp4OdxdVbRIeKhtCUvoi/RuAhO4g==",
|
|
|
|
| 8506 |
"dependencies": {
|
| 8507 |
"depd": "~1.1.2",
|
| 8508 |
"inherits": "2.0.4",
|
|
|
|
| 8518 |
"version": "1.1.2",
|
| 8519 |
"resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz",
|
| 8520 |
"integrity": "sha512-7emPTl6Dpo6JRXOXjLRxck+FlLRX5847cLKEn00PLAgc3g2hTZZgr+e4c2v6QpSmLeFP3n5yUo7ft6avBK/5jQ==",
|
|
|
|
| 8521 |
"engines": {
|
| 8522 |
"node": ">= 0.6"
|
| 8523 |
}
|
|
|
|
| 8526 |
"version": "1.5.0",
|
| 8527 |
"resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz",
|
| 8528 |
"integrity": "sha512-OpZ3zP+jT1PI7I8nemJX4AKmAX070ZkYPVWV/AaKTJl+tXCTGyVdC1a4SL8RUQYEwk/f34ZX8UTykN68FwrqAA==",
|
|
|
|
| 8529 |
"engines": {
|
| 8530 |
"node": ">= 0.6"
|
| 8531 |
}
|
|
|
|
| 8720 |
"node_modules/lodash.merge": {
|
| 8721 |
"version": "4.6.2",
|
| 8722 |
"resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz",
|
| 8723 |
+
"integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ=="
|
|
|
|
| 8724 |
},
|
| 8725 |
"node_modules/lodash.once": {
|
| 8726 |
"version": "4.1.1",
|
|
|
|
| 9928 |
"node_modules/only": {
|
| 9929 |
"version": "0.0.2",
|
| 9930 |
"resolved": "https://registry.npmjs.org/only/-/only-0.0.2.tgz",
|
| 9931 |
+
"integrity": "sha512-Fvw+Jemq5fjjyWz6CpKx6w9s7xxqo3+JCyM0WXWeCSOboZ8ABkyvP8ID4CZuChA/wxSx+XSJmdOm8rGVyJ1hdQ=="
|
|
|
|
| 9932 |
},
|
| 9933 |
"node_modules/openai": {
|
| 9934 |
"version": "4.33.0",
|
|
|
|
| 10192 |
}
|
| 10193 |
},
|
| 10194 |
"node_modules/pdfjs-dist": {
|
| 10195 |
+
"version": "4.10.38",
|
| 10196 |
+
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-4.10.38.tgz",
|
| 10197 |
+
"integrity": "sha512-/Y3fcFrXEAsMjJXeL9J8+ZG9U01LbuWaYypvDW2ycW1jL269L3js3DVBjDJ0Up9Np1uqDXsDrRihHANhZOlwdQ==",
|
| 10198 |
+
"license": "Apache-2.0",
|
| 10199 |
"engines": {
|
| 10200 |
+
"node": ">=20"
|
| 10201 |
},
|
| 10202 |
"optionalDependencies": {
|
| 10203 |
+
"@napi-rs/canvas": "^0.1.65"
|
|
|
|
| 10204 |
}
|
| 10205 |
},
|
| 10206 |
"node_modules/peek-stream": {
|
|
|
|
| 12517 |
"version": "1.0.6",
|
| 12518 |
"resolved": "https://registry.npmjs.org/tsscmp/-/tsscmp-1.0.6.tgz",
|
| 12519 |
"integrity": "sha512-LxhtAkPDTkVCMQjt2h6eBVY28KCjikZqZfMcC15YBeNjkgUpdCfBu5HoiOTDu86v6smE8yOjyEktJ8hlbANHQA==",
|
|
|
|
| 12520 |
"engines": {
|
| 12521 |
"node": ">=0.6.x"
|
| 12522 |
}
|
|
|
|
| 13209 |
"version": "1.4.0",
|
| 13210 |
"resolved": "https://registry.npmjs.org/ylru/-/ylru-1.4.0.tgz",
|
| 13211 |
"integrity": "sha512-2OQsPNEmBCvXuFlIni/a+Rn+R2pHW9INm0BxXJ4hVDA8TirqMj+J/Rp9ItLatT/5pZqWwefVrTQcHpixsxnVlA==",
|
|
|
|
| 13212 |
"engines": {
|
| 13213 |
"node": ">= 4.0.0"
|
| 13214 |
}
|
|
@@ -1,15 +1,84 @@
|
|
| 1 |
{
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
"
|
| 5 |
-
"
|
| 6 |
-
"
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
"
|
| 10 |
-
"
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
{
|
| 2 |
+
"name": "reader",
|
| 3 |
+
"scripts": {
|
| 4 |
+
"lint": "eslint --ext .js,.ts .",
|
| 5 |
+
"build": "node ./integrity-check.cjs && tsc -p .",
|
| 6 |
+
"build:watch": "tsc --watch",
|
| 7 |
+
"build:clean": "rm -rf ./build",
|
| 8 |
+
"serve": "npm run build && npm run start",
|
| 9 |
+
"debug": "npm run build && npm run dev",
|
| 10 |
+
"start": "npm run shell"
|
| 11 |
+
},
|
| 12 |
+
"engines": {
|
| 13 |
+
"node": "20"
|
| 14 |
+
},
|
| 15 |
+
"main": "build/index.js",
|
| 16 |
+
"dependencies": {
|
| 17 |
+
"@esm2cjs/normalize-url": "^8.0.0",
|
| 18 |
+
"@google-cloud/translate": "^8.2.0",
|
| 19 |
+
"@koa/bodyparser": "^5.1.1",
|
| 20 |
+
"@mozilla/readability": "^0.5.0",
|
| 21 |
+
"@napi-rs/canvas": "^0.1.68",
|
| 22 |
+
"@types/turndown": "^5.0.4",
|
| 23 |
+
"@xmldom/xmldom": "^0.9.3",
|
| 24 |
+
"archiver": "^6.0.1",
|
| 25 |
+
"axios": "^1.3.3",
|
| 26 |
+
"bcrypt": "^5.1.0",
|
| 27 |
+
"busboy": "^1.6.0",
|
| 28 |
+
"civkit": "^0.8.4-32482a3",
|
| 29 |
+
"core-js": "^3.37.1",
|
| 30 |
+
"cors": "^2.8.5",
|
| 31 |
+
"dayjs": "^1.11.9",
|
| 32 |
+
"express": "^4.19.2",
|
| 33 |
+
"firebase-admin": "^12.1.0",
|
| 34 |
+
"firebase-functions": "^6.1.1",
|
| 35 |
+
"htmlparser2": "^9.0.0",
|
| 36 |
+
"jose": "^5.1.0",
|
| 37 |
+
"langdetect": "^0.2.1",
|
| 38 |
+
"linkedom": "^0.18.4",
|
| 39 |
+
"maxmind": "^4.3.18",
|
| 40 |
+
"minio": "^7.1.3",
|
| 41 |
+
"node-libcurl": "^4.1.0",
|
| 42 |
+
"openai": "^4.20.0",
|
| 43 |
+
"pdfjs-dist": "^4.10.38",
|
| 44 |
+
"puppeteer": "^23.3.0",
|
| 45 |
+
"puppeteer-extra": "^3.3.6",
|
| 46 |
+
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
| 47 |
+
"puppeteer-extra-plugin-page-proxy": "^1.3.1",
|
| 48 |
+
"puppeteer-page-proxy": "^1.3.0",
|
| 49 |
+
"robots-parser": "^3.0.1",
|
| 50 |
+
"set-cookie-parser": "^2.6.0",
|
| 51 |
+
"simple-zstd": "^1.4.2",
|
| 52 |
+
"stripe": "^11.11.0",
|
| 53 |
+
"tiktoken": "^1.0.16",
|
| 54 |
+
"tld-extract": "^2.1.0",
|
| 55 |
+
"turndown": "^7.1.3",
|
| 56 |
+
"turndown-plugin-gfm": "^1.0.2",
|
| 57 |
+
"undici": "^5.24.0"
|
| 58 |
+
},
|
| 59 |
+
"devDependencies": {
|
| 60 |
+
"@types/archiver": "^5.3.4",
|
| 61 |
+
"@types/bcrypt": "^5.0.0",
|
| 62 |
+
"@types/busboy": "^1.5.4",
|
| 63 |
+
"@types/cors": "^2.8.17",
|
| 64 |
+
"@types/generic-pool": "^3.8.1",
|
| 65 |
+
"@types/koa": "^2.15.0",
|
| 66 |
+
"@types/node": "^20.14.13",
|
| 67 |
+
"@types/set-cookie-parser": "^2.4.7",
|
| 68 |
+
"@types/xmldom": "^0.1.34",
|
| 69 |
+
"@typescript-eslint/eslint-plugin": "^5.12.0",
|
| 70 |
+
"@typescript-eslint/parser": "^5.12.0",
|
| 71 |
+
"eslint": "^8.9.0",
|
| 72 |
+
"eslint-config-google": "^0.14.0",
|
| 73 |
+
"eslint-plugin-import": "^2.25.4",
|
| 74 |
+
"firebase-functions-test": "^3.0.0",
|
| 75 |
+
"koa": "^2.16.0",
|
| 76 |
+
"pino-pretty": "^13.0.0",
|
| 77 |
+
"replicate": "^0.16.1",
|
| 78 |
+
"typescript": "^5.5.4"
|
| 79 |
+
},
|
| 80 |
+
"private": true,
|
| 81 |
+
"exports": {
|
| 82 |
+
".": "./build/index.js"
|
| 83 |
+
}
|
| 84 |
+
}
|
|
File without changes
|
|
@@ -1,30 +1,45 @@
|
|
| 1 |
-
import {
|
| 2 |
-
assignTransferProtocolMeta, marshalErrorLike,
|
| 3 |
-
RPCHost, RPCReflection,
|
| 4 |
-
AssertionFailureError, ParamValidationError, Defer,
|
| 5 |
-
} from 'civkit';
|
| 6 |
import { singleton } from 'tsyringe';
|
| 7 |
-
import {
|
| 8 |
-
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
| 9 |
-
import _ from 'lodash';
|
| 10 |
-
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
| 11 |
-
import { Request, Response } from 'express';
|
| 12 |
-
const pNormalizeUrl = import("@esm2cjs/normalize-url");
|
| 13 |
-
import { Crawled } from '../db/crawled';
|
| 14 |
import { randomUUID } from 'crypto';
|
| 15 |
-
import
|
| 16 |
|
| 17 |
-
import {
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
import { DomainBlockade } from '../db/domain-blockade';
|
| 21 |
import { DomainProfile } from '../db/domain-profile';
|
| 22 |
-
import {
|
|
|
|
|
|
|
| 23 |
import { JSDomControl } from '../services/jsdom';
|
| 24 |
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
|
| 25 |
import { CurlControl } from '../services/curl';
|
| 26 |
import { LmControl } from '../services/lm';
|
| 27 |
import { tryDecodeURIComponent } from '../utils/misc';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 30 |
withIframe?: boolean | 'quoted';
|
|
@@ -33,6 +48,8 @@ export interface ExtraScrappingOptions extends ScrappingOptions {
|
|
| 33 |
removeSelector?: string | string[];
|
| 34 |
keepImgDataUrl?: boolean;
|
| 35 |
engine?: string;
|
|
|
|
|
|
|
| 36 |
}
|
| 37 |
|
| 38 |
const indexProto = {
|
|
@@ -56,16 +73,18 @@ export class CrawlerHost extends RPCHost {
|
|
| 56 |
domainProfileRetentionMs = 1000 * 3600 * 24 * 30;
|
| 57 |
|
| 58 |
constructor(
|
| 59 |
-
protected globalLogger:
|
| 60 |
protected puppeteerControl: PuppeteerControl,
|
| 61 |
protected curlControl: CurlControl,
|
|
|
|
|
|
|
| 62 |
protected lmControl: LmControl,
|
| 63 |
protected jsdomControl: JSDomControl,
|
| 64 |
protected snapshotFormatter: SnapshotFormatter,
|
| 65 |
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
| 66 |
protected rateLimitControl: RateLimitControl,
|
| 67 |
-
protected threadLocal:
|
| 68 |
-
protected
|
| 69 |
) {
|
| 70 |
super(...arguments);
|
| 71 |
|
|
@@ -73,7 +92,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 73 |
if (!snapshot.title?.trim() && !snapshot.pdfs?.length) {
|
| 74 |
return;
|
| 75 |
}
|
| 76 |
-
if (options.cookies?.length) {
|
| 77 |
// Potential privacy issue, dont cache if cookies are used
|
| 78 |
return;
|
| 79 |
}
|
|
@@ -84,9 +103,14 @@ export class CrawlerHost extends RPCHost {
|
|
| 84 |
if (options.locale) {
|
| 85 |
Reflect.set(snapshot, 'locale', options.locale);
|
| 86 |
}
|
| 87 |
-
await this.setToCache(options.url, snapshot);
|
| 88 |
|
| 89 |
-
await this.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
});
|
| 91 |
|
| 92 |
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
|
|
@@ -108,12 +132,19 @@ export class CrawlerHost extends RPCHost {
|
|
| 108 |
override async init() {
|
| 109 |
await this.dependencyReady();
|
| 110 |
|
|
|
|
|
|
|
| 111 |
this.emit('ready');
|
| 112 |
}
|
| 113 |
|
| 114 |
-
getIndex(
|
| 115 |
const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
|
| 116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
Object.assign(indexObject, {
|
| 118 |
usage1: 'https://r.jina.ai/YOUR_URL',
|
| 119 |
usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY',
|
|
@@ -121,71 +152,83 @@ export class CrawlerHost extends RPCHost {
|
|
| 121 |
sourceCode: 'https://github.com/jina-ai/reader',
|
| 122 |
});
|
| 123 |
|
| 124 |
-
|
|
|
|
| 125 |
indexObject[''] = undefined;
|
| 126 |
-
indexObject.authenticatedAs = `${user.user_id} (${user.full_name})`;
|
| 127 |
-
indexObject.balanceLeft = user.wallet.total_balance;
|
| 128 |
}
|
| 129 |
|
| 130 |
return indexObject;
|
| 131 |
}
|
| 132 |
|
| 133 |
-
@
|
| 134 |
-
name: '
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
|
|
|
|
|
|
| 139 |
},
|
| 140 |
-
tags: ['
|
| 141 |
-
|
| 142 |
-
returnType: [String, OutputServerEventStream],
|
| 143 |
-
exposeRoot: true,
|
| 144 |
})
|
| 145 |
-
@
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
},
|
| 154 |
-
tags: ['
|
| 155 |
-
httpMethod: ['get', 'post'],
|
| 156 |
returnType: [String, OutputServerEventStream],
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
})
|
| 159 |
async crawl(
|
| 160 |
@RPCReflect() rpcReflect: RPCReflection,
|
| 161 |
-
@Ctx() ctx:
|
| 162 |
-
req: Request,
|
| 163 |
-
res: Response,
|
| 164 |
-
},
|
| 165 |
auth: JinaEmbeddingsAuthDTO,
|
| 166 |
crawlerOptionsHeaderOnly: CrawlerOptionsHeaderOnly,
|
| 167 |
crawlerOptionsParamsAllowed: CrawlerOptions,
|
| 168 |
) {
|
| 169 |
const uid = await auth.solveUID();
|
| 170 |
let chargeAmount = 0;
|
| 171 |
-
const crawlerOptions = ctx.
|
| 172 |
|
| 173 |
-
|
| 174 |
-
const targetUrl = await this.getTargetUrl(tryDecodeURIComponent(ctx.req.url), crawlerOptions);
|
| 175 |
if (!targetUrl) {
|
| 176 |
-
|
| 177 |
-
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
| 178 |
-
return this.getIndex(latestUser);
|
| 179 |
-
}
|
| 180 |
-
|
| 181 |
-
return assignTransferProtocolMeta(`${this.getIndex(latestUser)}`,
|
| 182 |
-
{ contentType: 'text/plain', envelope: null }
|
| 183 |
-
);
|
| 184 |
}
|
| 185 |
|
| 186 |
// Prevent circular crawling
|
| 187 |
this.puppeteerControl.circuitBreakerHosts.add(
|
| 188 |
-
ctx.
|
| 189 |
);
|
| 190 |
|
| 191 |
if (uid) {
|
|
@@ -222,8 +265,8 @@ export class CrawlerHost extends RPCHost {
|
|
| 222 |
apiRoll.chargeAmount = chargeAmount;
|
| 223 |
}
|
| 224 |
});
|
| 225 |
-
} else if (ctx.
|
| 226 |
-
const apiRoll = await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.
|
| 227 |
[
|
| 228 |
// 20 requests per minute
|
| 229 |
new Date(Date.now() - 60 * 1000), 20
|
|
@@ -254,9 +297,12 @@ export class CrawlerHost extends RPCHost {
|
|
| 254 |
}
|
| 255 |
}
|
| 256 |
|
|
|
|
|
|
|
|
|
|
| 257 |
|
| 258 |
const crawlOpts = await this.configure(crawlerOptions);
|
| 259 |
-
if (!ctx.
|
| 260 |
const sseStream = new OutputServerEventStream();
|
| 261 |
rpcReflect.return(sseStream);
|
| 262 |
|
|
@@ -265,8 +311,11 @@ export class CrawlerHost extends RPCHost {
|
|
| 265 |
if (!scrapped) {
|
| 266 |
continue;
|
| 267 |
}
|
|
|
|
|
|
|
|
|
|
| 268 |
|
| 269 |
-
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
| 270 |
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
| 271 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 272 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
|
@@ -293,17 +342,20 @@ export class CrawlerHost extends RPCHost {
|
|
| 293 |
}
|
| 294 |
|
| 295 |
let lastScrapped;
|
| 296 |
-
if (!ctx.
|
| 297 |
for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
|
| 298 |
lastScrapped = scrapped;
|
|
|
|
|
|
|
|
|
|
| 299 |
if (!crawlerOptions.isEarlyReturnApplicable()) {
|
| 300 |
continue;
|
| 301 |
}
|
| 302 |
-
if (crawlerOptions.waitForSelector ||
|
| 303 |
continue;
|
| 304 |
}
|
| 305 |
|
| 306 |
-
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
| 307 |
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
| 308 |
|
| 309 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
|
@@ -324,7 +376,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 324 |
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
| 325 |
}
|
| 326 |
|
| 327 |
-
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
|
| 328 |
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
| 329 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 330 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
|
@@ -342,16 +394,18 @@ export class CrawlerHost extends RPCHost {
|
|
| 342 |
|
| 343 |
for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
|
| 344 |
lastScrapped = scrapped;
|
| 345 |
-
|
|
|
|
|
|
|
| 346 |
if (!crawlerOptions.isEarlyReturnApplicable()) {
|
| 347 |
continue;
|
| 348 |
}
|
| 349 |
|
| 350 |
-
if (crawlerOptions.waitForSelector ||
|
| 351 |
continue;
|
| 352 |
}
|
| 353 |
|
| 354 |
-
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
| 355 |
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
| 356 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 357 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
|
@@ -370,7 +424,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 370 |
);
|
| 371 |
}
|
| 372 |
|
| 373 |
-
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
|
| 374 |
}
|
| 375 |
|
| 376 |
if (!lastScrapped) {
|
|
@@ -380,7 +434,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 380 |
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
| 381 |
}
|
| 382 |
|
| 383 |
-
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
|
| 384 |
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
| 385 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 386 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
|
@@ -399,7 +453,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 399 |
);
|
| 400 |
}
|
| 401 |
|
| 402 |
-
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
|
| 403 |
|
| 404 |
}
|
| 405 |
|
|
@@ -419,7 +473,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 419 |
}
|
| 420 |
|
| 421 |
let result: URL;
|
| 422 |
-
const normalizeUrl = (
|
| 423 |
try {
|
| 424 |
result = new URL(
|
| 425 |
normalizeUrl(
|
|
@@ -638,7 +692,25 @@ export class CrawlerHost extends RPCHost {
|
|
| 638 |
}
|
| 639 |
|
| 640 |
if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
|
| 641 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 642 |
return;
|
| 643 |
}
|
| 644 |
|
|
@@ -653,27 +725,69 @@ export class CrawlerHost extends RPCHost {
|
|
| 653 |
(!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
|
| 654 |
(_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
|
| 655 |
) {
|
|
|
|
|
|
|
|
|
|
| 656 |
yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
|
| 657 |
|
| 658 |
return;
|
| 659 |
}
|
| 660 |
|
| 661 |
-
|
| 662 |
-
const
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
if (crawlOpts?.engine === ENGINE_TYPE.AUTO) {
|
| 671 |
-
return;
|
| 672 |
}
|
| 673 |
-
|
| 674 |
-
this.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 675 |
}
|
| 676 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 677 |
}
|
| 678 |
|
| 679 |
try {
|
|
@@ -782,6 +896,8 @@ export class CrawlerHost extends RPCHost {
|
|
| 782 |
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
|
| 783 |
this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl);
|
| 784 |
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
|
|
|
|
|
|
|
| 785 |
this.threadLocal.set('userAgent', opts.userAgent);
|
| 786 |
if (opts.timeout) {
|
| 787 |
this.threadLocal.set('timeout', opts.timeout * 1000);
|
|
@@ -804,6 +920,9 @@ export class CrawlerHost extends RPCHost {
|
|
| 804 |
referer: opts.referer,
|
| 805 |
viewport: opts.viewport,
|
| 806 |
engine: opts.engine,
|
|
|
|
|
|
|
|
|
|
| 807 |
};
|
| 808 |
|
| 809 |
if (opts.locale) {
|
|
@@ -842,14 +961,15 @@ export class CrawlerHost extends RPCHost {
|
|
| 842 |
return crawlOpts;
|
| 843 |
}
|
| 844 |
|
| 845 |
-
formatSnapshot(
|
| 846 |
crawlerOptions: CrawlerOptions,
|
| 847 |
snapshot: PageSnapshot & {
|
| 848 |
screenshotUrl?: string;
|
| 849 |
pageshotUrl?: string;
|
| 850 |
},
|
| 851 |
nominalUrl?: URL,
|
| 852 |
-
urlValidMs?: number
|
|
|
|
| 853 |
) {
|
| 854 |
const presumedURL = crawlerOptions.base === 'final' ? new URL(snapshot.href) : nominalUrl;
|
| 855 |
|
|
@@ -870,7 +990,29 @@ export class CrawlerHost extends RPCHost {
|
|
| 870 |
return output;
|
| 871 |
}
|
| 872 |
|
| 873 |
-
return this.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 874 |
}
|
| 875 |
|
| 876 |
async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
|
|
@@ -967,6 +1109,26 @@ export class CrawlerHost extends RPCHost {
|
|
| 967 |
return;
|
| 968 |
}
|
| 969 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 970 |
getDomainProfileUrlDigest(url: URL) {
|
| 971 |
const pathname = url.pathname;
|
| 972 |
const pathVec = pathname.split('/');
|
|
@@ -981,4 +1143,29 @@ export class CrawlerHost extends RPCHost {
|
|
| 981 |
path: finalPath,
|
| 982 |
};
|
| 983 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 984 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import { singleton } from 'tsyringe';
|
| 2 |
+
import { pathToFileURL } from 'url';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import { randomUUID } from 'crypto';
|
| 4 |
+
import _ from 'lodash';
|
| 5 |
|
| 6 |
+
import {
|
| 7 |
+
assignTransferProtocolMeta, RPCHost, RPCReflection,
|
| 8 |
+
AssertionFailureError, ParamValidationError,
|
| 9 |
+
RawString,
|
| 10 |
+
ApplicationError,
|
| 11 |
+
} from 'civkit/civ-rpc';
|
| 12 |
+
import { marshalErrorLike } from 'civkit/lang';
|
| 13 |
+
import { Defer } from 'civkit/defer';
|
| 14 |
+
import { retryWith } from 'civkit/decorators';
|
| 15 |
+
|
| 16 |
+
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/crawler-options';
|
| 17 |
+
|
| 18 |
+
import { Crawled } from '../db/crawled';
|
| 19 |
import { DomainBlockade } from '../db/domain-blockade';
|
| 20 |
import { DomainProfile } from '../db/domain-profile';
|
| 21 |
+
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
| 22 |
+
|
| 23 |
+
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
| 24 |
import { JSDomControl } from '../services/jsdom';
|
| 25 |
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
|
| 26 |
import { CurlControl } from '../services/curl';
|
| 27 |
import { LmControl } from '../services/lm';
|
| 28 |
import { tryDecodeURIComponent } from '../utils/misc';
|
| 29 |
+
import { CFBrowserRendering } from '../services/cf-browser-rendering';
|
| 30 |
+
|
| 31 |
+
import { GlobalLogger } from '../services/logger';
|
| 32 |
+
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
| 33 |
+
import { AsyncLocalContext } from '../services/async-context';
|
| 34 |
+
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
|
| 35 |
+
import { BudgetExceededError, InsufficientBalanceError, SecurityCompromiseError } from '../services/errors';
|
| 36 |
+
|
| 37 |
+
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
| 38 |
+
import { ProxyProvider } from '../shared/services/proxy-provider';
|
| 39 |
+
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
| 40 |
+
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
| 41 |
+
import { RobotsTxtService } from '../services/robots-text';
|
| 42 |
+
import { ServiceBadAttemptError } from '../shared/lib/errors';
|
| 43 |
|
| 44 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 45 |
withIframe?: boolean | 'quoted';
|
|
|
|
| 48 |
removeSelector?: string | string[];
|
| 49 |
keepImgDataUrl?: boolean;
|
| 50 |
engine?: string;
|
| 51 |
+
allocProxy?: string;
|
| 52 |
+
private?: boolean;
|
| 53 |
}
|
| 54 |
|
| 55 |
const indexProto = {
|
|
|
|
| 73 |
domainProfileRetentionMs = 1000 * 3600 * 24 * 30;
|
| 74 |
|
| 75 |
constructor(
|
| 76 |
+
protected globalLogger: GlobalLogger,
|
| 77 |
protected puppeteerControl: PuppeteerControl,
|
| 78 |
protected curlControl: CurlControl,
|
| 79 |
+
protected cfBrowserRendering: CFBrowserRendering,
|
| 80 |
+
protected proxyProvider: ProxyProvider,
|
| 81 |
protected lmControl: LmControl,
|
| 82 |
protected jsdomControl: JSDomControl,
|
| 83 |
protected snapshotFormatter: SnapshotFormatter,
|
| 84 |
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
| 85 |
protected rateLimitControl: RateLimitControl,
|
| 86 |
+
protected threadLocal: AsyncLocalContext,
|
| 87 |
+
protected robotsTxtService: RobotsTxtService,
|
| 88 |
) {
|
| 89 |
super(...arguments);
|
| 90 |
|
|
|
|
| 92 |
if (!snapshot.title?.trim() && !snapshot.pdfs?.length) {
|
| 93 |
return;
|
| 94 |
}
|
| 95 |
+
if (options.cookies?.length || options.private) {
|
| 96 |
// Potential privacy issue, dont cache if cookies are used
|
| 97 |
return;
|
| 98 |
}
|
|
|
|
| 103 |
if (options.locale) {
|
| 104 |
Reflect.set(snapshot, 'locale', options.locale);
|
| 105 |
}
|
|
|
|
| 106 |
|
| 107 |
+
const analyzed = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html);
|
| 108 |
+
if (analyzed.tokens < 200) {
|
| 109 |
+
// Does not contain enough content
|
| 110 |
+
return;
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
await this.setToCache(options.url, snapshot);
|
| 114 |
});
|
| 115 |
|
| 116 |
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
|
|
|
|
| 132 |
/**
 * Service start-up hook.
 *
 * Waits for `dependencyReady()`, then hands the curl controller a user agent
 * derived from the puppeteer one with the first case-insensitive occurrence
 * of "Headless" removed, and finally emits `'ready'`.
 */
override async init() {
    await this.dependencyReady();

    // Strip the "Headless" marker from the browser UA before curl impersonates it.
    const chromeLikeUserAgent = this.puppeteerControl.ua.replace(/Headless/i, '');
    this.curlControl.impersonateChrome(chromeLikeUserAgent);

    this.emit('ready');
}
|
| 139 |
|
| 140 |
+
async getIndex(auth?: JinaEmbeddingsAuthDTO) {
|
| 141 |
const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
|
| 142 |
+
// Object.assign(indexObject, {
|
| 143 |
+
// usage1: `${ctx.origin}/YOUR_URL`,
|
| 144 |
+
// usage2: `${ctx.origin}/search/YOUR_SEARCH_QUERY`,
|
| 145 |
+
// homepage: 'https://jina.ai/reader',
|
| 146 |
+
// sourceCode: 'https://github.com/jina-ai/reader',
|
| 147 |
+
// });
|
| 148 |
Object.assign(indexObject, {
|
| 149 |
usage1: 'https://r.jina.ai/YOUR_URL',
|
| 150 |
usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY',
|
|
|
|
| 152 |
sourceCode: 'https://github.com/jina-ai/reader',
|
| 153 |
});
|
| 154 |
|
| 155 |
+
await auth?.solveUID();
|
| 156 |
+
if (auth && auth.user) {
|
| 157 |
indexObject[''] = undefined;
|
| 158 |
+
indexObject.authenticatedAs = `${auth.user.user_id} (${auth.user.full_name})`;
|
| 159 |
+
indexObject.balanceLeft = auth.user.wallet.total_balance;
|
| 160 |
}
|
| 161 |
|
| 162 |
return indexObject;
|
| 163 |
}
|
| 164 |
|
| 165 |
+
/**
 * RPC handler registered as `getIndex` for HTTP `get` on `/`.
 *
 * Builds the index object through `getIndex(auth)` and picks the response
 * form from the request's accepted content types:
 *  - when `text/plain` is NOT accepted but `text/json` or `application/json`
 *    is, the index object itself is returned;
 *  - otherwise the object is interpolated into a string and returned with
 *    `text/plain; charset=utf-8` transfer metadata.
 *
 * @param ctx  Request context used for content negotiation.
 * @param auth Optional auth DTO, forwarded to `getIndex`.
 */
@Method({
    name: 'getIndex',
    description: 'Index of the service',
    proto: {
        http: {
            action: 'get',
            path: '/',
        }
    },
    tags: ['misc', 'crawl'],
    returnType: [String, Object],
})
async getIndexCtrl(@Ctx() ctx: Context, @Param({ required: false }) auth?: JinaEmbeddingsAuthDTO) {
    const index = await this.getIndex(auth);

    // Plain text wins whenever it is acceptable; JSON is only served otherwise.
    const jsonPreferred = !ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'));
    if (jsonPreferred) {
        return index;
    }

    // NOTE(review): the string form relies on the object's own string conversion
    // (presumably supplied by its prototype set up in getIndex) — confirm.
    const plainText = `${index}`;

    return assignTransferProtocolMeta(plainText,
        { contentType: 'text/plain; charset=utf-8', envelope: null }
    );
}
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
@Method({
|
| 191 |
+
name: 'crawlByPostingToIndex',
|
| 192 |
+
description: 'Crawl any url into markdown',
|
| 193 |
+
proto: {
|
| 194 |
+
http: {
|
| 195 |
+
action: 'POST',
|
| 196 |
+
path: '/',
|
| 197 |
+
}
|
| 198 |
},
|
| 199 |
+
tags: ['crawl'],
|
|
|
|
| 200 |
returnType: [String, OutputServerEventStream],
|
| 201 |
+
})
|
| 202 |
+
@Method({
|
| 203 |
+
description: 'Crawl any url into markdown',
|
| 204 |
+
proto: {
|
| 205 |
+
http: {
|
| 206 |
+
action: ['GET', 'POST'],
|
| 207 |
+
path: '::url',
|
| 208 |
+
}
|
| 209 |
+
},
|
| 210 |
+
tags: ['crawl'],
|
| 211 |
+
returnType: [String, OutputServerEventStream, RawString],
|
| 212 |
})
|
| 213 |
async crawl(
|
| 214 |
@RPCReflect() rpcReflect: RPCReflection,
|
| 215 |
+
@Ctx() ctx: Context,
|
|
|
|
|
|
|
|
|
|
| 216 |
auth: JinaEmbeddingsAuthDTO,
|
| 217 |
crawlerOptionsHeaderOnly: CrawlerOptionsHeaderOnly,
|
| 218 |
crawlerOptionsParamsAllowed: CrawlerOptions,
|
| 219 |
) {
|
| 220 |
const uid = await auth.solveUID();
|
| 221 |
let chargeAmount = 0;
|
| 222 |
+
const crawlerOptions = ctx.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed;
|
| 223 |
|
| 224 |
+
const targetUrl = await this.getTargetUrl(tryDecodeURIComponent(ctx.path), crawlerOptions);
|
|
|
|
| 225 |
if (!targetUrl) {
|
| 226 |
+
return await this.getIndex(auth);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
}
|
| 228 |
|
| 229 |
// Prevent circular crawling
|
| 230 |
this.puppeteerControl.circuitBreakerHosts.add(
|
| 231 |
+
ctx.hostname.toLowerCase()
|
| 232 |
);
|
| 233 |
|
| 234 |
if (uid) {
|
|
|
|
| 265 |
apiRoll.chargeAmount = chargeAmount;
|
| 266 |
}
|
| 267 |
});
|
| 268 |
+
} else if (ctx.ip) {
|
| 269 |
+
const apiRoll = await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.ip, [rpcReflect.name.toUpperCase()],
|
| 270 |
[
|
| 271 |
// 20 requests per minute
|
| 272 |
new Date(Date.now() - 60 * 1000), 20
|
|
|
|
| 297 |
}
|
| 298 |
}
|
| 299 |
|
| 300 |
+
if (crawlerOptions.robotsTxt) {
|
| 301 |
+
await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
|
| 302 |
+
}
|
| 303 |
|
| 304 |
const crawlOpts = await this.configure(crawlerOptions);
|
| 305 |
+
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
|
| 306 |
const sseStream = new OutputServerEventStream();
|
| 307 |
rpcReflect.return(sseStream);
|
| 308 |
|
|
|
|
| 311 |
if (!scrapped) {
|
| 312 |
continue;
|
| 313 |
}
|
| 314 |
+
if (rpcReflect.signal.aborted) {
|
| 315 |
+
break;
|
| 316 |
+
}
|
| 317 |
|
| 318 |
+
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 319 |
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
| 320 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 321 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
|
|
|
| 342 |
}
|
| 343 |
|
| 344 |
let lastScrapped;
|
| 345 |
+
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
|
| 346 |
for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
|
| 347 |
lastScrapped = scrapped;
|
| 348 |
+
if (rpcReflect.signal.aborted) {
|
| 349 |
+
break;
|
| 350 |
+
}
|
| 351 |
if (!crawlerOptions.isEarlyReturnApplicable()) {
|
| 352 |
continue;
|
| 353 |
}
|
| 354 |
+
if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) {
|
| 355 |
continue;
|
| 356 |
}
|
| 357 |
|
| 358 |
+
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 359 |
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
| 360 |
|
| 361 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
|
|
|
| 376 |
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
| 377 |
}
|
| 378 |
|
| 379 |
+
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 380 |
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
| 381 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 382 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
|
|
|
| 394 |
|
| 395 |
for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
|
| 396 |
lastScrapped = scrapped;
|
| 397 |
+
if (rpcReflect.signal.aborted) {
|
| 398 |
+
break;
|
| 399 |
+
}
|
| 400 |
if (!crawlerOptions.isEarlyReturnApplicable()) {
|
| 401 |
continue;
|
| 402 |
}
|
| 403 |
|
| 404 |
+
if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) {
|
| 405 |
continue;
|
| 406 |
}
|
| 407 |
|
| 408 |
+
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 409 |
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
| 410 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 411 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
|
|
|
| 424 |
);
|
| 425 |
}
|
| 426 |
|
| 427 |
+
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
|
| 428 |
}
|
| 429 |
|
| 430 |
if (!lastScrapped) {
|
|
|
|
| 434 |
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
| 435 |
}
|
| 436 |
|
| 437 |
+
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
|
| 438 |
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
| 439 |
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
| 440 |
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
|
|
|
| 453 |
);
|
| 454 |
}
|
| 455 |
|
| 456 |
+
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
|
| 457 |
|
| 458 |
}
|
| 459 |
|
|
|
|
| 473 |
}
|
| 474 |
|
| 475 |
let result: URL;
|
| 476 |
+
const normalizeUrl = require('@esm2cjs/normalize-url').default;
|
| 477 |
try {
|
| 478 |
result = new URL(
|
| 479 |
normalizeUrl(
|
|
|
|
| 692 |
}
|
| 693 |
|
| 694 |
if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
|
| 695 |
+
const sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
|
| 696 |
+
await this.sideLoadWithAllocatedProxy(urlToCrawl, crawlOpts) :
|
| 697 |
+
await this.curlControl.sideLoad(urlToCrawl, crawlOpts);
|
| 698 |
+
if (!sideLoaded.file) {
|
| 699 |
+
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
|
| 700 |
+
}
|
| 701 |
+
const draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName);
|
| 702 |
+
yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
|
| 703 |
+
return;
|
| 704 |
+
}
|
| 705 |
+
if (crawlOpts?.engine === ENGINE_TYPE.CF_BROWSER_RENDERING) {
|
| 706 |
+
const html = await this.cfBrowserRendering.fetchContent(urlToCrawl.href);
|
| 707 |
+
const snapshot = {
|
| 708 |
+
href: urlToCrawl.toString(),
|
| 709 |
+
html,
|
| 710 |
+
title: '',
|
| 711 |
+
text: '',
|
| 712 |
+
} as PageSnapshot;
|
| 713 |
+
yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
|
| 714 |
return;
|
| 715 |
}
|
| 716 |
|
|
|
|
| 725 |
(!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
|
| 726 |
(_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
|
| 727 |
) {
|
| 728 |
+
if (cache.snapshot) {
|
| 729 |
+
cache.snapshot.isFromCache = true;
|
| 730 |
+
}
|
| 731 |
yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
|
| 732 |
|
| 733 |
return;
|
| 734 |
}
|
| 735 |
|
| 736 |
+
try {
|
| 737 |
+
const altOpts = { ...crawlOpts };
|
| 738 |
+
let sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
|
| 739 |
+
await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts) :
|
| 740 |
+
await this.curlControl.sideLoad(urlToCrawl, altOpts).catch((err) => {
|
| 741 |
+
this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
|
| 742 |
+
|
| 743 |
+
if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
|
| 744 |
+
return Promise.reject(err);
|
|
|
|
|
|
|
| 745 |
}
|
| 746 |
+
|
| 747 |
+
return this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
|
| 748 |
+
});
|
| 749 |
+
if (!sideLoaded.file) {
|
| 750 |
+
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
|
| 751 |
+
}
|
| 752 |
+
let draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName);
|
| 753 |
+
if (sideLoaded.status == 200 && !sideLoaded.contentType.startsWith('text/html')) {
|
| 754 |
+
yield draftSnapshot;
|
| 755 |
+
return;
|
| 756 |
+
}
|
| 757 |
+
|
| 758 |
+
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
|
| 759 |
+
draftSnapshot.title ??= analyzed.title;
|
| 760 |
+
let fallbackProxyIsUsed = false;
|
| 761 |
+
if ((!crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) && (analyzed.tokens < 42 || sideLoaded.status !== 200)) {
|
| 762 |
+
const proxyLoaded = await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
|
| 763 |
+
if (!proxyLoaded.file) {
|
| 764 |
+
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
|
| 765 |
+
}
|
| 766 |
+
const proxySnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, proxyLoaded.file, proxyLoaded.contentType, proxyLoaded.fileName);
|
| 767 |
+
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
|
| 768 |
+
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
|
| 769 |
+
draftSnapshot = proxySnapshot;
|
| 770 |
+
sideLoaded = proxyLoaded;
|
| 771 |
+
fallbackProxyIsUsed = true;
|
| 772 |
+
}
|
| 773 |
+
}
|
| 774 |
+
|
| 775 |
+
if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
|
| 776 |
+
yield draftSnapshot;
|
| 777 |
+
}
|
| 778 |
+
|
| 779 |
+
if (crawlOpts && (sideLoaded.status === 200 || analyzed.tokens >= 200 || crawlOpts.allocProxy)) {
|
| 780 |
+
this.logger.info(`Side load seems to work, applying to crawler.`, { url: urlToCrawl.href });
|
| 781 |
+
crawlOpts.sideLoad ??= sideLoaded.sideLoadOpts;
|
| 782 |
+
if (fallbackProxyIsUsed) {
|
| 783 |
+
this.logger.info(`Proxy seems to salvage the page`, { url: urlToCrawl.href });
|
| 784 |
}
|
| 785 |
}
|
| 786 |
+
} catch (err: any) {
|
| 787 |
+
this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
|
| 788 |
+
if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
|
| 789 |
+
throw err;
|
| 790 |
+
}
|
| 791 |
}
|
| 792 |
|
| 793 |
try {
|
|
|
|
| 896 |
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
|
| 897 |
this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl);
|
| 898 |
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
|
| 899 |
+
this.threadLocal.set('withIframe', opts.withIframe);
|
| 900 |
+
this.threadLocal.set('withShadowDom', opts.withShadowDom);
|
| 901 |
this.threadLocal.set('userAgent', opts.userAgent);
|
| 902 |
if (opts.timeout) {
|
| 903 |
this.threadLocal.set('timeout', opts.timeout * 1000);
|
|
|
|
| 920 |
referer: opts.referer,
|
| 921 |
viewport: opts.viewport,
|
| 922 |
engine: opts.engine,
|
| 923 |
+
allocProxy: opts.proxy?.endsWith('+') ? opts.proxy.slice(0, -1) : opts.proxy,
|
| 924 |
+
proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false,
|
| 925 |
+
private: Boolean(opts.doNotTrack),
|
| 926 |
};
|
| 927 |
|
| 928 |
if (opts.locale) {
|
|
|
|
| 961 |
return crawlOpts;
|
| 962 |
}
|
| 963 |
|
| 964 |
+
protected async formatSnapshot(
|
| 965 |
crawlerOptions: CrawlerOptions,
|
| 966 |
snapshot: PageSnapshot & {
|
| 967 |
screenshotUrl?: string;
|
| 968 |
pageshotUrl?: string;
|
| 969 |
},
|
| 970 |
nominalUrl?: URL,
|
| 971 |
+
urlValidMs?: number,
|
| 972 |
+
scrappingOptions?: ScrappingOptions
|
| 973 |
) {
|
| 974 |
const presumedURL = crawlerOptions.base === 'final' ? new URL(snapshot.href) : nominalUrl;
|
| 975 |
|
|
|
|
| 990 |
return output;
|
| 991 |
}
|
| 992 |
|
| 993 |
+
return this.formatSnapshotWithPDFSideLoad(respondWith, snapshot, presumedURL, urlValidMs, scrappingOptions);
|
| 994 |
+
}
|
| 995 |
+
|
| 996 |
+
/**
 * Formats a snapshot, first swapping a remote PDF reference for a local file URL.
 *
 * Works on a deep clone, so the caller's snapshot is not modified. Only the
 * first entry of `pdfs` is considered, and only when it starts with `http`:
 *  1. if `scrappingOptions.sideLoad.impersonate` already holds a body for that
 *     URL, its `filePath` is used;
 *  2. otherwise the URL is fetched through `curlControl.sideLoad`, and the
 *     resulting file's `filePath` is used when a file came back.
 * The entry is left untouched when neither source yields a file.
 *
 * @param mode             Output mode passed through to the snapshot formatter.
 * @param snapshot         Snapshot to format.
 * @param nominalUrl       Passed through to the snapshot formatter.
 * @param urlValidMs       Passed through to the snapshot formatter.
 * @param scrappingOptions Supplies pre-loaded bodies and is forwarded to curl.
 * @returns Whatever `snapshotFormatter.formatSnapshot` returns for the clone.
 */
async formatSnapshotWithPDFSideLoad(mode: string, snapshot: PageSnapshot, nominalUrl?: URL, urlValidMs?: number, scrappingOptions?: ScrappingOptions) {
    const working = _.cloneDeep(snapshot);
    const pdfs = working.pdfs;

    if (pdfs?.length && pdfs[0].startsWith('http')) {
        const remotePdf = pdfs[0];
        const preloaded = scrappingOptions?.sideLoad?.impersonate[remotePdf];

        if (preloaded?.body) {
            // Already downloaded during side loading: reuse the file on disk.
            pdfs[0] = pathToFileURL(await preloaded.body.filePath).href;
        } else {
            const fetched = await this.curlControl.sideLoad(new URL(remotePdf), scrappingOptions);
            if (fetched.file) {
                pdfs[0] = pathToFileURL(await fetched.file.filePath).href;
            }
        }
    }

    return this.snapshotFormatter.formatSnapshot(mode, working, nominalUrl, urlValidMs);
}
|
| 1017 |
|
| 1018 |
async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
|
|
|
|
| 1109 |
return;
|
| 1110 |
}
|
| 1111 |
|
| 1112 |
+
/**
 * Decides whether a snapshot is too thin to be returned early.
 *
 * Checks, in order:
 *  - any `pdfs` entry        -> good enough (`false`);
 *  - missing/empty `title`   -> not good enough (`true`);
 *  - `parsed.content` present -> good enough (`false`);
 *  - `html` present          -> not good enough only when the lite HTML
 *    analysis reports fewer than 200 tokens;
 *  - nothing to analyze      -> good enough (`false`).
 *
 * @param snapshot Snapshot to assess.
 * @returns `true` when the snapshot should be skipped for now.
 */
async snapshotNotGoodEnough(snapshot: PageSnapshot) {
    if (snapshot.pdfs?.length) {
        return false;
    }
    if (!snapshot.title) {
        return true;
    }
    if (snapshot.parsed?.content || !snapshot.html) {
        return false;
    }

    const { tokens } = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html);

    return tokens < 200;
}
|
| 1131 |
+
|
| 1132 |
getDomainProfileUrlDigest(url: URL) {
|
| 1133 |
const pathname = url.pathname;
|
| 1134 |
const pathVec = pathname.split('/');
|
|
|
|
| 1143 |
path: finalPath,
|
| 1144 |
};
|
| 1145 |
}
|
| 1146 |
+
|
| 1147 |
+
/**
 * Side loads `url` through a proxy obtained from the proxy provider.
 *
 * The retry predicate answers `true` for `ServiceBadAttemptError` (keep
 * trying), `false` for any other `ApplicationError` (quit with that error),
 * and `undefined` for everything else. The second decorator argument is `3`
 * (presumably the attempt limit — confirm against `retryWith`).
 *
 * Side effect: when `opts.allocProxy` is set, `opts.proxyUrl` is filled with
 * the allocated proxy's href unless it already has a value.
 *
 * @param url  Target to fetch.
 * @param opts Scrapping options; `allocProxy` is passed to the proxy provider.
 * @returns The curl side-load result with the allocated `proxy` added.
 */
@retryWith((err) => {
    if (err instanceof ServiceBadAttemptError) {
        // Keep trying
        return true;
    }

    // Quit with this error when it is an application error; otherwise no verdict.
    return err instanceof ApplicationError ? false : undefined;
}, 3)
async sideLoadWithAllocatedProxy(url: URL, opts?: ExtraScrappingOptions) {
    const allocated = await this.proxyProvider.alloc(opts?.allocProxy);
    const proxiedOpts = {
        ...opts,
        proxyUrl: allocated.href,
    };
    const loaded = await this.curlControl.sideLoad(url, proxiedOpts);

    // Remember the proxy on the caller's options only after a successful load.
    if (opts?.allocProxy) {
        opts.proxyUrl ??= allocated.href;
    }

    return { ...loaded, proxy: allocated };
}
|
| 1171 |
}
|
|
@@ -1,21 +1,25 @@
|
|
| 1 |
-
import {
|
| 2 |
-
assignTransferProtocolMeta, marshalErrorLike,
|
| 3 |
-
RPCHost, RPCReflection,
|
| 4 |
-
AssertionFailureError,
|
| 5 |
-
objHashMd5B64Of,
|
| 6 |
-
assignMeta,
|
| 7 |
-
} from 'civkit';
|
| 8 |
import { singleton } from 'tsyringe';
|
| 9 |
-
import {
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
| 11 |
import _ from 'lodash';
|
| 12 |
-
|
| 13 |
-
import {
|
|
|
|
| 14 |
import { CrawlerHost, ExtraScrappingOptions } from './crawler';
|
| 15 |
import { SerperSearchResult } from '../db/searched';
|
| 16 |
-
import { CrawlerOptions } from '../dto/
|
| 17 |
import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
|
| 18 |
import { GoogleSearchExplicitOperatorsDto, SerperSearchService } from '../services/serper-search';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
import { SerperSearchQueryParams, SerperSearchResponse, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
|
| 20 |
|
| 21 |
const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES);
|
|
@@ -33,9 +37,9 @@ export class SearcherHost extends RPCHost {
|
|
| 33 |
targetResultCount = 5;
|
| 34 |
|
| 35 |
constructor(
|
| 36 |
-
protected globalLogger:
|
| 37 |
protected rateLimitControl: RateLimitControl,
|
| 38 |
-
protected threadLocal:
|
| 39 |
protected serperSearchService: SerperSearchService,
|
| 40 |
protected crawler: CrawlerHost,
|
| 41 |
protected snapshotFormatter: SnapshotFormatter,
|
|
@@ -49,39 +53,30 @@ export class SearcherHost extends RPCHost {
|
|
| 49 |
this.emit('ready');
|
| 50 |
}
|
| 51 |
|
| 52 |
-
@
|
| 53 |
-
name: '
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
},
|
| 60 |
-
tags: ['
|
| 61 |
-
httpMethod: ['get', 'post'],
|
| 62 |
returnType: [String, OutputServerEventStream],
|
| 63 |
-
exposeRoot: true,
|
| 64 |
})
|
| 65 |
-
@
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
maxInstances: 200,
|
| 72 |
-
minInstances: 1,
|
| 73 |
},
|
| 74 |
-
tags: ['
|
| 75 |
-
|
| 76 |
-
returnType: [String, OutputServerEventStream],
|
| 77 |
-
exposeRoot: true,
|
| 78 |
})
|
| 79 |
async search(
|
| 80 |
@RPCReflect() rpcReflect: RPCReflection,
|
| 81 |
-
@Ctx() ctx:
|
| 82 |
-
req: Request,
|
| 83 |
-
res: Response,
|
| 84 |
-
},
|
| 85 |
auth: JinaEmbeddingsAuthDTO,
|
| 86 |
crawlerOptions: CrawlerOptions,
|
| 87 |
searchExplicitOperators: GoogleSearchExplicitOperatorsDto,
|
|
@@ -102,19 +97,17 @@ export class SearcherHost extends RPCHost {
|
|
| 102 |
|
| 103 |
const uid = await auth.solveUID();
|
| 104 |
// Return content by default
|
| 105 |
-
const
|
| 106 |
-
const
|
| 107 |
-
const withFavicon = ctx.req.get('X-With-Favicons') === 'true';
|
| 108 |
|
| 109 |
let chargeAmount = 0;
|
| 110 |
-
const noSlashPath = decodeURIComponent(ctx.
|
| 111 |
if (!noSlashPath && !q) {
|
| 112 |
-
const
|
| 113 |
-
const index = this.crawler.getIndex(latestUser);
|
| 114 |
if (!uid) {
|
| 115 |
index.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';
|
| 116 |
}
|
| 117 |
-
if (!ctx.
|
| 118 |
|
| 119 |
return index;
|
| 120 |
}
|
|
@@ -189,7 +182,7 @@ export class SearcherHost extends RPCHost {
|
|
| 189 |
chargeAmount = 10000;
|
| 190 |
}
|
| 191 |
this.assignTokenUsage(lastScrapped, chargeAmount, crawlWithoutContent);
|
| 192 |
-
if ((!ctx.
|
| 193 |
return lastScrapped;
|
| 194 |
}
|
| 195 |
return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
|
|
@@ -201,7 +194,7 @@ export class SearcherHost extends RPCHost {
|
|
| 201 |
withFavicon
|
| 202 |
);
|
| 203 |
|
| 204 |
-
if (!ctx.
|
| 205 |
const sseStream = new OutputServerEventStream();
|
| 206 |
rpcReflect.return(sseStream);
|
| 207 |
|
|
@@ -210,6 +203,9 @@ export class SearcherHost extends RPCHost {
|
|
| 210 |
if (!scrapped) {
|
| 211 |
continue;
|
| 212 |
}
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
chargeAmount = this.assignChargeAmount(scrapped);
|
| 215 |
sseStream.write({
|
|
@@ -233,7 +229,7 @@ export class SearcherHost extends RPCHost {
|
|
| 233 |
}
|
| 234 |
|
| 235 |
let earlyReturn = false;
|
| 236 |
-
if (!ctx.
|
| 237 |
let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
|
| 238 |
const setEarlyReturnTimer = () => {
|
| 239 |
if (earlyReturnTimer) {
|
|
@@ -251,6 +247,9 @@ export class SearcherHost extends RPCHost {
|
|
| 251 |
|
| 252 |
for await (const scrapped of it) {
|
| 253 |
lastScrapped = scrapped;
|
|
|
|
|
|
|
|
|
|
| 254 |
if (_.some(scrapped, (x) => this.pageQualified(x))) {
|
| 255 |
setEarlyReturnTimer();
|
| 256 |
}
|
|
@@ -299,7 +298,9 @@ export class SearcherHost extends RPCHost {
|
|
| 299 |
|
| 300 |
for await (const scrapped of it) {
|
| 301 |
lastScrapped = scrapped;
|
| 302 |
-
|
|
|
|
|
|
|
| 303 |
if (_.some(scrapped, (x) => this.pageQualified(x))) {
|
| 304 |
setEarlyReturnTimer();
|
| 305 |
}
|
|
@@ -367,8 +368,8 @@ export class SearcherHost extends RPCHost {
|
|
| 367 |
const dataItems = [
|
| 368 |
{ key: 'title', label: 'Title' },
|
| 369 |
{ key: 'url', label: 'URL Source' },
|
| 370 |
-
{ key: 'description', label: 'Description'},
|
| 371 |
-
]
|
| 372 |
|
| 373 |
if (withContent) {
|
| 374 |
result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : '';
|
|
@@ -386,7 +387,7 @@ export class SearcherHost extends RPCHost {
|
|
| 386 |
result.toString = function () {
|
| 387 |
const self = this as any;
|
| 388 |
return dataItems.map((x) => `[${index + 1}] ${x.label}: ${self[x.key]}`).join('\n') + '\n';
|
| 389 |
-
}
|
| 390 |
return result;
|
| 391 |
}));
|
| 392 |
|
|
@@ -408,7 +409,6 @@ export class SearcherHost extends RPCHost {
|
|
| 408 |
if (!searchResults) {
|
| 409 |
return;
|
| 410 |
}
|
| 411 |
-
|
| 412 |
const urls = searchResults.map((x) => new URL(x.link));
|
| 413 |
const snapshotMap = new WeakMap();
|
| 414 |
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
|
|
@@ -427,7 +427,7 @@ export class SearcherHost extends RPCHost {
|
|
| 427 |
if (snapshotMap.has(x)) {
|
| 428 |
return snapshotMap.get(x);
|
| 429 |
}
|
| 430 |
-
return this.
|
| 431 |
r.title ??= upstreamSearchResult.title;
|
| 432 |
r.description = upstreamSearchResult.snippet;
|
| 433 |
snapshotMap.set(x, r);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import { singleton } from 'tsyringe';
|
| 2 |
+
import {
|
| 3 |
+
assignTransferProtocolMeta, RPCHost, RPCReflection, AssertionFailureError, assignMeta, RawString,
|
| 4 |
+
} from 'civkit/civ-rpc';
|
| 5 |
+
import { marshalErrorLike } from 'civkit/lang';
|
| 6 |
+
import { objHashMd5B64Of } from 'civkit/hash';
|
| 7 |
import _ from 'lodash';
|
| 8 |
+
|
| 9 |
+
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
| 10 |
+
|
| 11 |
import { CrawlerHost, ExtraScrappingOptions } from './crawler';
|
| 12 |
import { SerperSearchResult } from '../db/searched';
|
| 13 |
+
import { CrawlerOptions } from '../dto/crawler-options';
|
| 14 |
import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
|
| 15 |
import { GoogleSearchExplicitOperatorsDto, SerperSearchService } from '../services/serper-search';
|
| 16 |
+
|
| 17 |
+
import { GlobalLogger } from '../services/logger';
|
| 18 |
+
import { AsyncLocalContext } from '../services/async-context';
|
| 19 |
+
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
|
| 20 |
+
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
| 21 |
+
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
| 22 |
+
import { InsufficientBalanceError } from '../services/errors';
|
| 23 |
import { SerperSearchQueryParams, SerperSearchResponse, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
|
| 24 |
|
| 25 |
const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES);
|
|
|
|
| 37 |
targetResultCount = 5;
|
| 38 |
|
| 39 |
constructor(
|
| 40 |
+
protected globalLogger: GlobalLogger,
|
| 41 |
protected rateLimitControl: RateLimitControl,
|
| 42 |
+
protected threadLocal: AsyncLocalContext,
|
| 43 |
protected serperSearchService: SerperSearchService,
|
| 44 |
protected crawler: CrawlerHost,
|
| 45 |
protected snapshotFormatter: SnapshotFormatter,
|
|
|
|
| 53 |
this.emit('ready');
|
| 54 |
}
|
| 55 |
|
| 56 |
+
@Method({
|
| 57 |
+
name: 'searchIndex',
|
| 58 |
+
ext: {
|
| 59 |
+
http: {
|
| 60 |
+
action: ['get', 'post'],
|
| 61 |
+
path: '/search'
|
| 62 |
+
}
|
| 63 |
},
|
| 64 |
+
tags: ['search'],
|
|
|
|
| 65 |
returnType: [String, OutputServerEventStream],
|
|
|
|
| 66 |
})
|
| 67 |
+
@Method({
|
| 68 |
+
ext: {
|
| 69 |
+
http: {
|
| 70 |
+
action: ['get', 'post'],
|
| 71 |
+
path: '::q'
|
| 72 |
+
}
|
|
|
|
|
|
|
| 73 |
},
|
| 74 |
+
tags: ['search'],
|
| 75 |
+
returnType: [String, OutputServerEventStream, RawString],
|
|
|
|
|
|
|
| 76 |
})
|
| 77 |
async search(
|
| 78 |
@RPCReflect() rpcReflect: RPCReflection,
|
| 79 |
+
@Ctx() ctx: Context,
|
|
|
|
|
|
|
|
|
|
| 80 |
auth: JinaEmbeddingsAuthDTO,
|
| 81 |
crawlerOptions: CrawlerOptions,
|
| 82 |
searchExplicitOperators: GoogleSearchExplicitOperatorsDto,
|
|
|
|
| 97 |
|
| 98 |
const uid = await auth.solveUID();
|
| 99 |
// Return content by default
|
| 100 |
+
const crawlWithoutContent = crawlerOptions.respondWith.includes('no-content');
|
| 101 |
+
const withFavicon = Boolean(ctx.get('X-With-Favicons'));
|
|
|
|
| 102 |
|
| 103 |
let chargeAmount = 0;
|
| 104 |
+
const noSlashPath = decodeURIComponent(ctx.path).slice(1);
|
| 105 |
if (!noSlashPath && !q) {
|
| 106 |
+
const index = await this.crawler.getIndex(auth);
|
|
|
|
| 107 |
if (!uid) {
|
| 108 |
index.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';
|
| 109 |
}
|
| 110 |
+
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
|
| 111 |
|
| 112 |
return index;
|
| 113 |
}
|
|
|
|
| 182 |
chargeAmount = 10000;
|
| 183 |
}
|
| 184 |
this.assignTokenUsage(lastScrapped, chargeAmount, crawlWithoutContent);
|
| 185 |
+
if ((!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) || count === 0) {
|
| 186 |
return lastScrapped;
|
| 187 |
}
|
| 188 |
return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
|
|
|
|
| 194 |
withFavicon
|
| 195 |
);
|
| 196 |
|
| 197 |
+
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
|
| 198 |
const sseStream = new OutputServerEventStream();
|
| 199 |
rpcReflect.return(sseStream);
|
| 200 |
|
|
|
|
| 203 |
if (!scrapped) {
|
| 204 |
continue;
|
| 205 |
}
|
| 206 |
+
if (rpcReflect.signal.aborted) {
|
| 207 |
+
break;
|
| 208 |
+
}
|
| 209 |
|
| 210 |
chargeAmount = this.assignChargeAmount(scrapped);
|
| 211 |
sseStream.write({
|
|
|
|
| 229 |
}
|
| 230 |
|
| 231 |
let earlyReturn = false;
|
| 232 |
+
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
|
| 233 |
let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
|
| 234 |
const setEarlyReturnTimer = () => {
|
| 235 |
if (earlyReturnTimer) {
|
|
|
|
| 247 |
|
| 248 |
for await (const scrapped of it) {
|
| 249 |
lastScrapped = scrapped;
|
| 250 |
+
if (rpcReflect.signal.aborted) {
|
| 251 |
+
break;
|
| 252 |
+
}
|
| 253 |
if (_.some(scrapped, (x) => this.pageQualified(x))) {
|
| 254 |
setEarlyReturnTimer();
|
| 255 |
}
|
|
|
|
| 298 |
|
| 299 |
for await (const scrapped of it) {
|
| 300 |
lastScrapped = scrapped;
|
| 301 |
+
if (rpcReflect.signal.aborted) {
|
| 302 |
+
break;
|
| 303 |
+
}
|
| 304 |
if (_.some(scrapped, (x) => this.pageQualified(x))) {
|
| 305 |
setEarlyReturnTimer();
|
| 306 |
}
|
|
|
|
| 368 |
const dataItems = [
|
| 369 |
{ key: 'title', label: 'Title' },
|
| 370 |
{ key: 'url', label: 'URL Source' },
|
| 371 |
+
{ key: 'description', label: 'Description' },
|
| 372 |
+
];
|
| 373 |
|
| 374 |
if (withContent) {
|
| 375 |
result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : '';
|
|
|
|
| 387 |
result.toString = function () {
|
| 388 |
const self = this as any;
|
| 389 |
return dataItems.map((x) => `[${index + 1}] ${x.label}: ${self[x.key]}`).join('\n') + '\n';
|
| 390 |
+
};
|
| 391 |
return result;
|
| 392 |
}));
|
| 393 |
|
|
|
|
| 409 |
if (!searchResults) {
|
| 410 |
return;
|
| 411 |
}
|
|
|
|
| 412 |
const urls = searchResults.map((x) => new URL(x.link));
|
| 413 |
const snapshotMap = new WeakMap();
|
| 414 |
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
|
|
|
|
| 427 |
if (snapshotMap.has(x)) {
|
| 428 |
return snapshotMap.get(x);
|
| 429 |
}
|
| 430 |
+
return this.crawler.formatSnapshotWithPDFSideLoad(mode, x, urls[i], undefined, options).then((r) => {
|
| 431 |
r.title ??= upstreamSearchResult.title;
|
| 432 |
r.description = upstreamSearchResult.snippet;
|
| 433 |
snapshotMap.set(x, r);
|
|
@@ -1,22 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import {
|
| 2 |
-
assignTransferProtocolMeta,
|
| 3 |
-
RPCHost, RPCReflection,
|
| 4 |
AssertionFailureError,
|
| 5 |
-
|
| 6 |
-
} from 'civkit';
|
| 7 |
-
import {
|
| 8 |
-
import {
|
|
|
|
| 9 |
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
| 10 |
-
import
|
| 11 |
-
import { Request, Response } from 'express';
|
| 12 |
-
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
| 13 |
-
import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search';
|
| 14 |
-
import { CrawlerHost, ExtraScrappingOptions } from './crawler';
|
| 15 |
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
|
|
|
|
|
|
| 16 |
import { SearchResult } from '../db/searched';
|
| 17 |
-
import {
|
| 18 |
-
import { CrawlerOptions } from '../dto/
|
|
|
|
|
|
|
| 19 |
import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
@singleton()
|
|
@@ -32,9 +40,9 @@ export class SearcherHost extends RPCHost {
|
|
| 32 |
targetResultCount = 5;
|
| 33 |
|
| 34 |
constructor(
|
| 35 |
-
protected globalLogger:
|
| 36 |
protected rateLimitControl: RateLimitControl,
|
| 37 |
-
protected threadLocal:
|
| 38 |
protected braveSearchService: BraveSearchService,
|
| 39 |
protected crawler: CrawlerHost,
|
| 40 |
protected snapshotFormatter: SnapshotFormatter,
|
|
@@ -48,39 +56,30 @@ export class SearcherHost extends RPCHost {
|
|
| 48 |
this.emit('ready');
|
| 49 |
}
|
| 50 |
|
| 51 |
-
@
|
| 52 |
-
name: '
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
},
|
| 59 |
-
tags: ['
|
| 60 |
-
httpMethod: ['get', 'post'],
|
| 61 |
returnType: [String, OutputServerEventStream],
|
| 62 |
-
exposeRoot: true,
|
| 63 |
})
|
| 64 |
-
@
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
maxInstances: 200,
|
| 71 |
-
minInstances: 1,
|
| 72 |
},
|
| 73 |
-
tags: ['
|
| 74 |
-
|
| 75 |
-
returnType: [String, OutputServerEventStream],
|
| 76 |
-
exposeRoot: true,
|
| 77 |
})
|
| 78 |
async search(
|
| 79 |
@RPCReflect() rpcReflect: RPCReflection,
|
| 80 |
-
@Ctx() ctx:
|
| 81 |
-
req: Request,
|
| 82 |
-
res: Response,
|
| 83 |
-
},
|
| 84 |
auth: JinaEmbeddingsAuthDTO,
|
| 85 |
@Param('count', { default: 5, validate: (v) => v >= 0 && v <= 10 })
|
| 86 |
count: number,
|
|
@@ -90,14 +89,13 @@ export class SearcherHost extends RPCHost {
|
|
| 90 |
) {
|
| 91 |
const uid = await auth.solveUID();
|
| 92 |
let chargeAmount = 0;
|
| 93 |
-
const noSlashPath = decodeURIComponent(ctx.
|
| 94 |
if (!noSlashPath && !q) {
|
| 95 |
-
const
|
| 96 |
-
const index = this.crawler.getIndex(latestUser);
|
| 97 |
if (!uid) {
|
| 98 |
index.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';
|
| 99 |
}
|
| 100 |
-
if (!ctx.
|
| 101 |
|
| 102 |
return index;
|
| 103 |
}
|
|
@@ -160,7 +158,7 @@ export class SearcherHost extends RPCHost {
|
|
| 160 |
count,
|
| 161 |
);
|
| 162 |
|
| 163 |
-
if (!ctx.
|
| 164 |
const sseStream = new OutputServerEventStream();
|
| 165 |
rpcReflect.return(sseStream);
|
| 166 |
|
|
@@ -193,7 +191,7 @@ export class SearcherHost extends RPCHost {
|
|
| 193 |
|
| 194 |
let lastScrapped: any[] | undefined;
|
| 195 |
let earlyReturn = false;
|
| 196 |
-
if (!ctx.
|
| 197 |
let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
|
| 198 |
const setEarlyReturnTimer = () => {
|
| 199 |
if (earlyReturnTimer) {
|
|
|
|
| 1 |
+
import { singleton } from 'tsyringe';
|
| 2 |
+
import _ from 'lodash';
|
| 3 |
+
|
| 4 |
import {
|
| 5 |
+
assignTransferProtocolMeta, RPCHost, RPCReflection,
|
|
|
|
| 6 |
AssertionFailureError,
|
| 7 |
+
RawString,
|
| 8 |
+
} from 'civkit/civ-rpc';
|
| 9 |
+
import { marshalErrorLike } from 'civkit/lang';
|
| 10 |
+
import { objHashMd5B64Of } from 'civkit/hash';
|
| 11 |
+
|
| 12 |
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
| 13 |
+
import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
| 15 |
+
|
| 16 |
+
import { CrawlerHost, ExtraScrappingOptions } from './crawler';
|
| 17 |
import { SearchResult } from '../db/searched';
|
| 18 |
+
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
| 19 |
+
import { CrawlerOptions } from '../dto/crawler-options';
|
| 20 |
+
import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search';
|
| 21 |
+
|
| 22 |
import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
|
| 23 |
+
import { GlobalLogger } from '../services/logger';
|
| 24 |
+
import { AsyncLocalContext } from '../services/async-context';
|
| 25 |
+
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
| 26 |
+
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
|
| 27 |
+
import { InsufficientBalanceError } from '../services/errors';
|
| 28 |
|
| 29 |
|
| 30 |
@singleton()
|
|
|
|
| 40 |
targetResultCount = 5;
|
| 41 |
|
| 42 |
constructor(
|
| 43 |
+
protected globalLogger: GlobalLogger,
|
| 44 |
protected rateLimitControl: RateLimitControl,
|
| 45 |
+
protected threadLocal: AsyncLocalContext,
|
| 46 |
protected braveSearchService: BraveSearchService,
|
| 47 |
protected crawler: CrawlerHost,
|
| 48 |
protected snapshotFormatter: SnapshotFormatter,
|
|
|
|
| 56 |
this.emit('ready');
|
| 57 |
}
|
| 58 |
|
| 59 |
+
@Method({
|
| 60 |
+
name: 'searchIndex',
|
| 61 |
+
ext: {
|
| 62 |
+
http: {
|
| 63 |
+
action: ['get', 'post'],
|
| 64 |
+
path: '/search'
|
| 65 |
+
}
|
| 66 |
},
|
| 67 |
+
tags: ['search'],
|
|
|
|
| 68 |
returnType: [String, OutputServerEventStream],
|
|
|
|
| 69 |
})
|
| 70 |
+
@Method({
|
| 71 |
+
ext: {
|
| 72 |
+
http: {
|
| 73 |
+
action: ['get', 'post'],
|
| 74 |
+
path: '::q'
|
| 75 |
+
}
|
|
|
|
|
|
|
| 76 |
},
|
| 77 |
+
tags: ['search'],
|
| 78 |
+
returnType: [String, OutputServerEventStream, RawString],
|
|
|
|
|
|
|
| 79 |
})
|
| 80 |
async search(
|
| 81 |
@RPCReflect() rpcReflect: RPCReflection,
|
| 82 |
+
@Ctx() ctx: Context,
|
|
|
|
|
|
|
|
|
|
| 83 |
auth: JinaEmbeddingsAuthDTO,
|
| 84 |
@Param('count', { default: 5, validate: (v) => v >= 0 && v <= 10 })
|
| 85 |
count: number,
|
|
|
|
| 89 |
) {
|
| 90 |
const uid = await auth.solveUID();
|
| 91 |
let chargeAmount = 0;
|
| 92 |
+
const noSlashPath = decodeURIComponent(ctx.path).slice(1);
|
| 93 |
if (!noSlashPath && !q) {
|
| 94 |
+
const index = await this.crawler.getIndex(auth);
|
|
|
|
| 95 |
if (!uid) {
|
| 96 |
index.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';
|
| 97 |
}
|
| 98 |
+
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
|
| 99 |
|
| 100 |
return index;
|
| 101 |
}
|
|
|
|
| 158 |
count,
|
| 159 |
);
|
| 160 |
|
| 161 |
+
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
|
| 162 |
const sseStream = new OutputServerEventStream();
|
| 163 |
rpcReflect.return(sseStream);
|
| 164 |
|
|
|
|
| 191 |
|
| 192 |
let lastScrapped: any[] | undefined;
|
| 193 |
let earlyReturn = false;
|
| 194 |
+
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
|
| 195 |
let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
|
| 196 |
const setEarlyReturnTimer = () => {
|
| 197 |
if (earlyReturnTimer) {
|
|
@@ -14,7 +14,7 @@ import robotsParser from 'robots-parser';
|
|
| 14 |
import { DOMParser } from '@xmldom/xmldom';
|
| 15 |
|
| 16 |
import { AdaptiveCrawlerOptions } from '../dto/adaptive-crawler-options';
|
| 17 |
-
import { CrawlerOptions } from '../dto/
|
| 18 |
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
| 19 |
import { AdaptiveCrawlTask, AdaptiveCrawlTaskStatus } from '../db/adaptive-crawl-task';
|
| 20 |
import { getFunctions } from 'firebase-admin/functions';
|
|
|
|
| 14 |
import { DOMParser } from '@xmldom/xmldom';
|
| 15 |
|
| 16 |
import { AdaptiveCrawlerOptions } from '../dto/adaptive-crawler-options';
|
| 17 |
+
import { CrawlerOptions } from '../dto/crawler-options';
|
| 18 |
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
| 19 |
import { AdaptiveCrawlTask, AdaptiveCrawlTaskStatus } from '../db/adaptive-crawl-task';
|
| 20 |
import { getFunctions } from 'firebase-admin/functions';
|
|
@@ -9,7 +9,7 @@ import {
|
|
| 9 |
FirebaseStorageBucketControl, Logger, Param, TempFileManager
|
| 10 |
} from '../shared';
|
| 11 |
import _ from 'lodash';
|
| 12 |
-
import { CrawlerHost } from './crawler';
|
| 13 |
|
| 14 |
import { Crawled } from '../db/crawled';
|
| 15 |
import dayjs from 'dayjs';
|
|
|
|
| 9 |
FirebaseStorageBucketControl, Logger, Param, TempFileManager
|
| 10 |
} from '../shared';
|
| 11 |
import _ from 'lodash';
|
| 12 |
+
import { CrawlerHost } from '../api/crawler';
|
| 13 |
|
| 14 |
import { Crawled } from '../db/crawled';
|
| 15 |
import dayjs from 'dayjs';
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import { Also, Prop } from 'civkit';
|
| 2 |
import { FirestoreRecord } from '../shared/lib/firestore';
|
| 3 |
-
import { ENGINE_TYPE } from '../dto/
|
| 4 |
|
| 5 |
@Also({
|
| 6 |
dictOf: Object
|
|
|
|
| 1 |
import { Also, Prop } from 'civkit';
|
| 2 |
import { FirestoreRecord } from '../shared/lib/firestore';
|
| 3 |
+
import { ENGINE_TYPE } from '../dto/crawler-options';
|
| 4 |
|
| 5 |
@Also({
|
| 6 |
dictOf: Object
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
|
| 2 |
-
import type { Request, Response } from 'express';
|
| 3 |
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
|
|
|
| 4 |
|
| 5 |
export enum CONTENT_FORMAT {
|
| 6 |
CONTENT = 'content',
|
|
@@ -19,6 +19,7 @@ export enum ENGINE_TYPE {
|
|
| 19 |
DIRECT = 'direct',
|
| 20 |
VLM = 'vlm',
|
| 21 |
READER_LM = 'readerlm-v2',
|
|
|
|
| 22 |
}
|
| 23 |
|
| 24 |
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
|
@@ -125,6 +126,11 @@ class Viewport extends AutoCastable {
|
|
| 125 |
in: 'header',
|
| 126 |
schema: { type: 'string' }
|
| 127 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
'X-Set-Cookie': {
|
| 129 |
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
|
| 130 |
`Syntax is the same with standard Set-Cookie`,
|
|
@@ -297,6 +303,9 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 297 |
@Prop()
|
| 298 |
proxyUrl?: string;
|
| 299 |
|
|
|
|
|
|
|
|
|
|
| 300 |
@Prop()
|
| 301 |
userAgent?: string;
|
| 302 |
|
|
@@ -338,15 +347,18 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 338 |
@Prop()
|
| 339 |
jsonSchema?: object;
|
| 340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
static override from(input: any) {
|
| 342 |
const instance = super.from(input) as CrawlerOptions;
|
| 343 |
-
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as
|
| 344 |
-
req: Request,
|
| 345 |
-
res: Response,
|
| 346 |
-
} | undefined;
|
| 347 |
|
| 348 |
-
const customMode = ctx?.
|
| 349 |
-
if (customMode
|
| 350 |
instance.respondWith = customMode;
|
| 351 |
}
|
| 352 |
if (instance.respondWith) {
|
|
@@ -361,74 +373,74 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 361 |
}
|
| 362 |
}
|
| 363 |
|
| 364 |
-
const locale = ctx?.
|
| 365 |
-
if (locale
|
| 366 |
instance.locale = locale;
|
| 367 |
}
|
| 368 |
|
| 369 |
-
const referer = ctx?.
|
| 370 |
-
if (referer
|
| 371 |
instance.referer = referer;
|
| 372 |
}
|
| 373 |
|
| 374 |
-
const withGeneratedAlt = ctx?.
|
| 375 |
-
if (withGeneratedAlt
|
| 376 |
instance.withGeneratedAlt = Boolean(withGeneratedAlt);
|
| 377 |
}
|
| 378 |
-
const withLinksSummary = ctx?.
|
| 379 |
-
if (withLinksSummary
|
| 380 |
if (withLinksSummary === 'all') {
|
| 381 |
instance.withLinksSummary = withLinksSummary;
|
| 382 |
} else {
|
| 383 |
instance.withLinksSummary = Boolean(withLinksSummary);
|
| 384 |
}
|
| 385 |
}
|
| 386 |
-
const withImagesSummary = ctx?.
|
| 387 |
-
if (withImagesSummary
|
| 388 |
instance.withImagesSummary = Boolean(withImagesSummary);
|
| 389 |
}
|
| 390 |
-
const retainImages = ctx?.
|
| 391 |
if (retainImages && IMAGE_RETENTION_MODE_VALUES.has(retainImages)) {
|
| 392 |
instance.retainImages = retainImages as any;
|
| 393 |
}
|
| 394 |
if (instance.withGeneratedAlt) {
|
| 395 |
instance.retainImages = 'all_p';
|
| 396 |
}
|
| 397 |
-
const noCache = ctx?.
|
| 398 |
-
if (noCache
|
| 399 |
instance.noCache = Boolean(noCache);
|
| 400 |
}
|
| 401 |
if (instance.noCache && instance.cacheTolerance === undefined) {
|
| 402 |
instance.cacheTolerance = 0;
|
| 403 |
}
|
| 404 |
-
let cacheTolerance = parseInt(ctx?.
|
| 405 |
if (!isNaN(cacheTolerance)) {
|
| 406 |
instance.cacheTolerance = cacheTolerance;
|
| 407 |
}
|
| 408 |
|
| 409 |
-
const noGfm = ctx?.
|
| 410 |
if (noGfm) {
|
| 411 |
instance.noGfm = noGfm === 'table' ? noGfm : Boolean(noGfm);
|
| 412 |
}
|
| 413 |
|
| 414 |
-
let timeoutSeconds = parseInt(ctx?.
|
| 415 |
if (!isNaN(timeoutSeconds) && timeoutSeconds > 0) {
|
| 416 |
instance.timeout = timeoutSeconds <= 180 ? timeoutSeconds : 180;
|
| 417 |
-
} else if (ctx?.
|
| 418 |
instance.timeout = null;
|
| 419 |
}
|
| 420 |
|
| 421 |
-
const removeSelector = ctx?.
|
| 422 |
-
instance.removeSelector ??= removeSelector;
|
| 423 |
-
const targetSelector = ctx?.
|
| 424 |
-
instance.targetSelector ??= targetSelector;
|
| 425 |
-
const waitForSelector = ctx?.
|
| 426 |
-
instance.waitForSelector ??= waitForSelector || instance.targetSelector;
|
| 427 |
instance.targetSelector = filterSelector(instance.targetSelector);
|
| 428 |
-
const overrideUserAgent = ctx?.
|
| 429 |
instance.userAgent ??= overrideUserAgent;
|
| 430 |
|
| 431 |
-
const engine = ctx?.
|
| 432 |
if (engine) {
|
| 433 |
instance.engine = engine;
|
| 434 |
}
|
|
@@ -443,18 +455,18 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 443 |
instance.respondWith = CONTENT_FORMAT.READER_LM;
|
| 444 |
}
|
| 445 |
|
| 446 |
-
const keepImgDataUrl = ctx?.
|
| 447 |
-
if (keepImgDataUrl
|
| 448 |
instance.keepImgDataUrl = Boolean(keepImgDataUrl);
|
| 449 |
}
|
| 450 |
-
const withIframe = ctx?.
|
| 451 |
-
if (withIframe
|
| 452 |
instance.withIframe = withIframe.toLowerCase() === 'quoted' ? 'quoted' : Boolean(withIframe);
|
| 453 |
}
|
| 454 |
if (instance.withIframe) {
|
| 455 |
instance.timeout ??= null;
|
| 456 |
}
|
| 457 |
-
const withShadowDom = ctx?.
|
| 458 |
if (withShadowDom) {
|
| 459 |
instance.withShadowDom = Boolean(withShadowDom);
|
| 460 |
}
|
|
@@ -463,7 +475,7 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 463 |
}
|
| 464 |
|
| 465 |
const cookies: Cookie[] = [];
|
| 466 |
-
const setCookieHeaders = ctx?.
|
| 467 |
if (Array.isArray(setCookieHeaders)) {
|
| 468 |
for (const setCookie of setCookieHeaders) {
|
| 469 |
cookies.push({
|
|
@@ -477,21 +489,24 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 477 |
}
|
| 478 |
instance.setCookies = cookies;
|
| 479 |
|
| 480 |
-
const proxyUrl = ctx?.
|
| 481 |
-
instance.proxyUrl ??= proxyUrl;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
|
| 483 |
-
|
| 484 |
-
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
| 485 |
-
}
|
| 486 |
-
|
| 487 |
-
const tokenBudget = ctx?.req.get('x-token-budget') || undefined;
|
| 488 |
instance.tokenBudget ??= parseInt(tokenBudget || '') || undefined;
|
| 489 |
|
| 490 |
-
const baseMode = ctx?.
|
| 491 |
if (baseMode) {
|
| 492 |
instance.base = baseMode as any;
|
| 493 |
}
|
| 494 |
|
|
|
|
|
|
|
|
|
|
| 495 |
if (instance.cacheTolerance) {
|
| 496 |
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
| 497 |
}
|
|
|
|
| 1 |
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
|
|
|
|
| 2 |
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
| 3 |
+
import { Context } from '../services/registry';
|
| 4 |
|
| 5 |
export enum CONTENT_FORMAT {
|
| 6 |
CONTENT = 'content',
|
|
|
|
| 19 |
DIRECT = 'direct',
|
| 20 |
VLM = 'vlm',
|
| 21 |
READER_LM = 'readerlm-v2',
|
| 22 |
+
CF_BROWSER_RENDERING = 'cf-browser-rendering',
|
| 23 |
}
|
| 24 |
|
| 25 |
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
|
|
|
| 126 |
in: 'header',
|
| 127 |
schema: { type: 'string' }
|
| 128 |
},
|
| 129 |
+
'X-Proxy': {
|
| 130 |
+
description: `Use a proxy server provided by Jina AI.\n\nOptionally specify two-letter country code.`,
|
| 131 |
+
in: 'header',
|
| 132 |
+
schema: { type: 'string' }
|
| 133 |
+
},
|
| 134 |
'X-Set-Cookie': {
|
| 135 |
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
|
| 136 |
`Syntax is the same with standard Set-Cookie`,
|
|
|
|
| 303 |
@Prop()
|
| 304 |
proxyUrl?: string;
|
| 305 |
|
| 306 |
+
@Prop()
|
| 307 |
+
proxy?: string;
|
| 308 |
+
|
| 309 |
@Prop()
|
| 310 |
userAgent?: string;
|
| 311 |
|
|
|
|
| 347 |
@Prop()
|
| 348 |
jsonSchema?: object;
|
| 349 |
|
| 350 |
+
@Prop()
|
| 351 |
+
robotsTxt?: string;
|
| 352 |
+
|
| 353 |
+
@Prop()
|
| 354 |
+
doNotTrack?: number | null;
|
| 355 |
+
|
| 356 |
static override from(input: any) {
|
| 357 |
const instance = super.from(input) as CrawlerOptions;
|
| 358 |
+
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
|
|
|
|
|
|
|
|
|
|
| 359 |
|
| 360 |
+
const customMode = ctx?.get('x-respond-with') || ctx?.get('x-return-format');
|
| 361 |
+
if (customMode) {
|
| 362 |
instance.respondWith = customMode;
|
| 363 |
}
|
| 364 |
if (instance.respondWith) {
|
|
|
|
| 373 |
}
|
| 374 |
}
|
| 375 |
|
| 376 |
+
const locale = ctx?.get('x-locale');
|
| 377 |
+
if (locale) {
|
| 378 |
instance.locale = locale;
|
| 379 |
}
|
| 380 |
|
| 381 |
+
const referer = ctx?.get('x-referer');
|
| 382 |
+
if (referer) {
|
| 383 |
instance.referer = referer;
|
| 384 |
}
|
| 385 |
|
| 386 |
+
const withGeneratedAlt = ctx?.get('x-with-generated-alt');
|
| 387 |
+
if (withGeneratedAlt) {
|
| 388 |
instance.withGeneratedAlt = Boolean(withGeneratedAlt);
|
| 389 |
}
|
| 390 |
+
const withLinksSummary = ctx?.get('x-with-links-summary');
|
| 391 |
+
if (withLinksSummary) {
|
| 392 |
if (withLinksSummary === 'all') {
|
| 393 |
instance.withLinksSummary = withLinksSummary;
|
| 394 |
} else {
|
| 395 |
instance.withLinksSummary = Boolean(withLinksSummary);
|
| 396 |
}
|
| 397 |
}
|
| 398 |
+
const withImagesSummary = ctx?.get('x-with-images-summary');
|
| 399 |
+
if (withImagesSummary) {
|
| 400 |
instance.withImagesSummary = Boolean(withImagesSummary);
|
| 401 |
}
|
| 402 |
+
const retainImages = ctx?.get('x-retain-images');
|
| 403 |
if (retainImages && IMAGE_RETENTION_MODE_VALUES.has(retainImages)) {
|
| 404 |
instance.retainImages = retainImages as any;
|
| 405 |
}
|
| 406 |
if (instance.withGeneratedAlt) {
|
| 407 |
instance.retainImages = 'all_p';
|
| 408 |
}
|
| 409 |
+
const noCache = ctx?.get('x-no-cache');
|
| 410 |
+
if (noCache) {
|
| 411 |
instance.noCache = Boolean(noCache);
|
| 412 |
}
|
| 413 |
if (instance.noCache && instance.cacheTolerance === undefined) {
|
| 414 |
instance.cacheTolerance = 0;
|
| 415 |
}
|
| 416 |
+
let cacheTolerance = parseInt(ctx?.get('x-cache-tolerance') || '');
|
| 417 |
if (!isNaN(cacheTolerance)) {
|
| 418 |
instance.cacheTolerance = cacheTolerance;
|
| 419 |
}
|
| 420 |
|
| 421 |
+
const noGfm = ctx?.get('x-no-gfm');
|
| 422 |
if (noGfm) {
|
| 423 |
instance.noGfm = noGfm === 'table' ? noGfm : Boolean(noGfm);
|
| 424 |
}
|
| 425 |
|
| 426 |
+
let timeoutSeconds = parseInt(ctx?.get('x-timeout') || '');
|
| 427 |
if (!isNaN(timeoutSeconds) && timeoutSeconds > 0) {
|
| 428 |
instance.timeout = timeoutSeconds <= 180 ? timeoutSeconds : 180;
|
| 429 |
+
} else if (ctx?.get('x-timeout')) {
|
| 430 |
instance.timeout = null;
|
| 431 |
}
|
| 432 |
|
| 433 |
+
const removeSelector = ctx?.get('x-remove-selector')?.split(', ').filter(Boolean);
|
| 434 |
+
instance.removeSelector ??= removeSelector?.length ? removeSelector : undefined;
|
| 435 |
+
const targetSelector = ctx?.get('x-target-selector')?.split(', ').filter(Boolean);
|
| 436 |
+
instance.targetSelector ??= targetSelector?.length ? targetSelector : undefined;
|
| 437 |
+
const waitForSelector = ctx?.get('x-wait-for-selector')?.split(', ').filter(Boolean);
|
| 438 |
+
instance.waitForSelector ??= (waitForSelector?.length ? waitForSelector : undefined) || instance.targetSelector;
|
| 439 |
instance.targetSelector = filterSelector(instance.targetSelector);
|
| 440 |
+
const overrideUserAgent = ctx?.get('x-user-agent') || undefined;
|
| 441 |
instance.userAgent ??= overrideUserAgent;
|
| 442 |
|
| 443 |
+
const engine = ctx?.get('x-engine');
|
| 444 |
if (engine) {
|
| 445 |
instance.engine = engine;
|
| 446 |
}
|
|
|
|
| 455 |
instance.respondWith = CONTENT_FORMAT.READER_LM;
|
| 456 |
}
|
| 457 |
|
| 458 |
+
const keepImgDataUrl = ctx?.get('x-keep-img-data-url');
|
| 459 |
+
if (keepImgDataUrl) {
|
| 460 |
instance.keepImgDataUrl = Boolean(keepImgDataUrl);
|
| 461 |
}
|
| 462 |
+
const withIframe = ctx?.get('x-with-iframe');
|
| 463 |
+
if (withIframe) {
|
| 464 |
instance.withIframe = withIframe.toLowerCase() === 'quoted' ? 'quoted' : Boolean(withIframe);
|
| 465 |
}
|
| 466 |
if (instance.withIframe) {
|
| 467 |
instance.timeout ??= null;
|
| 468 |
}
|
| 469 |
+
const withShadowDom = ctx?.get('x-with-shadow-dom');
|
| 470 |
if (withShadowDom) {
|
| 471 |
instance.withShadowDom = Boolean(withShadowDom);
|
| 472 |
}
|
|
|
|
| 475 |
}
|
| 476 |
|
| 477 |
const cookies: Cookie[] = [];
|
| 478 |
+
const setCookieHeaders = (ctx?.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[])).filter(Boolean);
|
| 479 |
if (Array.isArray(setCookieHeaders)) {
|
| 480 |
for (const setCookie of setCookieHeaders) {
|
| 481 |
cookies.push({
|
|
|
|
| 489 |
}
|
| 490 |
instance.setCookies = cookies;
|
| 491 |
|
| 492 |
+
const proxyUrl = ctx?.get('x-proxy-url');
|
| 493 |
+
instance.proxyUrl ??= proxyUrl || undefined;
|
| 494 |
+
const proxy = ctx?.get('x-proxy');
|
| 495 |
+
instance.proxy ??= proxy || undefined;
|
| 496 |
+
const robotsTxt = ctx?.get('x-robots-txt');
|
| 497 |
+
instance.robotsTxt ??= robotsTxt || undefined;
|
| 498 |
|
| 499 |
+
const tokenBudget = ctx?.get('x-token-budget');
|
|
|
|
|
|
|
|
|
|
|
|
|
| 500 |
instance.tokenBudget ??= parseInt(tokenBudget || '') || undefined;
|
| 501 |
|
| 502 |
+
const baseMode = ctx?.get('x-base');
|
| 503 |
if (baseMode) {
|
| 504 |
instance.base = baseMode as any;
|
| 505 |
}
|
| 506 |
|
| 507 |
+
const dnt = ctx?.get('dnt');
|
| 508 |
+
instance.doNotTrack ??= (parseInt(dnt || '') || null);
|
| 509 |
+
|
| 510 |
if (instance.cacheTolerance) {
|
| 511 |
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
| 512 |
}
|
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import _ from 'lodash';
|
| 2 |
+
import {
|
| 3 |
+
Also, AuthenticationFailedError, AuthenticationRequiredError,
|
| 4 |
+
DownstreamServiceFailureError, RPC_CALL_ENVIRONMENT,
|
| 5 |
+
AutoCastable,
|
| 6 |
+
} from 'civkit/civ-rpc';
|
| 7 |
+
import { htmlEscape } from 'civkit/escape';
|
| 8 |
+
import { marshalErrorLike } from 'civkit/lang';
|
| 9 |
+
|
| 10 |
+
import type { Context } from 'koa';
|
| 11 |
+
|
| 12 |
+
import logger from '../services/logger';
|
| 13 |
+
import { InjectProperty } from '../services/registry';
|
| 14 |
+
import { AsyncLocalContext } from '../services/async-context';
|
| 15 |
+
|
| 16 |
+
import envConfig from '../shared/services/secrets';
|
| 17 |
+
import { JinaEmbeddingsDashboardHTTP } from '../shared/3rd-party/jina-embeddings';
|
| 18 |
+
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
const authDtoLogger = logger.child({ service: 'JinaAuthDTO' });
|
| 22 |
+
|
| 23 |
+
const THE_VERY_SAME_JINA_EMBEDDINGS_CLIENT = new JinaEmbeddingsDashboardHTTP(envConfig.JINA_EMBEDDINGS_DASHBOARD_API_KEY);
|
| 24 |
+
|
| 25 |
+
@Also({
    openapi: {
        operation: {
            parameters: {
                'Authorization': {
                    description: htmlEscape`Jina Token for authentication.\n\n` +
                        htmlEscape`- Member of <JinaEmbeddingsAuthDTO>\n\n` +
                        `- Authorization: Bearer {YOUR_JINA_TOKEN}`
                    ,
                    in: 'header',
                    schema: {
                        anyOf: [
                            { type: 'string', format: 'token' }
                        ]
                    }
                }
            }
        }
    }
})
/**
 * Authentication DTO that extracts a Jina token from the incoming request and
 * resolves it to a `JinaEmbeddingsTokenAccount`.
 *
 * The token comes from the `Authorization` header of the Koa context found
 * under `RPC_CALL_ENVIRONMENT`, or from the `_token` input field as a fallback.
 */
export class JinaEmbeddingsAuthDTO extends AutoCastable {
    /** `user_id` of the resolved account; set by `getBrief()`. */
    uid?: string;
    /** Token taken from the `Authorization` header or the `_token` input field. */
    bearerToken?: string;
    /** Account resolved for `bearerToken`; set by `getBrief()`. */
    user?: JinaEmbeddingsTokenAccount;

    @InjectProperty(AsyncLocalContext)
    ctxMgr!: AsyncLocalContext;

    /** Dashboard client shared by every DTO instance. */
    jinaEmbeddingsDashboard = THE_VERY_SAME_JINA_EMBEDDINGS_CLIENT;

    /**
     * Builds the DTO and fills `bearerToken`.
     *
     * @param input RPC input; may carry a Koa context under `RPC_CALL_ENVIRONMENT`
     *              and/or a `_token` field.
     */
    static override from(input: any) {
        const instance = super.from(input) as JinaEmbeddingsAuthDTO;

        const ctx = input[RPC_CALL_ENVIRONMENT] as Context;

        if (ctx) {
            const authorization = ctx.get('authorization');

            if (authorization) {
                // Split on any run of whitespace so `Bearer   <token>` still
                // yields the token rather than falling back to the whole
                // header value. A bare `<token>` (no scheme) is accepted as-is.
                const parts = authorization.trim().split(/\s+/);
                instance.bearerToken = parts[1] || parts[0];
            }

        }

        if (!instance.bearerToken && input._token) {
            instance.bearerToken = input._token;
        }

        return instance;
    }

    /**
     * Resolves the account for `bearerToken`, populating `user` and `uid`.
     *
     * A stored record younger than 180 seconds is returned directly unless
     * `ignoreCache` is truthy. Otherwise the token is validated against the
     * dashboard and the merged record is saved. If validation fails with
     * anything but a 401 and a stored record exists, that record is used.
     *
     * @param ignoreCache When truthy, always re-validate against the dashboard.
     * @throws AuthenticationRequiredError when no token is present.
     * @throws AuthenticationFailedError when the dashboard answers with status 401.
     * @throws DownstreamServiceFailureError when validation fails and no stored record exists.
     */
    async getBrief(ignoreCache?: boolean | string) {
        if (!this.bearerToken) {
            throw new AuthenticationRequiredError({
                message: 'Jina API key is required to authenticate. Please get one from https://jina.ai'
            });
        }

        let account;
        try {
            account = await JinaEmbeddingsTokenAccount.fromFirestore(this.bearerToken);
        } catch (err) {
            // FireStore would not accept any string as input and may throw if not happy with it
            void 0;
        }

        const age = account?.lastSyncedAt ? Date.now() - account.lastSyncedAt.getTime() : Infinity;

        if (account && !ignoreCache && age < 180_000) {
            this.user = account;
            this.uid = this.user?.user_id;

            return account;
        }

        try {
            const r = await this.jinaEmbeddingsDashboard.validateToken(this.bearerToken);
            const brief = r.data;
            const draftAccount = JinaEmbeddingsTokenAccount.from({
                ...account, ...brief, _id: this.bearerToken,
                lastSyncedAt: new Date()
            });
            await JinaEmbeddingsTokenAccount.save(draftAccount.degradeForFireStore(), undefined, { merge: true });

            this.user = draftAccount;
            this.uid = this.user?.user_id;

            return draftAccount;
        } catch (err: any) {
            authDtoLogger.warn(`Failed to get user brief: ${err}`, { err: marshalErrorLike(err) });

            if (err?.status === 401) {
                throw new AuthenticationFailedError({
                    message: 'Invalid API key, please get a new one from https://jina.ai'
                });
            }

            // Validation is unavailable: fall back to the (possibly stale) stored record.
            if (account) {
                this.user = account;
                this.uid = this.user?.user_id;

                return account;
            }

            throw new DownstreamServiceFailureError(`Failed to authenticate: ${err}`);
        }
    }

    /**
     * Reports token usage to the dashboard and mirrors the deduction locally.
     *
     * The in-memory balance is decremented up front and restored if reporting
     * fails. Failures are logged, not thrown; the promise then resolves to
     * `undefined`.
     *
     * @param tokenCount Number of tokens consumed.
     * @param mdl Model name to report.
     * @param endpoint API endpoint to report; defaults to `/encode`.
     */
    async reportUsage(tokenCount: number, mdl: string, endpoint: string = '/encode') {
        const user = await this.assertUser();
        const uid = user.user_id;
        user.wallet.total_balance -= tokenCount;

        return this.jinaEmbeddingsDashboard.reportUsage(this.bearerToken!, {
            model_name: mdl,
            api_endpoint: endpoint,
            consumer: {
                id: uid,
                user_id: uid,
            },
            usage: {
                total_tokens: tokenCount
            },
            labels: {
                model_name: mdl
            }
        }).then((r) => {
            // Fire-and-forget update of the stored balance; failures are only logged.
            JinaEmbeddingsTokenAccount.COLLECTION.doc(this.bearerToken!)
                .update({ 'wallet.total_balance': JinaEmbeddingsTokenAccount.OPS.increment(-tokenCount) })
                .catch((err) => {
                    authDtoLogger.warn(`Failed to update cache for ${uid}: ${err}`, { err: marshalErrorLike(err) });
                });

            return r;
        }).catch((err) => {
            user.wallet.total_balance += tokenCount;
            authDtoLogger.warn(`Failed to report usage for ${uid}: ${err}`, { err: marshalErrorLike(err) });
        });
    }

    /**
     * Returns the uid, resolving it through `getBrief()` when only a token is
     * known, and records it in the async context under `uid`.
     *
     * @returns The uid, or `undefined` when there is neither a uid nor a token.
     */
    async solveUID() {
        if (this.uid) {
            this.ctxMgr.set('uid', this.uid);

            return this.uid;
        }

        if (this.bearerToken) {
            await this.getBrief();
            this.ctxMgr.set('uid', this.uid);

            return this.uid;
        }

        return undefined;
    }

    /**
     * Like `solveUID()` but fails when no uid can be resolved.
     *
     * @throws AuthenticationRequiredError when no uid is available.
     */
    async assertUID() {
        const uid = await this.solveUID();

        if (!uid) {
            throw new AuthenticationRequiredError('Authentication failed');
        }

        return uid;
    }

    /** Returns the resolved account, calling `getBrief()` first when needed. */
    async assertUser() {
        if (this.user) {
            return this.user;
        }

        await this.getBrief();

        return this.user!;
    }

    /**
     * Collects the effective custom rate limits of the resolved user for the given tags.
     *
     * @returns The effective descriptors, or `undefined` when there are none.
     */
    getRateLimits(...tags: string[]) {
        const descs = tags.map((x) => this.user?.customRateLimits?.[x] || []).flat().filter((x) => x.isEffective());

        if (descs.length) {
            return descs;
        }

        return undefined;
    }
}
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { TPM, parseJSONText } from 'civkit';
|
| 2 |
+
import { Transform, TransformCallback, TransformOptions } from 'stream';
|
| 3 |
+
|
| 4 |
+
/**
 * Transform stream that parses a `text/event-stream` byte/string stream into
 * event objects (`{ event?, data?, id?, retry? }`) on its readable side.
 *
 * Events are delimited by a blank line. Text after the last blank line is kept
 * in `cache` until more input arrives.
 */
export class InputServerEventStream extends Transform {
    /** Text received so far that has not yet been terminated by a blank line. */
    cache: string[] = [];

    /**
     * Streaming decoder, so a multi-byte UTF-8 sequence split across two chunks
     * is decoded correctly instead of turning into replacement characters.
     */
    protected textDecoder = new TextDecoder('utf-8', { ignoreBOM: true });

    constructor(options?: TransformOptions) {
        super({
            ...options,
            readableObjectMode: true
        });
    }

    /** Parses every complete event currently buffered in `cache` and pushes it downstream. */
    decodeRoutine() {
        if (!this.cache.length) {
            return;
        }

        const vecs = this.cache.join('').split(/\r?\n\r?\n/);
        this.cache.length = 0;
        // The last piece is not yet terminated by a blank line; keep it for later.
        const lastVec = vecs.pop();
        if (lastVec) {
            this.cache.push(lastVec);
        }

        for (const x of vecs) {
            const lines: string[] = x.split(/\r?\n/);

            const event: {
                event?: string;
                data?: string;
                id?: string;
                retry?: number;
            } = {};

            for (const l of lines) {
                const columnPos = l.indexOf(':');
                // Skips comment lines (leading ':') and lines without a colon.
                if (columnPos <= 0) {
                    continue;
                }
                const key = l.substring(0, columnPos);
                const rawValue = l.substring(columnPos + 1);
                // A single leading space after the colon is not part of the value.
                const value = rawValue.startsWith(' ') ? rawValue.slice(1) : rawValue;
                if (key === 'data') {
                    // Successive data lines are joined with a line feed, which
                    // also round-trips what OutputServerEventStream emits for
                    // multi-line strings.
                    event.data = event.data === undefined ? value : `${event.data}\n${value}`;
                } else if (key === 'retry') {
                    event.retry = parseInt(value, 10);
                } else {
                    Reflect.set(event, key, value);
                }
            }

            if (event.data) {
                // Replace data with its parsed form only when it parses to an object.
                const parsed = parseJSONText(event.data);
                if (parsed && typeof parsed === 'object') {
                    event.data = parsed;
                }
            }

            if (Object.keys(event).length) {
                this.push(event);
            }
        }
    }

    override _transform(chunk: any, encoding: BufferEncoding, callback: TransformCallback): void {
        if (chunk === null) {
            this.push(null);
            callback();

            return;
        }

        let text: string;
        if (typeof chunk === 'string') {
            text = chunk;
        } else if (chunk instanceof Uint8Array) {
            text = this.textDecoder.decode(chunk, { stream: true });
        } else {
            text = chunk.toString();
        }

        this.cache.push(text);
        this.decodeRoutine();

        callback();
    }

    override _final(callback: (error?: Error | null | undefined) => void): void {
        // Flush any bytes still pending inside the streaming decoder.
        const rest = this.textDecoder.decode();
        if (rest) {
            this.cache.push(rest);
        }
        this.decodeRoutine();
        callback();
    }
}
|
| 89 |
+
|
| 90 |
+
@TPM({
|
| 91 |
+
contentType: 'text/event-stream',
|
| 92 |
+
})
|
| 93 |
+
/**
 * Transform stream that serializes event objects or plain strings written to
 * it into `text/event-stream` frames on its readable side.
 */
export class OutputServerEventStream extends Transform {
    /** Number of frames emitted so far. */
    n: number = 0;

    constructor(options?: TransformOptions) {
        super({
            ...options, writableObjectMode: true, encoding: 'utf-8'
        });
    }

    /**
     * Encodes one chunk as a single SSE frame and pushes it downstream.
     *
     * - A string becomes one `data:` line per line of text.
     * - An object contributes `event`, `data`, `id` and `retry` lines. String
     *   data is split per line; any other data is JSON-encoded.
     * - An object yielding none of those lines is JSON-encoded whole as `data`.
     *
     * `data` and `retry` are emitted whenever they are not null/undefined, so
     * values such as `0`, `false` and `''` are not silently dropped.
     */
    encodeRoutine(chunk: {
        event?: string;
        data?: any;
        id?: string;
        retry?: number;
    } | string) {
        if (typeof chunk === 'string') {
            const lines = chunk.split(/\r?\n/).map((x) => `data: ${x}`);

            this.push(lines.join('\n'));
            this.push('\n\n');
            this.n++;

            return;
        }

        if (typeof chunk !== 'object' || chunk === null) {
            return;
        }

        const lines: string[] = [];

        if (chunk.event) {
            lines.push(`event: ${chunk.event}`);
        }
        if (chunk.data !== undefined && chunk.data !== null) {
            if (typeof chunk.data === 'string') {
                for (const x of chunk.data.split(/\r?\n/)) {
                    lines.push(`data: ${x}`);
                }
            } else {
                lines.push(`data: ${JSON.stringify(chunk.data)}`);
            }
        }
        if (chunk.id) {
            lines.push(`id: ${chunk.id}`);
        }
        if (chunk.retry !== undefined && chunk.retry !== null) {
            lines.push(`retry: ${chunk.retry}`);
        }
        if (!lines.length) {
            lines.push(`data: ${JSON.stringify(chunk)}`);
        }
        this.push(lines.join('\n'));
        this.push('\n\n');
        this.n++;
    }

    override _transform(chunk: any, encoding: BufferEncoding, callback: TransformCallback): void {
        if (chunk === null) {
            this.push(null);
            callback();

            return;
        }

        this.encodeRoutine(chunk);

        callback();
    }
}

/** Narrows `write()` so event-shaped objects and strings are accepted with precise typing. */
export interface OutputServerEventStream extends Transform {
    write(chunk: string | {
        event?: string;
        data?: any;
        id?: string;
        retry?: number;
    }, callback?: (error: Error | null | undefined) => void): boolean;
    write(chunk: any, callback?: (error: Error | null | undefined) => void): boolean;
    write(chunk: any, encoding: BufferEncoding, callback?: (error: Error | null | undefined) => void): boolean;
}
|
|
File without changes
|
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { GlobalAsyncContext } from 'civkit/async-context';
|
| 2 |
+
import { container, singleton } from 'tsyringe';
|
| 3 |
+
|
| 4 |
+
/** Async context service registered as a tsyringe singleton on top of civkit's `GlobalAsyncContext`. */
@singleton()
export class AsyncLocalContext extends GlobalAsyncContext { }

// Resolve the shared instance eagerly and expose it on `process` as `asyncLocalContext`.
const sharedAsyncLocalContext = container.resolve(AsyncLocalContext);
Reflect.set(process, 'asyncLocalContext', sharedAsyncLocalContext);

export default sharedAsyncLocalContext;
|
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { singleton } from 'tsyringe';
|
| 2 |
+
import { AsyncService } from 'civkit/async-service';
|
| 3 |
+
import { GlobalLogger } from './logger';
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
/**
 * Watchdog that tracks whether requests are still completing successfully.
 *
 * Callers report activity through `incomingRequest()`, `doneWithRequest()` and
 * `itWorked()`. When `NODE_ENV` starts with `prod`, `routine()` runs every 15
 * seconds and emits `error` after three consecutive strikes.
 */
@singleton()
export class BlackHoleDetector extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });
    /** Timestamp (ms) of the last reported success. */
    lastWorkedTs?: number;
    /** Timestamp (ms) of the last finished request. */
    lastDoneRequestTs?: number;
    /** Timestamp (ms) of the last incoming request. */
    lastIncomingRequestTs?: number;

    /** Base silence window in ms; each strike extends the window by another multiple. */
    maxDelay = 1000 * 30;
    /** Requests currently in flight. */
    concurrentRequests = 0;

    /** Consecutive detections since the last success. */
    strikes = 0;

    constructor(protected globalLogger: GlobalLogger) {
        super(...arguments);

        if (process.env.NODE_ENV?.startsWith('prod')) {
            // unref() so the timer alone does not keep the process alive.
            setInterval(() => {
                this.routine();
            }, 1000 * 15).unref();
        }
    }

    override async init() {
        await this.dependencyReady();
        this.logger.debug('BlackHoleDetector started');
        this.emit('ready');
    }

    /**
     * Periodic check. Adds a strike when requests are in flight, the latest
     * request arrived at or after the last success, and the time since that
     * success exceeds `maxDelay * (strikes + 1)`. Emits `error` once the strike
     * count reaches three.
     */
    routine() {
        const now = Date.now();
        const lastWorked = this.lastWorkedTs;
        if (!lastWorked) {
            return;
        }
        const dt = (now - lastWorked);
        if (this.concurrentRequests > 0 &&
            this.lastIncomingRequestTs &&
            this.lastIncomingRequestTs >= lastWorked &&
            (dt > (this.maxDelay * (this.strikes + 1)))
        ) {
            this.logger.warn(`BlackHole detected, last worked: ${Math.ceil(dt / 1000)}s ago, concurrentRequests: ${this.concurrentRequests}`);
            this.strikes += 1;
        }

        if (this.strikes >= 3) {
            const message = `BlackHole detected for ${this.strikes} strikes, last worked: ${Math.ceil(dt / 1000)}s ago, concurrentRequests: ${this.concurrentRequests}`;
            this.logger.error(message);
            this.emit('error', new Error(message));
        }
    }

    /** Records an incoming request; the first one also seeds `lastWorkedTs`. */
    incomingRequest() {
        this.lastIncomingRequestTs = Date.now();
        this.lastWorkedTs ??= Date.now();
        this.concurrentRequests++;
    }

    /** Records a finished request. */
    doneWithRequest() {
        // Never drop below zero: an unbalanced call would otherwise keep the
        // `concurrentRequests > 0` condition in routine() false and mute detection.
        this.concurrentRequests = Math.max(0, this.concurrentRequests - 1);
        this.lastDoneRequestTs = Date.now();
    }

    /** Records a success and clears the strike count. */
    itWorked() {
        this.lastWorkedTs = Date.now();
        this.strikes = 0;
    }

}
|
|
@@ -7,6 +7,7 @@ import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip';
|
|
| 7 |
import { AsyncContext } from '../shared';
|
| 8 |
import { WebSearchOptionalHeaderOptions } from '../shared/3rd-party/brave-types';
|
| 9 |
import type { Request, Response } from 'express';
|
|
|
|
| 10 |
|
| 11 |
@singleton()
|
| 12 |
export class BraveSearchService extends AsyncService {
|
|
@@ -20,6 +21,7 @@ export class BraveSearchService extends AsyncService {
|
|
| 20 |
protected secretExposer: SecretExposer,
|
| 21 |
protected geoipControl: GeoIPService,
|
| 22 |
protected threadLocal: AsyncContext,
|
|
|
|
| 23 |
) {
|
| 24 |
super(...arguments);
|
| 25 |
}
|
|
@@ -69,6 +71,7 @@ export class BraveSearchService extends AsyncService {
|
|
| 69 |
while (maxTries--) {
|
| 70 |
try {
|
| 71 |
const r = await this.braveSearchHTTP.webSearch(encoded, { headers: extraHeaders as Record<string, string> });
|
|
|
|
| 72 |
|
| 73 |
return r.parsed;
|
| 74 |
} catch (err: any) {
|
|
|
|
| 7 |
import { AsyncContext } from '../shared';
|
| 8 |
import { WebSearchOptionalHeaderOptions } from '../shared/3rd-party/brave-types';
|
| 9 |
import type { Request, Response } from 'express';
|
| 10 |
+
import { BlackHoleDetector } from './blackhole-detector';
|
| 11 |
|
| 12 |
@singleton()
|
| 13 |
export class BraveSearchService extends AsyncService {
|
|
|
|
| 21 |
protected secretExposer: SecretExposer,
|
| 22 |
protected geoipControl: GeoIPService,
|
| 23 |
protected threadLocal: AsyncContext,
|
| 24 |
+
protected blackHoleDetector: BlackHoleDetector,
|
| 25 |
) {
|
| 26 |
super(...arguments);
|
| 27 |
}
|
|
|
|
| 71 |
while (maxTries--) {
|
| 72 |
try {
|
| 73 |
const r = await this.braveSearchHTTP.webSearch(encoded, { headers: extraHeaders as Record<string, string> });
|
| 74 |
+
this.blackHoleDetector.itWorked();
|
| 75 |
|
| 76 |
return r.parsed;
|
| 77 |
} catch (err: any) {
|
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { container, singleton } from 'tsyringe';
|
| 2 |
+
import { AsyncService } from 'civkit/async-service';
|
| 3 |
+
import { Logger, SecretExposer } from '../shared';
|
| 4 |
+
import { CloudFlareHTTP } from '../shared/3rd-party/cloud-flare';
|
| 5 |
+
|
| 6 |
+
/** Service wrapping `CloudFlareHTTP` to fetch browser-rendered HTML for a URL. */
@singleton()
export class CFBrowserRendering extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });
    /** Created in `init()` from the `CLOUD_FLARE_API_KEY` secret. */
    client!: CloudFlareHTTP;

    constructor(
        protected globalLogger: Logger,
        protected secretExposer: SecretExposer,
    ) {
        super(...arguments);
    }

    /**
     * Waits for dependencies, then builds the client from `CLOUD_FLARE_API_KEY`,
     * which is split on `:` into account and key.
     *
     * @throws Error when `CLOUD_FLARE_API_KEY` is missing or empty.
     */
    override async init() {
        await this.dependencyReady();

        const credential = this.secretExposer.CLOUD_FLARE_API_KEY;
        if (!credential) {
            // Fail with a readable message instead of the opaque
            // "undefined is not iterable" raised by destructuring `undefined`.
            throw new Error('CLOUD_FLARE_API_KEY is not configured; expected "<account>:<key>"');
        }

        const [account, key] = credential.split(':');
        this.client = new CloudFlareHTTP(account, key);

        this.emit('ready');
    }

    /**
     * Fetches the browser-rendered HTML of `url`.
     *
     * @returns The `result` field of the parsed response.
     */
    async fetchContent(url: string) {
        const r = await this.client.fetchBrowserRenderedHTML({ url });

        return r.parsed.result;
    }

}
|
| 35 |
+
|
| 36 |
+
const instance = container.resolve(CFBrowserRendering);
|
| 37 |
+
|
| 38 |
+
export default instance;
|
|
@@ -0,0 +1,387 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { marshalErrorLike } from 'civkit/lang';
|
| 2 |
+
import { AsyncService } from 'civkit/async-service';
|
| 3 |
+
import { singleton } from 'tsyringe';
|
| 4 |
+
|
| 5 |
+
import { Curl, CurlCode, CurlFeature, HeaderInfo } from 'node-libcurl';
|
| 6 |
+
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
| 7 |
+
|
| 8 |
+
import { ScrappingOptions } from './puppeteer';
|
| 9 |
+
import { Logger } from '../shared/services/logger';
|
| 10 |
+
import { AssertionFailureError, FancyFile } from 'civkit';
|
| 11 |
+
import { ServiceBadAttemptError, TempFileManager } from '../shared';
|
| 12 |
+
import { createBrotliDecompress, createInflate, createGunzip } from 'zlib';
|
| 13 |
+
import { ZSTDDecompress } from 'simple-zstd';
|
| 14 |
+
import _ from 'lodash';
|
| 15 |
+
import { Readable } from 'stream';
|
| 16 |
+
import { AsyncLocalContext } from './async-context';
|
| 17 |
+
|
| 18 |
+
export interface CURLScrappingOptions extends ScrappingOptions {
|
| 19 |
+
method?: string;
|
| 20 |
+
body?: string | Buffer;
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
@singleton()
|
| 24 |
+
export class CurlControl extends AsyncService {
|
| 25 |
+
|
| 26 |
+
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 27 |
+
|
| 28 |
+
chromeVersion: string = `132`;
|
| 29 |
+
safariVersion: string = `537.36`;
|
| 30 |
+
platform: string = `Linux`;
|
| 31 |
+
ua: string = `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/${this.safariVersion} (KHTML, like Gecko) Chrome/${this.chromeVersion}.0.0.0 Safari/${this.safariVersion}`;
|
| 32 |
+
|
| 33 |
+
lifeCycleTrack = new WeakMap();
|
| 34 |
+
|
| 35 |
+
constructor(
|
| 36 |
+
protected globalLogger: Logger,
|
| 37 |
+
protected tempFileManager: TempFileManager,
|
| 38 |
+
protected asyncLocalContext: AsyncLocalContext,
|
| 39 |
+
) {
|
| 40 |
+
super(...arguments);
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
override async init() {
|
| 44 |
+
await this.dependencyReady();
|
| 45 |
+
|
| 46 |
+
if (process.platform === 'darwin') {
|
| 47 |
+
this.platform = `macOS`;
|
| 48 |
+
} else if (process.platform === 'win32') {
|
| 49 |
+
this.platform = `Windows`;
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
this.emit('ready');
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
/**
 * Adopts `ua` as the User-Agent to impersonate and derives the Chrome and
 * AppleWebKit versions from it.
 *
 * A version that cannot be found in `ua` keeps its previous value, so a
 * non-Chrome UA no longer throws half-way through the update.
 *
 * @param ua Full User-Agent string.
 */
impersonateChrome(ua: string) {
    const chromeVersion = ua.match(/Chrome\/(\d+)/)?.[1];
    const safariVersion = ua.match(/AppleWebKit\/([\d\.]+)/)?.[1];

    if (chromeVersion) {
        this.chromeVersion = chromeVersion;
    }
    if (safariVersion) {
        this.safariVersion = safariVersion;
    }
    this.ua = ua;
}
|
| 60 |
+
|
| 61 |
+
/**
 * Sets `HTTPHEADER` on `curl` to a Chrome-like default header set merged with
 * the caller's headers.
 *
 * A caller header whose lower-case name matches a default replaces that
 * default; all remaining caller headers are appended. Array values produce
 * one header line per element.
 *
 * @param curl Curl handle to configure.
 * @param headers Optional extra headers (name -> value).
 * @returns The same `curl` handle.
 */
curlImpersonateHeader(curl: Curl, headers?: object) {
    const mixinHeaders: Record<string, string> = {
        // Client-hint header is `sec-ch-ua`; brand and platform values are quoted strings.
        'sec-ch-ua': `"Not A(Brand";v="8", "Chromium";v="${this.chromeVersion}", "Google Chrome";v="${this.chromeVersion}"`,
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': `"${this.platform}"`,
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': this.ua,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'Accept-Language': 'en-US,en;q=0.9',
    };
    const headersCopy: Record<string, string | undefined> = { ...headers };
    for (const k of Object.keys(mixinHeaders)) {
        const lowerK = k.toLowerCase();
        const supplied = headersCopy[lowerK];
        if (supplied) {
            mixinHeaders[k] = supplied;
            delete headersCopy[lowerK];
        }
    }
    for (const [k, v] of Object.entries(headersCopy)) {
        // Skip unset values so no literal "undefined" header value is sent.
        if (v == null) {
            continue;
        }
        mixinHeaders[k] = v;
    }

    curl.setOpt(Curl.option.HTTPHEADER, Object.entries(mixinHeaders).flatMap(([k, v]) => {
        if (Array.isArray(v) && v.length) {
            return v.map((v2) => `${k}: ${v2}`);
        }
        return [`${k}: ${v}`];
    }));

    return curl;
}
|
| 95 |
+
|
| 96 |
+
urlToFile1Shot(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
|
| 97 |
+
return new Promise<{
|
| 98 |
+
statusCode: number,
|
| 99 |
+
data?: FancyFile,
|
| 100 |
+
headers: HeaderInfo[],
|
| 101 |
+
}>((resolve, reject) => {
|
| 102 |
+
let contentType = '';
|
| 103 |
+
const curl = new Curl();
|
| 104 |
+
curl.enable(CurlFeature.StreamResponse);
|
| 105 |
+
curl.setOpt('URL', urlToCrawl.toString());
|
| 106 |
+
curl.setOpt(Curl.option.FOLLOWLOCATION, false);
|
| 107 |
+
curl.setOpt(Curl.option.SSL_VERIFYPEER, false);
|
| 108 |
+
curl.setOpt(Curl.option.TIMEOUT_MS, Math.min(30_000, crawlOpts?.timeoutMs || 30_000));
|
| 109 |
+
curl.setOpt(Curl.option.CONNECTTIMEOUT_MS, 3_000);
|
| 110 |
+
if (crawlOpts?.method) {
|
| 111 |
+
curl.setOpt(Curl.option.CUSTOMREQUEST, crawlOpts.method.toUpperCase());
|
| 112 |
+
}
|
| 113 |
+
if (crawlOpts?.body) {
|
| 114 |
+
curl.setOpt(Curl.option.POSTFIELDS, crawlOpts.body.toString());
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
const headersToSet = { ...crawlOpts?.extraHeaders };
|
| 118 |
+
if (crawlOpts?.cookies?.length) {
|
| 119 |
+
const cookieChunks = crawlOpts.cookies.map((cookie) => `${cookie.name}=${encodeURIComponent(cookie.value)}`);
|
| 120 |
+
headersToSet.cookie ??= cookieChunks.join('; ');
|
| 121 |
+
}
|
| 122 |
+
if (crawlOpts?.referer) {
|
| 123 |
+
headersToSet.referer ??= crawlOpts.referer;
|
| 124 |
+
}
|
| 125 |
+
if (crawlOpts?.overrideUserAgent) {
|
| 126 |
+
headersToSet['user-agent'] ??= crawlOpts.overrideUserAgent;
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
this.curlImpersonateHeader(curl, headersToSet);
|
| 130 |
+
|
| 131 |
+
if (crawlOpts?.proxyUrl) {
|
| 132 |
+
const proxyUrlCopy = new URL(crawlOpts.proxyUrl);
|
| 133 |
+
curl.setOpt(Curl.option.PROXY, proxyUrlCopy.href);
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
let curlStream: Readable | undefined;
|
| 137 |
+
curl.on('error', (err, errCode) => {
|
| 138 |
+
curl.close();
|
| 139 |
+
this.logger.warn(`Curl ${urlToCrawl.origin}: ${err}`, { err: marshalErrorLike(err), urlToCrawl });
|
| 140 |
+
if (curlStream) {
|
| 141 |
+
// For some reason, manually emitting error event is required for curlStream.
|
| 142 |
+
curlStream.emit('error', err);
|
| 143 |
+
curlStream.destroy(err);
|
| 144 |
+
}
|
| 145 |
+
const err2 = this.digestCurlCode(errCode, err.message);
|
| 146 |
+
if (err2) {
|
| 147 |
+
reject(err2);
|
| 148 |
+
return;
|
| 149 |
+
}
|
| 150 |
+
reject(new AssertionFailureError(`Failed to access ${urlToCrawl.origin}: ${err.message}`));
|
| 151 |
+
});
|
| 152 |
+
curl.setOpt(Curl.option.MAXFILESIZE, 4 * 1024 * 1024 * 1024); // 4GB
|
| 153 |
+
let status = -1;
|
| 154 |
+
let contentEncoding = '';
|
| 155 |
+
curl.once('end', () => {
|
| 156 |
+
if (curlStream) {
|
| 157 |
+
curlStream.once('end', () => curl.close());
|
| 158 |
+
return;
|
| 159 |
+
}
|
| 160 |
+
curl.close();
|
| 161 |
+
});
|
| 162 |
+
curl.on('stream', (stream, statusCode, headers) => {
|
| 163 |
+
this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl.origin}`, { statusCode });
|
| 164 |
+
status = statusCode;
|
| 165 |
+
curlStream = stream;
|
| 166 |
+
for (const headerSet of (headers as HeaderInfo[])) {
|
| 167 |
+
for (const [k, v] of Object.entries(headerSet)) {
|
| 168 |
+
if (k.trim().endsWith(':')) {
|
| 169 |
+
Reflect.set(headerSet, k.slice(0, k.indexOf(':')), v || '');
|
| 170 |
+
Reflect.deleteProperty(headerSet, k);
|
| 171 |
+
continue;
|
| 172 |
+
}
|
| 173 |
+
if (v === undefined) {
|
| 174 |
+
Reflect.set(headerSet, k, '');
|
| 175 |
+
continue;
|
| 176 |
+
}
|
| 177 |
+
if (k.toLowerCase() === 'content-type' && typeof v === 'string') {
|
| 178 |
+
contentType = v.toLowerCase();
|
| 179 |
+
}
|
| 180 |
+
}
|
| 181 |
+
}
|
| 182 |
+
const lastResHeaders = headers[headers.length - 1];
|
| 183 |
+
for (const [k, v] of Object.entries(lastResHeaders)) {
|
| 184 |
+
const kl = k.toLowerCase();
|
| 185 |
+
if (kl === 'content-type') {
|
| 186 |
+
contentType = v.toLowerCase();
|
| 187 |
+
}
|
| 188 |
+
if (kl === 'content-encoding') {
|
| 189 |
+
contentEncoding = v.toLowerCase();
|
| 190 |
+
}
|
| 191 |
+
if (contentType && contentEncoding) {
|
| 192 |
+
break;
|
| 193 |
+
}
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
if ([301, 302, 307, 308].includes(statusCode)) {
|
| 197 |
+
if (stream) {
|
| 198 |
+
stream.resume();
|
| 199 |
+
}
|
| 200 |
+
resolve({
|
| 201 |
+
statusCode: status,
|
| 202 |
+
data: undefined,
|
| 203 |
+
headers: headers as HeaderInfo[],
|
| 204 |
+
});
|
| 205 |
+
return;
|
| 206 |
+
}
|
| 207 |
+
|
| 208 |
+
if (!stream) {
|
| 209 |
+
resolve({
|
| 210 |
+
statusCode: status,
|
| 211 |
+
data: undefined,
|
| 212 |
+
headers: headers as HeaderInfo[],
|
| 213 |
+
});
|
| 214 |
+
return;
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
switch (contentEncoding) {
|
| 218 |
+
case 'gzip': {
|
| 219 |
+
const decompressed = createGunzip();
|
| 220 |
+
stream.pipe(decompressed);
|
| 221 |
+
stream.once('error', (err) => {
|
| 222 |
+
decompressed.destroy(err);
|
| 223 |
+
});
|
| 224 |
+
stream = decompressed;
|
| 225 |
+
break;
|
| 226 |
+
}
|
| 227 |
+
case 'deflate': {
|
| 228 |
+
const decompressed = createInflate();
|
| 229 |
+
stream.pipe(decompressed);
|
| 230 |
+
stream.once('error', (err) => {
|
| 231 |
+
decompressed.destroy(err);
|
| 232 |
+
});
|
| 233 |
+
stream = decompressed;
|
| 234 |
+
break;
|
| 235 |
+
}
|
| 236 |
+
case 'br': {
|
| 237 |
+
const decompressed = createBrotliDecompress();
|
| 238 |
+
stream.pipe(decompressed);
|
| 239 |
+
stream.once('error', (err) => {
|
| 240 |
+
decompressed.destroy(err);
|
| 241 |
+
});
|
| 242 |
+
stream = decompressed;
|
| 243 |
+
break;
|
| 244 |
+
}
|
| 245 |
+
case 'zstd': {
|
| 246 |
+
const decompressed = ZSTDDecompress();
|
| 247 |
+
stream.pipe(decompressed);
|
| 248 |
+
stream.once('error', (err) => {
|
| 249 |
+
decompressed.destroy(err);
|
| 250 |
+
});
|
| 251 |
+
stream = decompressed;
|
| 252 |
+
break;
|
| 253 |
+
}
|
| 254 |
+
default: {
|
| 255 |
+
break;
|
| 256 |
+
}
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
const fpath = this.tempFileManager.alloc();
|
| 260 |
+
const fancyFile = FancyFile.auto(stream, fpath);
|
| 261 |
+
this.tempFileManager.bindPathTo(fancyFile, fpath);
|
| 262 |
+
resolve({
|
| 263 |
+
statusCode: status,
|
| 264 |
+
data: fancyFile,
|
| 265 |
+
headers: headers as HeaderInfo[],
|
| 266 |
+
});
|
| 267 |
+
});
|
| 268 |
+
|
| 269 |
+
curl.perform();
|
| 270 |
+
});
|
| 271 |
+
}
|
| 272 |
+
|
| 273 |
+
/**
 * Fetches `urlToCrawl` with curl, following 301/302/307/308 redirects manually.
 *
 * Each hop is performed by `urlToFile1Shot`. On a redirect, the `Location`
 * header of the last header set is resolved against the current hop URL, any
 * `Set-Cookie` values are parsed and appended to the cookies sent on the next
 * hop, and the hop's header sets are accumulated. At most 10 hops are made.
 *
 * @param urlToCrawl URL of the first request.
 * @param crawlOpts Optional request options; shallow-copied, so the caller's
 * object is not reassigned when cookies are appended.
 * @returns The final status code, the final body (if any), and the header
 * sets of every hop in order (redirect hops first, final response last).
 * @throws AssertionFailureError when a redirect carries no `Location` header
 * or when the hop budget is exhausted.
 */
async urlToFile(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
    const redirectStatuses = [301, 302, 307, 308];
    const hopOptions = { ...crawlOpts };
    const headerTrail: HeaderInfo[] = [];
    let currentUrl = urlToCrawl;

    for (let hopsLeft = 10; hopsLeft > 0; hopsLeft -= 1) {
        const response = await this.urlToFile1Shot(currentUrl, hopOptions);

        // Anything that is not a redirect ends the chain.
        if (!redirectStatuses.includes(response.statusCode)) {
            return {
                statusCode: response.statusCode,
                data: response.data,
                headers: headerTrail.concat(response.headers),
            };
        }

        const hopHeaders = response.headers[response.headers.length - 1];
        const target = hopHeaders.Location || hopHeaders.location;
        if (!target) {
            throw new AssertionFailureError(`Failed to access ${urlToCrawl}: Bad redirection from ${currentUrl}`);
        }

        // Carry cookies issued by this hop over to the next request.
        const rawSetCookie = hopHeaders['Set-Cookie'] || hopHeaders['set-cookie'];
        if (rawSetCookie) {
            const assignments = Array.isArray(rawSetCookie) ? rawSetCookie : [rawSetCookie];
            const freshCookies = assignments.filter(Boolean).map((x) => parseSetCookieString(x, { decodeValues: true }));
            if (freshCookies.length) {
                hopOptions.cookies = [...(hopOptions.cookies || []), ...freshCookies];
            }
        }

        // Location may be relative; resolve it against the hop that issued it.
        currentUrl = new URL(target, currentUrl);
        headerTrail.push(...response.headers);
    }

    throw new AssertionFailureError(`Failed to access ${urlToCrawl}: Too many redirections.`);
}
|
| 312 |
+
|
| 313 |
+
/**
 * Fetches `targetUrl` via `urlToFile` and packages the result for side-loading.
 *
 * Walks every header set of the redirect chain, recording for each hop URL an
 * `impersonate` entry (status, headers without `result`, content type) and,
 * when a proxy was used, a `proxyOrigin` entry for that hop's origin. The
 * downloaded body is attached to the entry of the final URL when it is
 * non-empty.
 *
 * @param targetUrl URL to fetch.
 * @param crawlOpts Optional request options forwarded to `urlToFile`.
 * @returns The final URL, the assembled side-load options, the full header
 * chain, the final status and headers, the resolved content type, the
 * `Content-Disposition` value, a derived file name, and the downloaded file.
 * @throws Error when a redirect hop in the chain has no `Location` header.
 */
async sideLoad(targetUrl: URL, crawlOpts?: CURLScrappingOptions) {
    const curlResult = await this.urlToFile(targetUrl, crawlOpts);

    let finalURL = targetUrl;
    const sideLoadOpts: CURLScrappingOptions['sideLoad'] = {
        impersonate: {},
        proxyOrigin: {},
    };
    for (const headers of curlResult.headers) {
        sideLoadOpts.impersonate[finalURL.href] = {
            status: headers.result?.code || -1,
            headers: _.omit(headers, 'result'),
            contentType: headers['Content-Type'] || headers['content-type'],
        };
        if (crawlOpts?.proxyUrl) {
            sideLoadOpts.proxyOrigin[finalURL.origin] = crawlOpts.proxyUrl;
        }
        if (headers.result?.code && [301, 302, 307, 308].includes(headers.result.code)) {
            const location = headers.Location || headers.location;
            if (!location) {
                throw new Error(`Bad redirection: ${curlResult.headers.length} times`);
            }
            // Advance to the redirect target so the next header set is keyed by its own URL.
            finalURL = new URL(location, finalURL);
        }
    }
    const lastHeaders = curlResult.headers[curlResult.headers.length - 1];
    // The Content-Type header may be absent; optional chaining lets the lookup fall through
    // to the sniffed MIME type and then to the generic default instead of throwing.
    const contentType = (lastHeaders['Content-Type'] || lastHeaders['content-type'])?.toLowerCase() || (await curlResult.data?.mimeType) || 'application/octet-stream';
    const contentDisposition = lastHeaders['Content-Disposition'] || lastHeaders['content-disposition'];
    // Prefer a quoted filename from Content-Disposition; otherwise use the last path segment.
    const fileName = contentDisposition?.match(/filename="([^"]+)"/i)?.[1] || finalURL.pathname.split('/').pop();

    if (sideLoadOpts.impersonate[finalURL.href] && (await curlResult.data?.size)) {
        sideLoadOpts.impersonate[finalURL.href].body = curlResult.data;
    }

    // This should keep the file from being garbage collected and deleted until this asyncContext/request is done.
    this.lifeCycleTrack.set(this.asyncLocalContext.ctx, curlResult.data);

    return {
        finalURL,
        sideLoadOpts,
        chain: curlResult.headers,
        status: curlResult.statusCode,
        headers: lastHeaders,
        contentType,
        contentDisposition,
        fileName,
        file: curlResult.data
    };
}
|
| 362 |
+
|
| 363 |
+
/**
 * Translates a libcurl result code into an application error, if one applies.
 *
 * @param code The curl result code reported with the failure.
 * @param msg Message to attach to the produced error.
 * @returns An `AssertionFailureError` for codes treated as user errors, a
 * `ServiceBadAttemptError` for codes treated as retryable, or `undefined`
 * for any other code.
 */
digestCurlCode(code: CurlCode, msg: string) {
    // 400 User errors
    const userErrorCodes = [
        CurlCode.CURLE_GOT_NOTHING,
        CurlCode.CURLE_COULDNT_RESOLVE_HOST,
        CurlCode.CURLE_REMOTE_ACCESS_DENIED,
    ];
    if (userErrorCodes.includes(code)) {
        return new AssertionFailureError(msg);
    }

    // Retryable errors
    const retryableCodes = [
        CurlCode.CURLE_SSL_CONNECT_ERROR,
        CurlCode.CURLE_QUIC_CONNECT_ERROR,
        CurlCode.CURLE_COULDNT_RESOLVE_PROXY,
        CurlCode.CURLE_COULDNT_CONNECT,
        CurlCode.CURLE_PARTIAL_FILE,
        CurlCode.CURLE_OPERATION_TIMEDOUT,
    ];
    if (retryableCodes.includes(code)) {
        return new ServiceBadAttemptError(msg);
    }

    return undefined;
}
|
| 387 |
+
}
|
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { ApplicationError, Prop, RPC_TRANSFER_PROTOCOL_META_SYMBOL, StatusCode } from 'civkit/civ-rpc';
|
| 2 |
+
import _ from 'lodash';
|
| 3 |
+
import dayjs from 'dayjs';
|
| 4 |
+
import utc from 'dayjs/plugin/utc';
|
| 5 |
+
|
| 6 |
+
// Register the UTC plugin so that `dayjs(...).utc()` is available;
// RateLimitTriggeredError in this file uses it to format its Retry-After date.
dayjs.extend(utc);
|
| 7 |
+
|
| 8 |
+
// Application-level error types.
// Each class below is an empty subclass of civkit's ApplicationError; the only
// things that distinguish them are the class name and the numeric code passed
// to the @StatusCode decorator.
// NOTE(review): the codes look like `<HTTP status><two-digit sub-code>`
// (e.g. 50301 -> 503, 40201 -> 402) — confirm against civkit's StatusCode.
@StatusCode(50301)
|
| 9 |
+
export class ServiceDisabledError extends ApplicationError { }
|
| 10 |
+
|
| 11 |
+
@StatusCode(50302)
|
| 12 |
+
export class ServiceCrashedError extends ApplicationError { }
|
| 13 |
+
|
| 14 |
+
@StatusCode(50303)
|
| 15 |
+
export class ServiceNodeResourceDrainError extends ApplicationError { }
|
| 16 |
+
|
| 17 |
+
@StatusCode(40104)
|
| 18 |
+
export class EmailUnverifiedError extends ApplicationError { }
|
| 19 |
+
|
| 20 |
+
@StatusCode(40201)
|
| 21 |
+
export class InsufficientCreditsError extends ApplicationError { }
|
| 22 |
+
|
| 23 |
+
@StatusCode(40202)
|
| 24 |
+
export class FreeFeatureLimitError extends ApplicationError { }
|
| 25 |
+
|
| 26 |
+
@StatusCode(40203)
|
| 27 |
+
export class InsufficientBalanceError extends ApplicationError { }
|
| 28 |
+
|
| 29 |
+
@StatusCode(40903)
|
| 30 |
+
export class LockConflictError extends ApplicationError { }
|
| 31 |
+
|
| 32 |
+
@StatusCode(40904)
|
| 33 |
+
export class BudgetExceededError extends ApplicationError { }
|
| 34 |
+
|
| 35 |
+
@StatusCode(45101)
|
| 36 |
+
export class HarmfulContentError extends ApplicationError { }
|
| 37 |
+
|
| 38 |
+
@StatusCode(45102)
|
| 39 |
+
export class SecurityCompromiseError extends ApplicationError { }
|
| 40 |
+
|
| 41 |
+
@StatusCode(41201)
|
| 42 |
+
export class BatchSizeTooLargeError extends ApplicationError { }
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
/**
 * Error raised when a rate limit is hit. Optionally carries a hint for when
 * the caller may retry, either as a number of seconds or as an absolute date.
 *
 * When either hint is set, the transfer-protocol metadata is extended with a
 * `Retry-After` header. `retryAfter` (seconds) takes precedence over
 * `retryAfterDate` when both are set and `retryAfter` is non-zero.
 */
@StatusCode(42903)
export class RateLimitTriggeredError extends ApplicationError {

    @Prop({
        desc: 'Retry after seconds',
    })
    retryAfter?: number;

    @Prop({
        desc: 'Retry after date',
    })
    retryAfterDate?: Date;

    /**
     * Transfer-protocol metadata of the base class, with a `Retry-After`
     * header merged in when a retry hint is available. The base metadata is
     * deep-cloned before merging so it is never mutated.
     */
    protected override get [RPC_TRANSFER_PROTOCOL_META_SYMBOL]() {
        const retryAfter = this.retryAfter || this.retryAfterDate;
        if (!retryAfter) {
            return super[RPC_TRANSFER_PROTOCOL_META_SYMBOL];
        }

        // Retry-After accepts either an HTTP-date or a non-negative integer number of seconds.
        // Round fractional seconds up (never tell the client to retry too early) and clamp
        // negatives to 0 so the emitted header is always well-formed.
        const retryAfterValue = retryAfter instanceof Date
            ? dayjs(retryAfter).utc().format('ddd, DD MMM YYYY HH:mm:ss [GMT]')
            : `${Math.max(0, Math.ceil(retryAfter))}`;

        return _.merge(_.cloneDeep(super[RPC_TRANSFER_PROTOCOL_META_SYMBOL]), {
            headers: {
                'Retry-After': retryAfterValue,
            }
        });
    }
}
|
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { AbstractFinalizerService } from 'civkit/finalizer';
|
| 2 |
+
import { container, singleton } from 'tsyringe';
|
| 3 |
+
import { isMainThread } from 'worker_threads';
|
| 4 |
+
import { GlobalLogger } from './logger';
|
| 5 |
+
|
| 6 |
+
/**
 * Finalizer service for this application: a tsyringe singleton built on
 * civkit's AbstractFinalizerService. It exposes the global tsyringe container
 * and a child logger labelled with this class's name.
 */
@singleton()
|
| 7 |
+
export class FinalizerService extends AbstractFinalizerService {
|
| 8 |
+
|
| 9 |
+
// The global tsyringe container imported at the top of the file.
container = container;
|
| 10 |
+
// Child logger tagged with `service: <class name>`. This initializer relies on
// `globalLogger`, the parameter property declared by the constructor below.
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 11 |
+
|
| 12 |
+
constructor(protected globalLogger: GlobalLogger) {
|
| 13 |
+
// Forward every constructor argument to the base class unchanged.
super(...arguments);
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
// Resolve the FinalizerService singleton from the tsyringe container at module load.
const instance = container.resolve(FinalizerService);
|
| 19 |
+
// Export the `Finalizer` decorator produced by the instance's `decorators()` call.
export const { Finalizer } = instance.decorators();
|
| 20 |
+
export default instance;
|
| 21 |
+
|
| 22 |
+
// Only the main thread calls serviceReady(); worker threads import this module
// without making that call.
if (isMainThread) {
|
| 23 |
+
instance.serviceReady();
|
| 24 |
+
}
|
|
File without changes
|