nomagick committed on
Commit
23a3b80
·
unverified ·
1 Parent(s): ed80c9a

restructure: nolonger a firebase application (#1160)

Browse files

* fix: fine allow redefining Function.prototype.toString

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* fix: contentType encoding

* wip

* fix: error throwing

* wip

* fix

* wip

* fix

* fix

* fix: jsdom

* wip

* wip

* fix: links summary uniqueness

* wip

* wip

* robots-txt catch no robots.txt

* deps: remove puppeteer-extra-plugin-stealth

* fix: dont change waring type

* fix: curl

* fix: replace firebase-roundtrip-check with blackhole-detector

* fix: black hole detection

* sercher: black hole detecting

* fix: no h2c for searcher

* fix: bhd

* fix: search and crawl conflict

* fix: bhd

* fix

* fix: server script

* canvas: fixed avif issue

* logging: move some to debug

* fix

* fix: pptr declare ready only when page can be created without issues

* fix: bhd

* cd: cloud run deploy-health-check cannot complete pptr newPage

* cd: fix

* fix: curl body can be null

* fix

* fix

* fix: major fix regarding TC pdfs

* fix

* fix

* deps: fix civkit trie router issue

* fix

* boom: total restructure

* cd: fix docker ctx

* fix

* fix: switch to h2c

* cd: ensure http2

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .github/workflows/cd.yml +5 -7
  2. .gitignore +77 -2
  3. .vscode/launch.json +31 -28
  4. .vscode/tasks.json +3 -129
  5. backend/functions/Dockerfile → Dockerfile +0 -0
  6. README.md +0 -4
  7. backend/.firebaserc +0 -5
  8. backend/.gitignore +0 -79
  9. backend/firebase.json +0 -43
  10. backend/firestore.indexes.json +0 -19
  11. backend/firestore.rules +0 -32
  12. backend/functions/.dockerignore +0 -1
  13. backend/functions/.editorconfig +0 -36
  14. backend/functions/.env.example +0 -0
  15. backend/functions/.puppeteerrc.cjs +0 -9
  16. backend/functions/package.json +0 -93
  17. backend/functions/src/services/curl.ts +0 -218
  18. backend/functions/src/shared +0 -1
  19. backend/storage.rules +0 -8
  20. backend/functions/integrity-check.cjs → integrity-check.cjs +0 -0
  21. backend/functions/package-lock.json → package-lock.json +155 -83
  22. package.json +83 -14
  23. {backend/functions/public → public}/favicon.ico +0 -0
  24. {backend/functions/src/cloud-functions → src/api}/crawler.ts +284 -97
  25. {backend/functions/src/cloud-functions → src/api}/searcher-serper.ts +56 -56
  26. {backend/functions/src/cloud-functions → src/api}/searcher.ts +45 -47
  27. {backend/functions/src → src}/cloud-functions/adaptive-crawler.ts +1 -1
  28. {backend/functions/src → src}/cloud-functions/data-crunching.ts +1 -1
  29. {backend/functions/src → src}/db/adaptive-crawl-task.ts +0 -0
  30. {backend/functions/src → src}/db/crawled.ts +0 -0
  31. {backend/functions/src → src}/db/domain-blockade.ts +0 -0
  32. {backend/functions/src → src}/db/domain-profile.ts +1 -1
  33. {backend/functions/src → src}/db/img-alt.ts +0 -0
  34. {backend/functions/src → src}/db/pdf.ts +0 -0
  35. {backend/functions/src → src}/db/searched.ts +0 -0
  36. {backend/functions/src → src}/dto/adaptive-crawler-options.ts +0 -0
  37. backend/functions/src/dto/scrapping-options.ts → src/dto/crawler-options.ts +61 -46
  38. src/dto/jina-embeddings-auth.ts +216 -0
  39. {backend/functions/src → src}/fetch.d.ts +0 -0
  40. {backend/functions/src → src}/index.ts +0 -0
  41. src/lib/transform-server-event-stream.ts +169 -0
  42. {backend/functions/src → src}/services/alt-text.ts +0 -0
  43. src/services/async-context.ts +10 -0
  44. src/services/blackhole-detector.ts +72 -0
  45. {backend/functions/src → src}/services/brave-search.ts +3 -0
  46. src/services/cf-browser-rendering.ts +38 -0
  47. src/services/curl.ts +387 -0
  48. src/services/errors.ts +70 -0
  49. src/services/finalizer.ts +24 -0
  50. {backend/functions/src → src}/services/geoip.ts +0 -0
.github/workflows/cd.yml CHANGED
@@ -14,9 +14,6 @@ jobs:
14
  concurrency:
15
  group: ${{ github.ref_type == 'branch' && github.ref }}
16
  cancel-in-progress: true
17
- defaults:
18
- run:
19
- working-directory: backend/functions
20
  permissions:
21
  contents: read
22
  steps:
@@ -30,6 +27,8 @@ jobs:
30
  credentials_json: '${{ secrets.GCLOUD_SERVICE_ACCOUNT_SECRET_JSON }}'
31
  - name: 'Set up Cloud SDK'
32
  uses: 'google-github-actions/setup-gcloud@v2'
 
 
33
  - name: "Docker auth"
34
  run: |-
35
  gcloud auth configure-docker us-docker.pkg.dev --quiet
@@ -40,7 +39,6 @@ jobs:
40
  with:
41
  node-version: 22.12.0
42
  cache: npm
43
- cache-dependency-path: backend/functions/package-lock.json
44
 
45
  - name: npm install
46
  run: npm ci
@@ -65,13 +63,13 @@ jobs:
65
  id: container
66
  uses: docker/build-push-action@v6
67
  with:
68
- context: backend/functions
69
  push: true
70
  tags: ${{ steps.meta.outputs.tags }}
71
  labels: ${{ steps.meta.outputs.labels }}
72
  - name: Deploy CRAWL with Tag
73
  run: |
74
- gcloud run deploy crawl --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region us-central1 --async --min-instances 0
75
  - name: Deploy SEARCH with Tag
76
  run: |
77
- gcloud run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0
 
14
  concurrency:
15
  group: ${{ github.ref_type == 'branch' && github.ref }}
16
  cancel-in-progress: true
 
 
 
17
  permissions:
18
  contents: read
19
  steps:
 
27
  credentials_json: '${{ secrets.GCLOUD_SERVICE_ACCOUNT_SECRET_JSON }}'
28
  - name: 'Set up Cloud SDK'
29
  uses: 'google-github-actions/setup-gcloud@v2'
30
+ with:
31
+ install_components: beta
32
  - name: "Docker auth"
33
  run: |-
34
  gcloud auth configure-docker us-docker.pkg.dev --quiet
 
39
  with:
40
  node-version: 22.12.0
41
  cache: npm
 
42
 
43
  - name: npm install
44
  run: npm ci
 
63
  id: container
64
  uses: docker/build-push-action@v6
65
  with:
66
+ context: .
67
  push: true
68
  tags: ${{ steps.meta.outputs.tags }}
69
  labels: ${{ steps.meta.outputs.labels }}
70
  - name: Deploy CRAWL with Tag
71
  run: |
72
+ gcloud beta run deploy crawl --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2
73
  - name: Deploy SEARCH with Tag
74
  run: |
75
+ gcloud beta run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2
.gitignore CHANGED
@@ -1,4 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  node_modules/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  .DS_Store
3
- /package-lock.json
4
- backend/functions/test.js
 
 
 
 
 
 
 
1
+ # Logs
2
+ logs
3
+ *.log
4
+ npm-debug.log*
5
+ yarn-debug.log*
6
+ yarn-error.log*
7
+ firebase-debug.log*
8
+ firebase-debug.*.log*
9
+
10
+ # Firebase cache
11
+ .firebase/
12
+
13
+ # Firebase config
14
+
15
+ # Uncomment this if you'd like others to create their own Firebase project.
16
+ # For a team working on the same Firebase project(s), it is recommended to leave
17
+ # it commented so all members can deploy to the same project(s) in .firebaserc.
18
+ # .firebaserc
19
+
20
+ # Runtime data
21
+ pids
22
+ *.pid
23
+ *.seed
24
+ *.pid.lock
25
+
26
+ # Directory for instrumented libs generated by jscoverage/JSCover
27
+ lib-cov
28
+
29
+ # Coverage directory used by tools like istanbul
30
+ coverage
31
+
32
+ # nyc test coverage
33
+ .nyc_output
34
+
35
+ # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
36
+ .grunt
37
+
38
+ # Bower dependency directory (https://bower.io/)
39
+ bower_components
40
+
41
+ # node-waf configuration
42
+ .lock-wscript
43
+
44
+ # Compiled binary addons (http://nodejs.org/api/addons.html)
45
+ build/Release
46
+
47
+ # Dependency directories
48
  node_modules/
49
+
50
+ # Optional npm cache directory
51
+ .npm
52
+
53
+ # Optional eslint cache
54
+ .eslintcache
55
+
56
+ # Optional REPL history
57
+ .node_repl_history
58
+
59
+ # Output of 'npm pack'
60
+ *.tgz
61
+
62
+ # Yarn Integrity file
63
+ .yarn-integrity
64
+
65
+ # dotenv environment variables file
66
+ .env
67
+ .secret.local
68
+
69
+ toy*.ts
70
+
71
  .DS_Store
72
+ build/
73
+ .firebase-emu/
74
+ *.log
75
+ .DS_Store
76
+
77
+ *.local
78
+ .secret.*
79
+ licensed/
.vscode/launch.json CHANGED
@@ -1,26 +1,6 @@
1
  {
2
  "version": "0.2.0",
3
  "configurations": [
4
- {
5
- "name": "Debug Fullstack: attach",
6
- "request": "attach",
7
- "cwd": "${workspaceFolder}/backend/functions",
8
- "skipFiles": [
9
- "<node_internals>/**"
10
- ],
11
- "type": "node",
12
- "preLaunchTask": "Fullstack:debug"
13
- },
14
- {
15
- "name": "Debug Fullstack: attach: with proxy",
16
- "request": "attach",
17
- "cwd": "${workspaceFolder}/backend/functions",
18
- "skipFiles": [
19
- "<node_internals>/**"
20
- ],
21
- "type": "node",
22
- "preLaunchTask": "Fullstack:debug:with-proxy"
23
- },
24
  {
25
  "name": "Attach",
26
  "port": 9229,
@@ -40,21 +20,44 @@
40
  "type": "node"
41
  },
42
  {
43
- "name": "Debug Fullstack",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  "request": "launch",
45
  "runtimeArgs": [
46
- "emulators:start",
47
- "--import=../.firebase-emu",
48
- "--export-on-exit=../.firebase-emu",
49
  ],
50
- "cwd": "${workspaceFolder}/backend/functions",
51
- "runtimeExecutable": "${workspaceFolder}/node_modules/.bin/firebase",
 
 
 
 
52
  "skipFiles": [
53
  "<node_internals>/**"
54
  ],
55
  "type": "node",
56
- "preLaunchTask": "Fullstack:prepare",
57
- "killBehavior": "polite"
 
58
  },
59
  ]
60
  }
 
1
  {
2
  "version": "0.2.0",
3
  "configurations": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  {
5
  "name": "Attach",
6
  "port": 9229,
 
20
  "type": "node"
21
  },
22
  {
23
+ "name": "Debug Stand Alone Crawl",
24
+ "request": "launch",
25
+ "runtimeArgs": [
26
+ "--env-file=.secret.local",
27
+ ],
28
+ "env": {
29
+ "GCLOUD_PROJECT": "reader-6b7dc",
30
+ "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
31
+ },
32
+ "cwd": "${workspaceFolder}",
33
+ "program": "build/stand-alone/crawl.js",
34
+ "skipFiles": [
35
+ "<node_internals>/**"
36
+ ],
37
+ "type": "node",
38
+ "outputCapture": "std",
39
+ "preLaunchTask": "Backend:build:watch",
40
+ "killBehavior": "forceful"
41
+ },
42
+ {
43
+ "name": "Debug Stand Alone Search",
44
  "request": "launch",
45
  "runtimeArgs": [
46
+ "--env-file=.secret.local",
 
 
47
  ],
48
+ "env": {
49
+ "GCLOUD_PROJECT": "reader-6b7dc",
50
+ "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
51
+ },
52
+ "cwd": "${workspaceFolder}",
53
+ "program": "build/stand-alone/search.js",
54
  "skipFiles": [
55
  "<node_internals>/**"
56
  ],
57
  "type": "node",
58
+ "outputCapture": "std",
59
+ "preLaunchTask": "Backend:build:watch",
60
+ "killBehavior": "forceful"
61
  },
62
  ]
63
  }
.vscode/tasks.json CHANGED
@@ -6,29 +6,18 @@
6
  "script": "build",
7
  "group": "build",
8
  "options": {
9
- "cwd": "${workspaceFolder}/backend/functions"
10
  },
11
  "problemMatcher": [],
12
  "label": "Backend:rebuild",
13
  "detail": "Backend:rebuild"
14
  },
15
- {
16
- "type": "npm",
17
- "script": "emu:reset",
18
- "group": "build",
19
- "options": {
20
- "cwd": "${workspaceFolder}/backend/functions"
21
- },
22
- "problemMatcher": [],
23
- "label": "Backend:reset-emulator",
24
- "detail": "Backend:reset-emulator"
25
- },
26
  {
27
  "type": "typescript",
28
  "options": {
29
- "cwd": "${workspaceFolder}/backend/functions"
30
  },
31
- "tsconfig": "backend/functions/tsconfig.json",
32
  "option": "watch",
33
  "isBackground": true,
34
  "problemMatcher": [
@@ -36,121 +25,6 @@
36
  ],
37
  "group": "build",
38
  "label": "Backend:build:watch"
39
- },
40
- {
41
- "type": "npm",
42
- "script": "emu:debug",
43
- "group": "none",
44
- "options": {
45
- "cwd": "${workspaceFolder}/backend/functions"
46
- },
47
- "problemMatcher": [
48
- {
49
- "base": "$tsc",
50
- "background": {
51
- "activeOnStart": false,
52
- "beginsPattern": "shutdown requested|Starting emulators",
53
- "endsPattern": "Debugger listening"
54
- }
55
- }
56
- ],
57
- "label": "Backend:start-emulator-debug",
58
- "detail": "Backend:start-emulator-debug",
59
- "dependsOn": [
60
- "Backend:build:watch"
61
- ],
62
- "isBackground": true,
63
- },
64
- {
65
- "type": "npm",
66
- "script": "dev",
67
- "options": {
68
- "cwd": "${workspaceFolder}/webapp",
69
- },
70
- "group": "build",
71
- "label": "Frontend:start:dev",
72
- "detail": "Frontend:start:dev",
73
- "isBackground": true,
74
- "problemMatcher": {
75
- "base": "$vite",
76
- "background": {
77
- "activeOnStart": true,
78
- "endsPattern": "OK",
79
- "beginsPattern": "vite"
80
- }
81
- },
82
- },
83
- {
84
- "type": "npm",
85
- "script": "dev",
86
- "options": {
87
- "cwd": "${workspaceFolder}/webapp",
88
- "env": {
89
- "FIREBASE_EMULATE": "true",
90
- }
91
- },
92
- "group": "build",
93
- "label": "Frontend:start:emu",
94
- "detail": "Frontend:start:emu",
95
- "isBackground": true,
96
- "problemMatcher": {
97
- "base": "$vite",
98
- "background": {
99
- "activeOnStart": true,
100
- "endsPattern": "OK",
101
- "beginsPattern": "vite"
102
- }
103
- },
104
- },
105
- {
106
- "type": "npm",
107
- "script": "emu:debug2",
108
- "group": "none",
109
- "options": {
110
- "cwd": "${workspaceFolder}/backend/functions",
111
- "env": {
112
- "https_proxy": "http://127.0.0.1:7890",
113
- "http_proxy": "http://127.0.0.1:7890",
114
- "all_proxy": "socks5://127.0.0.1:7890"
115
- }
116
- },
117
- "problemMatcher": [
118
- {
119
- "base": "$tsc",
120
- "background": {
121
- "activeOnStart": false,
122
- "beginsPattern": "shutdown requested|Starting emulators",
123
- "endsPattern": "Debugger listening"
124
- }
125
- }
126
- ],
127
- "label": "Backend:start-emulator-debug:with-proxy",
128
- "detail": "Backend:start-emulator-debug:with-proxy",
129
- "dependsOn": [
130
- "Backend:build:watch"
131
- ],
132
- "isBackground": true,
133
- },
134
- {
135
- "label": "Fullstack:prepare",
136
- "dependsOn": [
137
- "Frontend:start:emu",
138
- "Backend:build:watch",
139
- ],
140
- },
141
- {
142
- "label": "Fullstack:debug",
143
- "dependsOn": [
144
- // "Frontend:start:emu",
145
- "Backend:start-emulator-debug",
146
- ],
147
- },
148
- {
149
- "label": "Fullstack:debug:with-proxy",
150
- "dependsOn": [
151
- "Frontend:start:emu",
152
- "Backend:start-emulator-debug:with-proxy",
153
- ],
154
  }
155
  ]
156
  }
 
6
  "script": "build",
7
  "group": "build",
8
  "options": {
9
+ "cwd": "${workspaceFolder}"
10
  },
11
  "problemMatcher": [],
12
  "label": "Backend:rebuild",
13
  "detail": "Backend:rebuild"
14
  },
 
 
 
 
 
 
 
 
 
 
 
15
  {
16
  "type": "typescript",
17
  "options": {
18
+ "cwd": "${workspaceFolder}"
19
  },
20
+ "tsconfig": "tsconfig.json",
21
  "option": "watch",
22
  "isBackground": true,
23
  "problemMatcher": [
 
25
  ],
26
  "group": "build",
27
  "label": "Backend:build:watch"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  }
29
  ]
30
  }
backend/functions/Dockerfile → Dockerfile RENAMED
File without changes
README.md CHANGED
@@ -158,13 +158,9 @@ curl -H "X-With-Generated-Alt: true" https://r.jina.ai/https://en.m.wikipedia.or
158
 
159
  You will need the following tools to run the project:
160
  - Node v18 (The build fails for Node version >18)
161
- - Firebase CLI (`npm install -g firebase-tools`)
162
-
163
- For backend, go to the `backend/functions` directory and install the npm dependencies.
164
 
165
  ```bash
166
  git clone git@github.com:jina-ai/reader.git
167
- cd backend/functions
168
  npm install
169
  ```
170
 
 
158
 
159
  You will need the following tools to run the project:
160
  - Node v18 (The build fails for Node version >18)
 
 
 
161
 
162
  ```bash
163
  git clone git@github.com:jina-ai/reader.git
 
164
  npm install
165
  ```
166
 
backend/.firebaserc DELETED
@@ -1,5 +0,0 @@
1
- {
2
- "projects": {
3
- "default": "reader-6b7dc"
4
- }
5
- }
 
 
 
 
 
 
backend/.gitignore DELETED
@@ -1,79 +0,0 @@
1
- # Logs
2
- logs
3
- *.log
4
- npm-debug.log*
5
- yarn-debug.log*
6
- yarn-error.log*
7
- firebase-debug.log*
8
- firebase-debug.*.log*
9
-
10
- # Firebase cache
11
- .firebase/
12
-
13
- # Firebase config
14
-
15
- # Uncomment this if you'd like others to create their own Firebase project.
16
- # For a team working on the same Firebase project(s), it is recommended to leave
17
- # it commented so all members can deploy to the same project(s) in .firebaserc.
18
- # .firebaserc
19
-
20
- # Runtime data
21
- pids
22
- *.pid
23
- *.seed
24
- *.pid.lock
25
-
26
- # Directory for instrumented libs generated by jscoverage/JSCover
27
- lib-cov
28
-
29
- # Coverage directory used by tools like istanbul
30
- coverage
31
-
32
- # nyc test coverage
33
- .nyc_output
34
-
35
- # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
36
- .grunt
37
-
38
- # Bower dependency directory (https://bower.io/)
39
- bower_components
40
-
41
- # node-waf configuration
42
- .lock-wscript
43
-
44
- # Compiled binary addons (http://nodejs.org/api/addons.html)
45
- build/Release
46
-
47
- # Dependency directories
48
- node_modules/
49
-
50
- # Optional npm cache directory
51
- .npm
52
-
53
- # Optional eslint cache
54
- .eslintcache
55
-
56
- # Optional REPL history
57
- .node_repl_history
58
-
59
- # Output of 'npm pack'
60
- *.tgz
61
-
62
- # Yarn Integrity file
63
- .yarn-integrity
64
-
65
- # dotenv environment variables file
66
- .env
67
- .secret.local
68
-
69
- toy*.ts
70
-
71
- .DS_Store
72
- build/
73
- .firebase-emu/
74
- *.log
75
- .DS_Store
76
-
77
- *.local
78
- .secret.*
79
- licensed/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/firebase.json DELETED
@@ -1,43 +0,0 @@
1
- {
2
- "firestore": {
3
- "rules": "firestore.rules",
4
- "indexes": "firestore.indexes.json"
5
- },
6
- "functions": [
7
- {
8
- "source": "functions",
9
- "codebase": "default",
10
- "ignore": [
11
- "node_modules",
12
- "src",
13
- ".git",
14
- "*.log",
15
- "*.local",
16
- ".secret.*",
17
- ".firebase-emu"
18
- ],
19
- "predeploy": [
20
- "npm --prefix \"$RESOURCE_DIR\" run build:clean",
21
- "npm --prefix \"$RESOURCE_DIR\" run build"
22
- ]
23
- }
24
- ],
25
- "storage": {
26
- "rules": "storage.rules"
27
- },
28
- "emulators": {
29
- "ui": {
30
- "enabled": true
31
- },
32
- "singleProjectMode": true,
33
- "functions": {
34
- "port": 5001
35
- },
36
- "firestore": {
37
- "port": 9098
38
- },
39
- "storage": {
40
- "port": 9097
41
- }
42
- }
43
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/firestore.indexes.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "indexes": [
3
- {
4
- "collectionGroup": "prompts",
5
- "queryScope": "COLLECTION_GROUP",
6
- "fields": [
7
- {
8
- "fieldPath": "id",
9
- "order": "ASCENDING"
10
- },
11
- {
12
- "fieldPath": "isPublic",
13
- "order": "ASCENDING"
14
- }
15
- ]
16
- }
17
- ],
18
- "fieldOverrides": []
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/firestore.rules DELETED
@@ -1,32 +0,0 @@
1
- rules_version = '2';
2
- service cloud.firestore {
3
- match /databases/{database}/documents {
4
- // match /questions/{document=**} {
5
- // allow read: if request.auth != null
6
- // }
7
-
8
- // match /answers/{userId}/profiles/default {
9
- // allow read, write: if request.auth != null && request.auth.uid == userId
10
- // }
11
-
12
- match /credits/{userId}/{document=**} {
13
- allow read: if request.auth != null && request.auth.uid == userId
14
- }
15
-
16
- match /users/{userId}/prompts/{document=**} {
17
- allow read: if request.auth != null && request.auth.uid == userId
18
- }
19
-
20
- // match /users/{userId}/profiles/{document=**} {
21
- // allow read: if request.auth != null && request.auth.uid == userId
22
- // }
23
-
24
- match /users/{userId}/creditHistory/{document=**} {
25
- allow read: if request.auth != null && request.auth.uid == userId
26
- }
27
-
28
- match /{document=**} {
29
- allow read, write: if false;
30
- }
31
- }
32
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/functions/.dockerignore DELETED
@@ -1 +0,0 @@
1
- node_modules/
 
 
backend/functions/.editorconfig DELETED
@@ -1,36 +0,0 @@
1
- root = true
2
-
3
- [*]
4
- end_of_line = lf
5
- charset = utf-8
6
- indent_style = space
7
- insert_final_newline = true
8
- trim_trailing_whitespace = true
9
- indent_size = 4
10
- quote_type = single
11
- max_line_length = 120
12
-
13
- [*.py]
14
- indent_size = 4
15
-
16
- [*.ts]
17
- indent_size = 4
18
-
19
- [*.js]
20
- indent_size = 2
21
-
22
- [*.vue]
23
- indent_size = 2
24
-
25
- [*.*sx]
26
- indent_size = 2
27
-
28
- [*.*ml]
29
- indent_size = 2
30
-
31
- [*.json]
32
- indent_size = 2
33
-
34
- [*.md]
35
- indent_size = 2
36
- trim_trailing_whitespace = false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/functions/.env.example DELETED
File without changes
backend/functions/.puppeteerrc.cjs DELETED
@@ -1,9 +0,0 @@
1
- const { join } = require('path');
2
-
3
- /**
4
- * @type {import("puppeteer").Configuration}
5
- */
6
- module.exports = {
7
- // Changes the cache location for Puppeteer.
8
- cacheDirectory: join(__dirname, 'node_modules', 'puppeteer', 'walk-around-lame-gcp-build'),
9
- };
 
 
 
 
 
 
 
 
 
 
backend/functions/package.json DELETED
@@ -1,93 +0,0 @@
1
- {
2
- "name": "reader",
3
- "scripts": {
4
- "lint": "eslint --ext .js,.ts .",
5
- "build": "node ./integrity-check.cjs && tsc -p .",
6
- "build:watch": "tsc --watch",
7
- "build:clean": "rm -rf ./build",
8
- "shell": "npm run build && firebase functions:shell",
9
- "emu:stage": "cd .. && tar -czvf firebase-emu-preset.tgz .firebase-emu",
10
- "emu:reset": "rm -rf ../.firebase-emu && tar -xzf ../firebase-emu-preset.tgz --directory ../",
11
- "emu:start": "firebase emulators:start --import ../.firebase-emu --export-on-exit",
12
- "emu:debug": "firebase emulators:start --import ../.firebase-emu --export-on-exit --inspect-functions",
13
- "emu:debug2": "firebase emulators:start --import ../.firebase-emu --export-on-exit --inspect-functions",
14
- "emu:kill": "killall java",
15
- "serve": "npm run build && npm run emu:start",
16
- "debug": "npm run build && npm run emu:start -- --inspect-functions",
17
- "from-scratch": "npm run build && rm -rf ../.firebase-emu && firebase emulators:start --export-on-exit",
18
- "from-preset": "npm run build && npm run emu:reset && npm run emu:start",
19
- "start": "npm run shell",
20
- "deploy": "firebase deploy --only functions",
21
- "logs": "firebase functions:log",
22
- "gcp-build": "node node_modules/puppeteer/install.mjs"
23
- },
24
- "engines": {
25
- "node": "20"
26
- },
27
- "main": "build/index.js",
28
- "dependencies": {
29
- "@esm2cjs/normalize-url": "^8.0.0",
30
- "@google-cloud/translate": "^8.2.0",
31
- "@mozilla/readability": "^0.5.0",
32
- "@napi-rs/canvas": "^0.1.67",
33
- "@types/turndown": "^5.0.4",
34
- "@xmldom/xmldom": "^0.9.3",
35
- "archiver": "^6.0.1",
36
- "axios": "^1.3.3",
37
- "bcrypt": "^5.1.0",
38
- "busboy": "^1.6.0",
39
- "civkit": "^0.8.3-3e69606",
40
- "core-js": "^3.37.1",
41
- "cors": "^2.8.5",
42
- "dayjs": "^1.11.9",
43
- "express": "^4.19.2",
44
- "firebase-admin": "^12.1.0",
45
- "firebase-functions": "^6.1.1",
46
- "htmlparser2": "^9.0.0",
47
- "jose": "^5.1.0",
48
- "langdetect": "^0.2.1",
49
- "linkedom": "^0.18.4",
50
- "maxmind": "^4.3.18",
51
- "minio": "^7.1.3",
52
- "node-libcurl": "^4.1.0",
53
- "openai": "^4.20.0",
54
- "pdfjs-dist": "^4.2.67",
55
- "puppeteer": "^23.3.0",
56
- "puppeteer-extra": "^3.3.6",
57
- "puppeteer-extra-plugin-block-resources": "^2.4.3",
58
- "puppeteer-extra-plugin-page-proxy": "^1.3.1",
59
- "puppeteer-page-proxy": "^1.3.0",
60
- "robots-parser": "^3.0.1",
61
- "set-cookie-parser": "^2.6.0",
62
- "simple-zstd": "^1.4.2",
63
- "stripe": "^11.11.0",
64
- "tiktoken": "^1.0.16",
65
- "tld-extract": "^2.1.0",
66
- "turndown": "^7.1.3",
67
- "turndown-plugin-gfm": "^1.0.2",
68
- "undici": "^5.24.0"
69
- },
70
- "devDependencies": {
71
- "@types/archiver": "^5.3.4",
72
- "@types/bcrypt": "^5.0.0",
73
- "@types/busboy": "^1.5.4",
74
- "@types/cors": "^2.8.17",
75
- "@types/generic-pool": "^3.8.1",
76
- "@types/node": "^20.14.13",
77
- "@types/set-cookie-parser": "^2.4.7",
78
- "@types/xmldom": "^0.1.34",
79
- "@typescript-eslint/eslint-plugin": "^5.12.0",
80
- "@typescript-eslint/parser": "^5.12.0",
81
- "eslint": "^8.9.0",
82
- "eslint-config-google": "^0.14.0",
83
- "eslint-plugin-import": "^2.25.4",
84
- "firebase-functions-test": "^3.0.0",
85
- "pino-pretty": "^13.0.0",
86
- "replicate": "^0.16.1",
87
- "typescript": "^5.5.4"
88
- },
89
- "private": true,
90
- "exports": {
91
- ".": "./build/index.js"
92
- }
93
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/functions/src/services/curl.ts DELETED
@@ -1,218 +0,0 @@
1
- import { marshalErrorLike } from 'civkit/lang';
2
- import { AsyncService } from 'civkit/async-service';
3
- import { singleton } from 'tsyringe';
4
-
5
- import { Curl, CurlFeature, HeaderInfo } from 'node-libcurl';
6
- import { PageSnapshot, ScrappingOptions } from './puppeteer';
7
- import { Logger } from '../shared/services/logger';
8
- import { JSDomControl } from './jsdom';
9
- import { AssertionFailureError, FancyFile } from 'civkit';
10
- import { TempFileManager } from '../shared';
11
- import { readFile } from 'fs/promises';
12
- import { pathToFileURL } from 'url';
13
- import { createBrotliDecompress, createInflate, createGunzip } from 'zlib';
14
- import { ZSTDDecompress } from 'simple-zstd';
15
-
16
- @singleton()
17
- export class CurlControl extends AsyncService {
18
-
19
- logger = this.globalLogger.child({ service: this.constructor.name });
20
-
21
- constructor(
22
- protected globalLogger: Logger,
23
- protected jsdomControl: JSDomControl,
24
- protected tempFileManager: TempFileManager,
25
- ) {
26
- super(...arguments);
27
- }
28
-
29
- override async init() {
30
- await this.dependencyReady();
31
-
32
- this.emit('ready');
33
- }
34
-
35
- curlImpersonateHeader(curl: Curl, headers?: object, chromeVersion: number = 132) {
36
- const mixinHeaders = {
37
- 'sch-ch-ua': `Not A(Brand";v="8", "Chromium";v="${chromeVersion}", "Google Chrome";v="${chromeVersion}"`,
38
- 'sec-ch-ua-mobile': '?0',
39
- 'sec-ch-ua-platform': 'Windows',
40
- 'Upgrade-Insecure-Requests': '1',
41
- 'User-Agent': `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${chromeVersion}.0.0.0 Safari/537.36`,
42
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
43
- 'Sec-Fetch-Site': 'none',
44
- 'Sec-Fetch-Mode': 'navigate',
45
- 'Sec-Fetch-User': '?1',
46
- 'Sec-Fetch-Dest': 'document',
47
- 'Accept-Encoding': 'gzip, deflate, br, zstd',
48
- 'Accept-Language': 'en-US,en;q=0.9',
49
- };
50
-
51
- curl.setOpt(Curl.option.HTTPHEADER, Object.entries({ ...mixinHeaders, ...headers }).map(([k, v]) => `${k}: ${v}`));
52
-
53
- return curl;
54
- }
55
-
56
- async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions, throwOnNon200 = false): Promise<PageSnapshot> {
57
- const snapshot = {
58
- href: urlToCrawl.toString(),
59
- html: '',
60
- title: '',
61
- text: '',
62
- } as PageSnapshot;
63
-
64
- let contentType = '';
65
- const result = await new Promise<{
66
- statusCode: number,
67
- data?: FancyFile,
68
- headers: Buffer | HeaderInfo[],
69
- }>((resolve, reject) => {
70
- const curl = new Curl();
71
- curl.enable(CurlFeature.StreamResponse);
72
- curl.setOpt('URL', urlToCrawl.toString());
73
- curl.setOpt(Curl.option.FOLLOWLOCATION, true);
74
-
75
- curl.setOpt(Curl.option.TIMEOUT_MS, Math.min(10_000, crawlOpts?.timeoutMs || 10_000));
76
-
77
- if (crawlOpts?.overrideUserAgent) {
78
- curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent);
79
- }
80
-
81
- this.curlImpersonateHeader(curl, crawlOpts?.extraHeaders);
82
- // if (crawlOpts?.extraHeaders) {
83
- // curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`));
84
- // }
85
- if (crawlOpts?.proxyUrl) {
86
- curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl);
87
- }
88
- if (crawlOpts?.cookies?.length) {
89
- const cookieChunks = crawlOpts.cookies.map((cookie) => `${cookie.name}=${cookie.value}`);
90
- curl.setOpt(Curl.option.COOKIE, cookieChunks.join('; '));
91
- }
92
- if (crawlOpts?.referer) {
93
- curl.setOpt(Curl.option.REFERER, crawlOpts.referer);
94
- }
95
-
96
- curl.on('end', (statusCode, _data, headers) => {
97
- this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl}`, { statusCode, headers });
98
- curl.close();
99
- });
100
-
101
- curl.on('error', (err) => {
102
- curl.close();
103
- this.logger.warn(`Curl ${urlToCrawl}: ${err} (Not necessarily an error)`, { err: marshalErrorLike(err) });
104
- reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: ${err.message}`));
105
- });
106
- curl.setOpt(Curl.option.MAXFILESIZE, 1024 * 1024 * 1024); // 1GB
107
- let status = -1;
108
- let contentEncoding = '';
109
- curl.on('stream', (stream, statusCode, headers) => {
110
- status = statusCode;
111
- const lastResHeaders = headers[headers.length - 1];
112
- for (const [k, v] of Object.entries(lastResHeaders)) {
113
- const kl = k.toLowerCase();
114
- if (kl === 'content-type') {
115
- contentType = v.toLowerCase();
116
- }
117
- if (kl === 'content-encoding') {
118
- contentEncoding = v.toLowerCase();
119
- }
120
- if (contentType && contentEncoding) {
121
- break;
122
- }
123
- }
124
-
125
- if (!contentType) {
126
- reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: no content-type`));
127
- stream.destroy();
128
- return;
129
- }
130
- if (contentType.startsWith('image/')) {
131
- snapshot.html = `<html style="height: 100%;"><head><meta name="viewport" content="width=device-width, minimum-scale=0.1"><title>${urlToCrawl.origin}${urlToCrawl.pathname}</title></head><body style="margin: 0px; height: 100%; background-color: rgb(14, 14, 14);"><img style="display: block;-webkit-user-select: none;margin: auto;background-color: hsl(0, 0%, 90%);transition: background-color 300ms;" src="${urlToCrawl.href}"></body></html>`;
132
- stream.destroy();
133
- resolve({
134
- statusCode: status,
135
- headers,
136
- });
137
- return;
138
- }
139
-
140
- switch (contentEncoding) {
141
- case 'gzip': {
142
- const decompressed = createGunzip();
143
- stream.pipe(decompressed);
144
- stream = decompressed;
145
- break;
146
- }
147
- case 'deflate': {
148
- const decompressed = createInflate();
149
- stream.pipe(decompressed);
150
- stream = decompressed;
151
- break;
152
- }
153
- case 'br': {
154
- const decompressed = createBrotliDecompress();
155
- stream.pipe(decompressed);
156
- stream = decompressed;
157
- break;
158
- }
159
- case 'zstd': {
160
- const decompressed = ZSTDDecompress();
161
- stream.pipe(decompressed);
162
- stream = decompressed;
163
- break;
164
- }
165
- default: {
166
- break;
167
- }
168
- }
169
-
170
- const fpath = this.tempFileManager.alloc();
171
- const fancyFile = FancyFile.auto(stream, fpath);
172
- this.tempFileManager.bindPathTo(fancyFile, fpath);
173
- resolve({
174
- statusCode: status,
175
- data: fancyFile,
176
- headers,
177
- });
178
- });
179
-
180
- curl.perform();
181
- });
182
-
183
- if (throwOnNon200 && result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
184
- throw new AssertionFailureError(`Failed to access ${urlToCrawl}: HTTP ${result.statusCode}`);
185
- }
186
-
187
- if (contentType === 'application/octet-stream') {
188
- // Content declared as binary is same as unknown.
189
- contentType = '';
190
- }
191
-
192
- if (result.data) {
193
- const mimeType: string = contentType || await result.data.mimeType;
194
- if (mimeType.startsWith('text/html')) {
195
- if ((await result.data.size) > 1024 * 1024 * 32) {
196
- throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
197
- }
198
- snapshot.html = await readFile(await result.data.filePath, { encoding: 'utf-8' });
199
- } else if (mimeType.startsWith('text/') || mimeType.startsWith('application/json')) {
200
- if ((await result.data.size) > 1024 * 1024 * 32) {
201
- throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
202
- }
203
- snapshot.text = await readFile(await result.data.filePath, { encoding: 'utf-8' });
204
- snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
205
- } else if (mimeType.startsWith('application/pdf')) {
206
- snapshot.pdfs = [pathToFileURL(await result.data.filePath).href];
207
- } else {
208
- throw new AssertionFailureError(`Failed to access ${urlToCrawl}: unexpected type ${mimeType}`);
209
- }
210
- }
211
-
212
- const curlSnapshot = await this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
213
-
214
- return curlSnapshot!;
215
- }
216
-
217
-
218
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/functions/src/shared DELETED
@@ -1 +0,0 @@
1
- ../../../thinapps-shared/backend
 
 
backend/storage.rules DELETED
@@ -1,8 +0,0 @@
1
- rules_version = '2';
2
- service firebase.storage {
3
- match /b/{bucket}/o {
4
- match /{allPaths=**} {
5
- allow read, write: if false;
6
- }
7
- }
8
- }
 
 
 
 
 
 
 
 
 
backend/functions/integrity-check.cjs β†’ integrity-check.cjs RENAMED
File without changes
backend/functions/package-lock.json β†’ package-lock.json RENAMED
@@ -8,15 +8,16 @@
8
  "dependencies": {
9
  "@esm2cjs/normalize-url": "^8.0.0",
10
  "@google-cloud/translate": "^8.2.0",
 
11
  "@mozilla/readability": "^0.5.0",
12
- "@napi-rs/canvas": "^0.1.67",
13
  "@types/turndown": "^5.0.4",
14
  "@xmldom/xmldom": "^0.9.3",
15
  "archiver": "^6.0.1",
16
  "axios": "^1.3.3",
17
  "bcrypt": "^5.1.0",
18
  "busboy": "^1.6.0",
19
- "civkit": "^0.8.3-3e69606",
20
  "core-js": "^3.37.1",
21
  "cors": "^2.8.5",
22
  "dayjs": "^1.11.9",
@@ -31,7 +32,7 @@
31
  "minio": "^7.1.3",
32
  "node-libcurl": "^4.1.0",
33
  "openai": "^4.20.0",
34
- "pdfjs-dist": "^4.2.67",
35
  "puppeteer": "^23.3.0",
36
  "puppeteer-extra": "^3.3.6",
37
  "puppeteer-extra-plugin-block-resources": "^2.4.3",
@@ -53,6 +54,7 @@
53
  "@types/busboy": "^1.5.4",
54
  "@types/cors": "^2.8.17",
55
  "@types/generic-pool": "^3.8.1",
 
56
  "@types/node": "^20.14.13",
57
  "@types/set-cookie-parser": "^2.4.7",
58
  "@types/xmldom": "^0.1.34",
@@ -62,6 +64,7 @@
62
  "eslint-config-google": "^0.14.0",
63
  "eslint-plugin-import": "^2.25.4",
64
  "firebase-functions-test": "^3.0.0",
 
65
  "pino-pretty": "^13.0.0",
66
  "replicate": "^0.16.1",
67
  "typescript": "^5.5.4"
@@ -1626,6 +1629,23 @@
1626
  "url": "https://opencollective.com/js-sdsl"
1627
  }
1628
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1629
  "node_modules/@koa/router": {
1630
  "version": "12.0.1",
1631
  "resolved": "https://registry.npmjs.org/@koa/router/-/router-12.0.1.tgz",
@@ -1679,30 +1699,30 @@
1679
  }
1680
  },
1681
  "node_modules/@napi-rs/canvas": {
1682
- "version": "0.1.67",
1683
- "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.67.tgz",
1684
- "integrity": "sha512-VA4Khm/5Kg2bQGx3jXotTC4MloOG8b1Ung80exafUK0k5u6yJmIz3Q2iXeeWZs5weV+LQOEB+CPKsYwEYaGAjw==",
1685
  "license": "MIT",
1686
  "engines": {
1687
  "node": ">= 10"
1688
  },
1689
  "optionalDependencies": {
1690
- "@napi-rs/canvas-android-arm64": "0.1.67",
1691
- "@napi-rs/canvas-darwin-arm64": "0.1.67",
1692
- "@napi-rs/canvas-darwin-x64": "0.1.67",
1693
- "@napi-rs/canvas-linux-arm-gnueabihf": "0.1.67",
1694
- "@napi-rs/canvas-linux-arm64-gnu": "0.1.67",
1695
- "@napi-rs/canvas-linux-arm64-musl": "0.1.67",
1696
- "@napi-rs/canvas-linux-riscv64-gnu": "0.1.67",
1697
- "@napi-rs/canvas-linux-x64-gnu": "0.1.67",
1698
- "@napi-rs/canvas-linux-x64-musl": "0.1.67",
1699
- "@napi-rs/canvas-win32-x64-msvc": "0.1.67"
1700
  }
1701
  },
1702
  "node_modules/@napi-rs/canvas-android-arm64": {
1703
- "version": "0.1.67",
1704
- "resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.67.tgz",
1705
- "integrity": "sha512-W+3DFG5h0WU8Vqqb3W5fNmm5/TPH5ECZRinQDK4CAKFSUkc4iZcDwrmyFG9sB4KdHazf1mFVHCpEeVMO6Mk6Zg==",
1706
  "cpu": [
1707
  "arm64"
1708
  ],
@@ -1716,9 +1736,9 @@
1716
  }
1717
  },
1718
  "node_modules/@napi-rs/canvas-darwin-arm64": {
1719
- "version": "0.1.67",
1720
- "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.67.tgz",
1721
- "integrity": "sha512-xzrv7QboI47yhIHR5P5u/9KGswokuOKLiKSukr1Ku03RRJxP6lGuVtrAZAgdRg7F9FsuF2REf2yK53YVb6pMlA==",
1722
  "cpu": [
1723
  "arm64"
1724
  ],
@@ -1732,9 +1752,9 @@
1732
  }
1733
  },
1734
  "node_modules/@napi-rs/canvas-darwin-x64": {
1735
- "version": "0.1.67",
1736
- "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.67.tgz",
1737
- "integrity": "sha512-SNk9lYBr84N0gW8MZ2IrjygFtbFBILr3SEqMdHzHHuph20SQmssFvJGPZwSSCMEyKAvyqhogbmlew0te5Z4w9Q==",
1738
  "cpu": [
1739
  "x64"
1740
  ],
@@ -1748,9 +1768,9 @@
1748
  }
1749
  },
1750
  "node_modules/@napi-rs/canvas-linux-arm-gnueabihf": {
1751
- "version": "0.1.67",
1752
- "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.67.tgz",
1753
- "integrity": "sha512-qmBlSvUpl567bzH8tNXi82u5FrL4d0qINqd6K9O7GWGGGFmKMJdrgi2/SW3wwCTxqHBasIDdVWc4KSJfwyaoDQ==",
1754
  "cpu": [
1755
  "arm"
1756
  ],
@@ -1764,9 +1784,9 @@
1764
  }
1765
  },
1766
  "node_modules/@napi-rs/canvas-linux-arm64-gnu": {
1767
- "version": "0.1.67",
1768
- "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.67.tgz",
1769
- "integrity": "sha512-k3nAPQefkMeFuJ65Rqdnx92KX1JXQhEKjjWeKsCJB+7sIBgQUWtHo9c3etfVLv5pkWJJDFi/Zc2soNkH3E8dRA==",
1770
  "cpu": [
1771
  "arm64"
1772
  ],
@@ -1780,9 +1800,9 @@
1780
  }
1781
  },
1782
  "node_modules/@napi-rs/canvas-linux-arm64-musl": {
1783
- "version": "0.1.67",
1784
- "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.67.tgz",
1785
- "integrity": "sha512-lZwHWR1cCP408l86n3Qbs3X1oFeAYMjJIQvQl1VMZh6wo5PfI+jaZSKBUOd8x44TnVllX9yhLY9unNRztk/sUQ==",
1786
  "cpu": [
1787
  "arm64"
1788
  ],
@@ -1796,9 +1816,9 @@
1796
  }
1797
  },
1798
  "node_modules/@napi-rs/canvas-linux-riscv64-gnu": {
1799
- "version": "0.1.67",
1800
- "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.67.tgz",
1801
- "integrity": "sha512-PdBC9p6bLHA1W3OdA0vTHj701SB/kioGQ1uCFBRMs5KBCaMLb/H4aNi8uaIUIEvBWnxeAjoNcLU7//q0FxEosw==",
1802
  "cpu": [
1803
  "riscv64"
1804
  ],
@@ -1812,9 +1832,9 @@
1812
  }
1813
  },
1814
  "node_modules/@napi-rs/canvas-linux-x64-gnu": {
1815
- "version": "0.1.67",
1816
- "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.67.tgz",
1817
- "integrity": "sha512-kJJX6eWzjipL/LdKOWCJctc88e5yzuXri8+s0V/lN06OwuLGW62TWS3lvi8qlUrGMOfRGabSWWlB4omhASSB8w==",
1818
  "cpu": [
1819
  "x64"
1820
  ],
@@ -1828,9 +1848,9 @@
1828
  }
1829
  },
1830
  "node_modules/@napi-rs/canvas-linux-x64-musl": {
1831
- "version": "0.1.67",
1832
- "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.67.tgz",
1833
- "integrity": "sha512-jLKiPWGeN6ZzhnaLG7ex7eexsiHJ1mdtPK1qKvETIcu45dApMXyUIHvdL6XWB5gFFtj5ScHzLUxv1vkfPZsoxA==",
1834
  "cpu": [
1835
  "x64"
1836
  ],
@@ -1844,9 +1864,9 @@
1844
  }
1845
  },
1846
  "node_modules/@napi-rs/canvas-win32-x64-msvc": {
1847
- "version": "0.1.67",
1848
- "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.67.tgz",
1849
- "integrity": "sha512-K/JmkOFbc4iRZYUqJhj0jwqfHA/wNQEmTiGNsgZ6d59yF/IBNp5T0D5eg3B8ghjI8GxDYCiSJ6DNX8mC3Oh2EQ==",
1850
  "cpu": [
1851
  "x64"
1852
  ],
@@ -2238,6 +2258,16 @@
2238
  "resolved": "https://registry.npmjs.org/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz",
2239
  "integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA=="
2240
  },
 
 
 
 
 
 
 
 
 
 
2241
  "node_modules/@types/archiver": {
2242
  "version": "5.3.4",
2243
  "resolved": "https://registry.npmjs.org/@types/archiver/-/archiver-5.3.4.tgz",
@@ -2344,6 +2374,26 @@
2344
  "@types/node": "*"
2345
  }
2346
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2347
  "node_modules/@types/cors": {
2348
  "version": "2.8.17",
2349
  "resolved": "https://registry.npmjs.org/@types/cors/-/cors-2.8.17.tgz",
@@ -2403,6 +2453,13 @@
2403
  "@types/node": "*"
2404
  }
2405
  },
 
 
 
 
 
 
 
2406
  "node_modules/@types/http-cache-semantics": {
2407
  "version": "4.0.4",
2408
  "resolved": "https://registry.npmjs.org/@types/http-cache-semantics/-/http-cache-semantics-4.0.4.tgz",
@@ -2460,6 +2517,13 @@
2460
  "@types/node": "*"
2461
  }
2462
  },
 
 
 
 
 
 
 
2463
  "node_modules/@types/keyv": {
2464
  "version": "3.1.4",
2465
  "resolved": "https://registry.npmjs.org/@types/keyv/-/keyv-3.1.4.tgz",
@@ -2468,6 +2532,33 @@
2468
  "@types/node": "*"
2469
  }
2470
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2471
  "node_modules/@types/lodash": {
2472
  "version": "4.17.0",
2473
  "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.0.tgz",
@@ -3836,7 +3927,6 @@
3836
  "version": "1.0.1",
3837
  "resolved": "https://registry.npmjs.org/cache-content-type/-/cache-content-type-1.0.1.tgz",
3838
  "integrity": "sha512-IKufZ1o4Ut42YUrZSo8+qnMTrFuKkvyoLXUywKz9GJ5BrhOFGhLdkx9sG4KAnVvbY6kEcSFjLQul+DVmBm2bgA==",
3839
- "optional": true,
3840
  "dependencies": {
3841
  "mime-types": "^2.1.18",
3842
  "ylru": "^1.2.0"
@@ -4005,9 +4095,10 @@
4005
  }
4006
  },
4007
  "node_modules/civkit": {
4008
- "version": "0.8.3-3e69606",
4009
- "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.3-3e69606.tgz",
4010
- "integrity": "sha512-niV5U11ySIiVNSnGpW49KJlExmIiuQQfnyQEXeYuKCE+B+wkqYCBG+3tlY3E882tmPkaQQKpDlF/yTeqEU2q2Q==",
 
4011
  "dependencies": {
4012
  "lodash": "^4.17.21",
4013
  "tslib": "^2.5.0"
@@ -4138,7 +4229,6 @@
4138
  "version": "4.6.0",
4139
  "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
4140
  "integrity": "sha512-QVb0dM5HvG+uaxitm8wONl7jltx8dqhfU33DcqtOZcLSVIKSDDLDi7+0LbAKiyI8hD9u42m2YxXSkMGWThaecQ==",
4141
- "devOptional": true,
4142
  "engines": {
4143
  "iojs": ">= 1.0.0",
4144
  "node": ">= 0.12.0"
@@ -4148,7 +4238,6 @@
4148
  "version": "6.1.0",
4149
  "resolved": "https://registry.npmjs.org/co-body/-/co-body-6.1.0.tgz",
4150
  "integrity": "sha512-m7pOT6CdLN7FuXUcpuz/8lfQ/L77x8SchHCF4G0RBTJO20Wzmhn5Sp4/5WsKy8OSpifBSUrmg83qEqaDHdyFuQ==",
4151
- "optional": true,
4152
  "dependencies": {
4153
  "inflation": "^2.0.0",
4154
  "qs": "^6.5.2",
@@ -4273,7 +4362,6 @@
4273
  "version": "0.9.1",
4274
  "resolved": "https://registry.npmjs.org/cookies/-/cookies-0.9.1.tgz",
4275
  "integrity": "sha512-TG2hpqe4ELx54QER/S3HQ9SRVnQnGBtKUz5bLQWtYAQ+o6GpgMs6sYUvaiJjVxb+UXwhRhAEP3m7LbsIZ77Hmw==",
4276
- "optional": true,
4277
  "dependencies": {
4278
  "depd": "~2.0.0",
4279
  "keygrip": "~1.1.0"
@@ -4582,8 +4670,7 @@
4582
  "node_modules/deep-equal": {
4583
  "version": "1.0.1",
4584
  "resolved": "https://registry.npmjs.org/deep-equal/-/deep-equal-1.0.1.tgz",
4585
- "integrity": "sha512-bHtC0iYvWhyaTzvV3CZgPeZQqCOBGyGsVV7v4eevpdkLHfiSrXUdBG+qAuSz4RI70sszvjQ1QSZ98An1yNwpSw==",
4586
- "optional": true
4587
  },
4588
  "node_modules/deep-extend": {
4589
  "version": "0.6.0",
@@ -6701,7 +6788,6 @@
6701
  "version": "1.5.0",
6702
  "resolved": "https://registry.npmjs.org/http-assert/-/http-assert-1.5.0.tgz",
6703
  "integrity": "sha512-uPpH7OKX4H25hBmU6G1jWNaqJGpTXxey+YOUizJUAgu0AjLUeC8D73hTrhvDS5D+GJN1DN1+hhc/eF/wpxtp0w==",
6704
- "optional": true,
6705
  "dependencies": {
6706
  "deep-equal": "~1.0.1",
6707
  "http-errors": "~1.8.0"
@@ -6714,7 +6800,6 @@
6714
  "version": "1.1.2",
6715
  "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz",
6716
  "integrity": "sha512-7emPTl6Dpo6JRXOXjLRxck+FlLRX5847cLKEn00PLAgc3g2hTZZgr+e4c2v6QpSmLeFP3n5yUo7ft6avBK/5jQ==",
6717
- "optional": true,
6718
  "engines": {
6719
  "node": ">= 0.6"
6720
  }
@@ -6723,7 +6808,6 @@
6723
  "version": "1.8.1",
6724
  "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.8.1.tgz",
6725
  "integrity": "sha512-Kpk9Sm7NmI+RHhnj6OIWDI1d6fIoFAtFt9RLaTMRlg/8w49juAStsrBgp0Dp4OdxdVbRIeKhtCUvoi/RuAhO4g==",
6726
- "optional": true,
6727
  "dependencies": {
6728
  "depd": "~1.1.2",
6729
  "inherits": "2.0.4",
@@ -6739,7 +6823,6 @@
6739
  "version": "1.5.0",
6740
  "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz",
6741
  "integrity": "sha512-OpZ3zP+jT1PI7I8nemJX4AKmAX070ZkYPVWV/AaKTJl+tXCTGyVdC1a4SL8RUQYEwk/f34ZX8UTykN68FwrqAA==",
6742
- "optional": true,
6743
  "engines": {
6744
  "node": ">= 0.6"
6745
  }
@@ -6940,7 +7023,6 @@
6940
  "version": "2.1.0",
6941
  "resolved": "https://registry.npmjs.org/inflation/-/inflation-2.1.0.tgz",
6942
  "integrity": "sha512-t54PPJHG1Pp7VQvxyVCJ9mBbjG3Hqryges9bXoOO6GExCPa+//i/d5GSuFtpx3ALLd7lgIAur6zrIlBQyJuMlQ==",
6943
- "optional": true,
6944
  "engines": {
6945
  "node": ">= 0.8.0"
6946
  }
@@ -8316,7 +8398,6 @@
8316
  "version": "1.1.0",
8317
  "resolved": "https://registry.npmjs.org/keygrip/-/keygrip-1.1.0.tgz",
8318
  "integrity": "sha512-iYSchDJ+liQ8iwbSI2QqsQOvqv58eJCEanyJPJi+Khyu8smkcKSFUCbPwzFcL7YVtZ6eONjqRX/38caJ7QjRAQ==",
8319
- "optional": true,
8320
  "dependencies": {
8321
  "tsscmp": "1.0.6"
8322
  },
@@ -8354,10 +8435,10 @@
8354
  }
8355
  },
8356
  "node_modules/koa": {
8357
- "version": "2.15.3",
8358
- "resolved": "https://registry.npmjs.org/koa/-/koa-2.15.3.tgz",
8359
- "integrity": "sha512-j/8tY9j5t+GVMLeioLaxweJiKUayFhlGqNTzf2ZGwL0ZCQijd2RLHK0SLW5Tsko8YyyqCZC2cojIb0/s62qTAg==",
8360
- "optional": true,
8361
  "dependencies": {
8362
  "accepts": "^1.3.5",
8363
  "cache-content-type": "^1.0.0",
@@ -8404,14 +8485,12 @@
8404
  "node_modules/koa-compose": {
8405
  "version": "4.1.0",
8406
  "resolved": "https://registry.npmjs.org/koa-compose/-/koa-compose-4.1.0.tgz",
8407
- "integrity": "sha512-8ODW8TrDuMYvXRwra/Kh7/rJo9BtOfPc6qO8eAfC80CnCvSjSl0bkRM24X6/XBBEyj0v1nRUQ1LyOy3dbqOWXw==",
8408
- "optional": true
8409
  },
8410
  "node_modules/koa-convert": {
8411
  "version": "2.0.0",
8412
  "resolved": "https://registry.npmjs.org/koa-convert/-/koa-convert-2.0.0.tgz",
8413
  "integrity": "sha512-asOvN6bFlSnxewce2e/DK3p4tltyfC4VM7ZwuTuepI7dEQVcvpyFuBcEARu1+Hxg8DIwytce2n7jrZtRlPrARA==",
8414
- "optional": true,
8415
  "dependencies": {
8416
  "co": "^4.6.0",
8417
  "koa-compose": "^4.1.0"
@@ -8424,7 +8503,6 @@
8424
  "version": "1.8.1",
8425
  "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.8.1.tgz",
8426
  "integrity": "sha512-Kpk9Sm7NmI+RHhnj6OIWDI1d6fIoFAtFt9RLaTMRlg/8w49juAStsrBgp0Dp4OdxdVbRIeKhtCUvoi/RuAhO4g==",
8427
- "optional": true,
8428
  "dependencies": {
8429
  "depd": "~1.1.2",
8430
  "inherits": "2.0.4",
@@ -8440,7 +8518,6 @@
8440
  "version": "1.1.2",
8441
  "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz",
8442
  "integrity": "sha512-7emPTl6Dpo6JRXOXjLRxck+FlLRX5847cLKEn00PLAgc3g2hTZZgr+e4c2v6QpSmLeFP3n5yUo7ft6avBK/5jQ==",
8443
- "optional": true,
8444
  "engines": {
8445
  "node": ">= 0.6"
8446
  }
@@ -8449,7 +8526,6 @@
8449
  "version": "1.5.0",
8450
  "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz",
8451
  "integrity": "sha512-OpZ3zP+jT1PI7I8nemJX4AKmAX070ZkYPVWV/AaKTJl+tXCTGyVdC1a4SL8RUQYEwk/f34ZX8UTykN68FwrqAA==",
8452
- "optional": true,
8453
  "engines": {
8454
  "node": ">= 0.6"
8455
  }
@@ -8644,8 +8720,7 @@
8644
  "node_modules/lodash.merge": {
8645
  "version": "4.6.2",
8646
  "resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz",
8647
- "integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==",
8648
- "dev": true
8649
  },
8650
  "node_modules/lodash.once": {
8651
  "version": "4.1.1",
@@ -9853,8 +9928,7 @@
9853
  "node_modules/only": {
9854
  "version": "0.0.2",
9855
  "resolved": "https://registry.npmjs.org/only/-/only-0.0.2.tgz",
9856
- "integrity": "sha512-Fvw+Jemq5fjjyWz6CpKx6w9s7xxqo3+JCyM0WXWeCSOboZ8ABkyvP8ID4CZuChA/wxSx+XSJmdOm8rGVyJ1hdQ==",
9857
- "optional": true
9858
  },
9859
  "node_modules/openai": {
9860
  "version": "4.33.0",
@@ -10118,15 +10192,15 @@
10118
  }
10119
  },
10120
  "node_modules/pdfjs-dist": {
10121
- "version": "4.2.67",
10122
- "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-4.2.67.tgz",
10123
- "integrity": "sha512-rJmuBDFpD7cqC8WIkQUEClyB4UAH05K4AsyewToMTp2gSy3Rrx8c1ydAVqlJlGv3yZSOrhEERQU/4ScQQFlLHA==",
 
10124
  "engines": {
10125
- "node": ">=18"
10126
  },
10127
  "optionalDependencies": {
10128
- "canvas": "^2.11.2",
10129
- "path2d": "^0.2.0"
10130
  }
10131
  },
10132
  "node_modules/peek-stream": {
@@ -12443,7 +12517,6 @@
12443
  "version": "1.0.6",
12444
  "resolved": "https://registry.npmjs.org/tsscmp/-/tsscmp-1.0.6.tgz",
12445
  "integrity": "sha512-LxhtAkPDTkVCMQjt2h6eBVY28KCjikZqZfMcC15YBeNjkgUpdCfBu5HoiOTDu86v6smE8yOjyEktJ8hlbANHQA==",
12446
- "optional": true,
12447
  "engines": {
12448
  "node": ">=0.6.x"
12449
  }
@@ -13136,7 +13209,6 @@
13136
  "version": "1.4.0",
13137
  "resolved": "https://registry.npmjs.org/ylru/-/ylru-1.4.0.tgz",
13138
  "integrity": "sha512-2OQsPNEmBCvXuFlIni/a+Rn+R2pHW9INm0BxXJ4hVDA8TirqMj+J/Rp9ItLatT/5pZqWwefVrTQcHpixsxnVlA==",
13139
- "optional": true,
13140
  "engines": {
13141
  "node": ">= 4.0.0"
13142
  }
 
8
  "dependencies": {
9
  "@esm2cjs/normalize-url": "^8.0.0",
10
  "@google-cloud/translate": "^8.2.0",
11
+ "@koa/bodyparser": "^5.1.1",
12
  "@mozilla/readability": "^0.5.0",
13
+ "@napi-rs/canvas": "^0.1.68",
14
  "@types/turndown": "^5.0.4",
15
  "@xmldom/xmldom": "^0.9.3",
16
  "archiver": "^6.0.1",
17
  "axios": "^1.3.3",
18
  "bcrypt": "^5.1.0",
19
  "busboy": "^1.6.0",
20
+ "civkit": "^0.8.4-32482a3",
21
  "core-js": "^3.37.1",
22
  "cors": "^2.8.5",
23
  "dayjs": "^1.11.9",
 
32
  "minio": "^7.1.3",
33
  "node-libcurl": "^4.1.0",
34
  "openai": "^4.20.0",
35
+ "pdfjs-dist": "^4.10.38",
36
  "puppeteer": "^23.3.0",
37
  "puppeteer-extra": "^3.3.6",
38
  "puppeteer-extra-plugin-block-resources": "^2.4.3",
 
54
  "@types/busboy": "^1.5.4",
55
  "@types/cors": "^2.8.17",
56
  "@types/generic-pool": "^3.8.1",
57
+ "@types/koa": "^2.15.0",
58
  "@types/node": "^20.14.13",
59
  "@types/set-cookie-parser": "^2.4.7",
60
  "@types/xmldom": "^0.1.34",
 
64
  "eslint-config-google": "^0.14.0",
65
  "eslint-plugin-import": "^2.25.4",
66
  "firebase-functions-test": "^3.0.0",
67
+ "koa": "^2.16.0",
68
  "pino-pretty": "^13.0.0",
69
  "replicate": "^0.16.1",
70
  "typescript": "^5.5.4"
 
1629
  "url": "https://opencollective.com/js-sdsl"
1630
  }
1631
  },
1632
+ "node_modules/@koa/bodyparser": {
1633
+ "version": "5.1.1",
1634
+ "resolved": "https://registry.npmjs.org/@koa/bodyparser/-/bodyparser-5.1.1.tgz",
1635
+ "integrity": "sha512-ZBF49xqNVxnmJ+8iXegq+fXPQm9RSX8giNl/aXS5rW1VpNct92wnFbGR/47vfoRJVLARGQ4HVL4WaQ0u8IJVoA==",
1636
+ "license": "MIT",
1637
+ "dependencies": {
1638
+ "co-body": "^6.1.0",
1639
+ "lodash.merge": "^4.6.2",
1640
+ "type-is": "^1.6.18"
1641
+ },
1642
+ "engines": {
1643
+ "node": ">= 16"
1644
+ },
1645
+ "peerDependencies": {
1646
+ "koa": "^2.14.1"
1647
+ }
1648
+ },
1649
  "node_modules/@koa/router": {
1650
  "version": "12.0.1",
1651
  "resolved": "https://registry.npmjs.org/@koa/router/-/router-12.0.1.tgz",
 
1699
  }
1700
  },
1701
  "node_modules/@napi-rs/canvas": {
1702
+ "version": "0.1.68",
1703
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.68.tgz",
1704
+ "integrity": "sha512-LQESrePLEBLvhuFkXx9jjBXRC2ClYsO5mqQ1m/puth5z9SOuM3N/B3vDuqnC3RJFktDktyK9khGvo7dTkqO9uQ==",
1705
  "license": "MIT",
1706
  "engines": {
1707
  "node": ">= 10"
1708
  },
1709
  "optionalDependencies": {
1710
+ "@napi-rs/canvas-android-arm64": "0.1.68",
1711
+ "@napi-rs/canvas-darwin-arm64": "0.1.68",
1712
+ "@napi-rs/canvas-darwin-x64": "0.1.68",
1713
+ "@napi-rs/canvas-linux-arm-gnueabihf": "0.1.68",
1714
+ "@napi-rs/canvas-linux-arm64-gnu": "0.1.68",
1715
+ "@napi-rs/canvas-linux-arm64-musl": "0.1.68",
1716
+ "@napi-rs/canvas-linux-riscv64-gnu": "0.1.68",
1717
+ "@napi-rs/canvas-linux-x64-gnu": "0.1.68",
1718
+ "@napi-rs/canvas-linux-x64-musl": "0.1.68",
1719
+ "@napi-rs/canvas-win32-x64-msvc": "0.1.68"
1720
  }
1721
  },
1722
  "node_modules/@napi-rs/canvas-android-arm64": {
1723
+ "version": "0.1.68",
1724
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.68.tgz",
1725
+ "integrity": "sha512-h1KcSR4LKLfRfzeBH65xMxbWOGa1OtMFQbCMVlxPCkN1Zr+2gK+70pXO5ktojIYcUrP6KDcOwoc8clho5ccM/w==",
1726
  "cpu": [
1727
  "arm64"
1728
  ],
 
1736
  }
1737
  },
1738
  "node_modules/@napi-rs/canvas-darwin-arm64": {
1739
+ "version": "0.1.68",
1740
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.68.tgz",
1741
+ "integrity": "sha512-/VURlrAD4gDoxW1GT/b0nP3fRz/fhxmHI/xznTq2FTwkQLPOlLkDLCvTmQ7v6LtGKdc2Ed6rvYpRan+JXThInQ==",
1742
  "cpu": [
1743
  "arm64"
1744
  ],
 
1752
  }
1753
  },
1754
  "node_modules/@napi-rs/canvas-darwin-x64": {
1755
+ "version": "0.1.68",
1756
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.68.tgz",
1757
+ "integrity": "sha512-tEpvGR6vCLTo1Tx9wmDnoOKROpw57wiCWwCpDOuVlj/7rqEJOUYr9ixW4aRJgmeGBrZHgevI0EURys2ER6whmg==",
1758
  "cpu": [
1759
  "x64"
1760
  ],
 
1768
  }
1769
  },
1770
  "node_modules/@napi-rs/canvas-linux-arm-gnueabihf": {
1771
+ "version": "0.1.68",
1772
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.68.tgz",
1773
+ "integrity": "sha512-U9xbJsumPOiAYeAFZMlHf62b9dGs2HJ6Q5xt7xTB0uEyPeurwhgYBWGgabdsEidyj38YuzI/c3LGBbSQB3vagw==",
1774
  "cpu": [
1775
  "arm"
1776
  ],
 
1784
  }
1785
  },
1786
  "node_modules/@napi-rs/canvas-linux-arm64-gnu": {
1787
+ "version": "0.1.68",
1788
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.68.tgz",
1789
+ "integrity": "sha512-KFkn8wEm3mPnWD4l8+OUUkxylSJuN5q9PnJRZJgv15RtCA1bgxIwTkBhI/+xuyVMcHqON9sXq7cDkEJtHm35dg==",
1790
  "cpu": [
1791
  "arm64"
1792
  ],
 
1800
  }
1801
  },
1802
  "node_modules/@napi-rs/canvas-linux-arm64-musl": {
1803
+ "version": "0.1.68",
1804
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.68.tgz",
1805
+ "integrity": "sha512-IQzts91rCdOALXBWQxLZRCEDrfFTGDtNRJMNu+2SKZ1uT8cmPQkPwVk5rycvFpvgAcmiFiOSCp1aRrlfU8KPpQ==",
1806
  "cpu": [
1807
  "arm64"
1808
  ],
 
1816
  }
1817
  },
1818
  "node_modules/@napi-rs/canvas-linux-riscv64-gnu": {
1819
+ "version": "0.1.68",
1820
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.68.tgz",
1821
+ "integrity": "sha512-e9AS5UttoIKqXSmBzKZdd3NErSVyOEYzJfNOCGtafGk1//gibTwQXGlSXmAKuErqMp09pyk9aqQRSYzm1AQfBw==",
1822
  "cpu": [
1823
  "riscv64"
1824
  ],
 
1832
  }
1833
  },
1834
  "node_modules/@napi-rs/canvas-linux-x64-gnu": {
1835
+ "version": "0.1.68",
1836
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.68.tgz",
1837
+ "integrity": "sha512-Pa/I36VE3j57I3Obhrr+J48KGFfkZk2cJN/2NmW/vCgmoF7kCP6aTVq5n+cGdGWLd/cN9CJ9JvNwEoMRDghu0g==",
1838
  "cpu": [
1839
  "x64"
1840
  ],
 
1848
  }
1849
  },
1850
  "node_modules/@napi-rs/canvas-linux-x64-musl": {
1851
+ "version": "0.1.68",
1852
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.68.tgz",
1853
+ "integrity": "sha512-9c6rkc5195wNxuUHJdf4/mmnq433OQey9TNvQ9LspJazvHbfSkTij8wtKjASVQsJyPDva4fkWOeV/OQ7cLw0GQ==",
1854
  "cpu": [
1855
  "x64"
1856
  ],
 
1864
  }
1865
  },
1866
  "node_modules/@napi-rs/canvas-win32-x64-msvc": {
1867
+ "version": "0.1.68",
1868
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.68.tgz",
1869
+ "integrity": "sha512-Fc5Dez23u0FoSATurT6/w1oMytiRnKWEinHivdMvXpge6nG4YvhrASrtqMk8dGJMVQpHr8QJYF45rOrx2YU2Aw==",
1870
  "cpu": [
1871
  "x64"
1872
  ],
 
2258
  "resolved": "https://registry.npmjs.org/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz",
2259
  "integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA=="
2260
  },
2261
+ "node_modules/@types/accepts": {
2262
+ "version": "1.3.7",
2263
+ "resolved": "https://registry.npmjs.org/@types/accepts/-/accepts-1.3.7.tgz",
2264
+ "integrity": "sha512-Pay9fq2lM2wXPWbteBsRAGiWH2hig4ZE2asK+mm7kUzlxRTfL961rj89I6zV/E3PcIkDqyuBEcMxFT7rccugeQ==",
2265
+ "dev": true,
2266
+ "license": "MIT",
2267
+ "dependencies": {
2268
+ "@types/node": "*"
2269
+ }
2270
+ },
2271
  "node_modules/@types/archiver": {
2272
  "version": "5.3.4",
2273
  "resolved": "https://registry.npmjs.org/@types/archiver/-/archiver-5.3.4.tgz",
 
2374
  "@types/node": "*"
2375
  }
2376
  },
2377
+ "node_modules/@types/content-disposition": {
2378
+ "version": "0.5.8",
2379
+ "resolved": "https://registry.npmjs.org/@types/content-disposition/-/content-disposition-0.5.8.tgz",
2380
+ "integrity": "sha512-QVSSvno3dE0MgO76pJhmv4Qyi/j0Yk9pBp0Y7TJ2Tlj+KCgJWY6qX7nnxCOLkZ3VYRSIk1WTxCvwUSdx6CCLdg==",
2381
+ "dev": true,
2382
+ "license": "MIT"
2383
+ },
2384
+ "node_modules/@types/cookies": {
2385
+ "version": "0.9.0",
2386
+ "resolved": "https://registry.npmjs.org/@types/cookies/-/cookies-0.9.0.tgz",
2387
+ "integrity": "sha512-40Zk8qR147RABiQ7NQnBzWzDcjKzNrntB5BAmeGCb2p/MIyOE+4BVvc17wumsUqUw00bJYqoXFHYygQnEFh4/Q==",
2388
+ "dev": true,
2389
+ "license": "MIT",
2390
+ "dependencies": {
2391
+ "@types/connect": "*",
2392
+ "@types/express": "*",
2393
+ "@types/keygrip": "*",
2394
+ "@types/node": "*"
2395
+ }
2396
+ },
2397
  "node_modules/@types/cors": {
2398
  "version": "2.8.17",
2399
  "resolved": "https://registry.npmjs.org/@types/cors/-/cors-2.8.17.tgz",
 
2453
  "@types/node": "*"
2454
  }
2455
  },
2456
+ "node_modules/@types/http-assert": {
2457
+ "version": "1.5.6",
2458
+ "resolved": "https://registry.npmjs.org/@types/http-assert/-/http-assert-1.5.6.tgz",
2459
+ "integrity": "sha512-TTEwmtjgVbYAzZYWyeHPrrtWnfVkm8tQkP8P21uQifPgMRgjrow3XDEYqucuC8SKZJT7pUnhU/JymvjggxO9vw==",
2460
+ "dev": true,
2461
+ "license": "MIT"
2462
+ },
2463
  "node_modules/@types/http-cache-semantics": {
2464
  "version": "4.0.4",
2465
  "resolved": "https://registry.npmjs.org/@types/http-cache-semantics/-/http-cache-semantics-4.0.4.tgz",
 
2517
  "@types/node": "*"
2518
  }
2519
  },
2520
+ "node_modules/@types/keygrip": {
2521
+ "version": "1.0.6",
2522
+ "resolved": "https://registry.npmjs.org/@types/keygrip/-/keygrip-1.0.6.tgz",
2523
+ "integrity": "sha512-lZuNAY9xeJt7Bx4t4dx0rYCDqGPW8RXhQZK1td7d4H6E9zYbLoOtjBvfwdTKpsyxQI/2jv+armjX/RW+ZNpXOQ==",
2524
+ "dev": true,
2525
+ "license": "MIT"
2526
+ },
2527
  "node_modules/@types/keyv": {
2528
  "version": "3.1.4",
2529
  "resolved": "https://registry.npmjs.org/@types/keyv/-/keyv-3.1.4.tgz",
 
2532
  "@types/node": "*"
2533
  }
2534
  },
2535
+ "node_modules/@types/koa": {
2536
+ "version": "2.15.0",
2537
+ "resolved": "https://registry.npmjs.org/@types/koa/-/koa-2.15.0.tgz",
2538
+ "integrity": "sha512-7QFsywoE5URbuVnG3loe03QXuGajrnotr3gQkXcEBShORai23MePfFYdhz90FEtBBpkyIYQbVD+evKtloCgX3g==",
2539
+ "dev": true,
2540
+ "license": "MIT",
2541
+ "dependencies": {
2542
+ "@types/accepts": "*",
2543
+ "@types/content-disposition": "*",
2544
+ "@types/cookies": "*",
2545
+ "@types/http-assert": "*",
2546
+ "@types/http-errors": "*",
2547
+ "@types/keygrip": "*",
2548
+ "@types/koa-compose": "*",
2549
+ "@types/node": "*"
2550
+ }
2551
+ },
2552
+ "node_modules/@types/koa-compose": {
2553
+ "version": "3.2.8",
2554
+ "resolved": "https://registry.npmjs.org/@types/koa-compose/-/koa-compose-3.2.8.tgz",
2555
+ "integrity": "sha512-4Olc63RY+MKvxMwVknCUDhRQX1pFQoBZ/lXcRLP69PQkEpze/0cr8LNqJQe5NFb/b19DWi2a5bTi2VAlQzhJuA==",
2556
+ "dev": true,
2557
+ "license": "MIT",
2558
+ "dependencies": {
2559
+ "@types/koa": "*"
2560
+ }
2561
+ },
2562
  "node_modules/@types/lodash": {
2563
  "version": "4.17.0",
2564
  "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.0.tgz",
 
3927
  "version": "1.0.1",
3928
  "resolved": "https://registry.npmjs.org/cache-content-type/-/cache-content-type-1.0.1.tgz",
3929
  "integrity": "sha512-IKufZ1o4Ut42YUrZSo8+qnMTrFuKkvyoLXUywKz9GJ5BrhOFGhLdkx9sG4KAnVvbY6kEcSFjLQul+DVmBm2bgA==",
 
3930
  "dependencies": {
3931
  "mime-types": "^2.1.18",
3932
  "ylru": "^1.2.0"
 
4095
  }
4096
  },
4097
  "node_modules/civkit": {
4098
+ "version": "0.8.4-32482a3",
4099
+ "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-32482a3.tgz",
4100
+ "integrity": "sha512-VQwRreeVKYEoSMlhwYrPGpAA5na6lrIavGKmYNrhsHVJEvSfgkWKEete/btZzer4+WBxnNRw+PpRPrq6xjt13Q==",
4101
+ "license": "AGPL",
4102
  "dependencies": {
4103
  "lodash": "^4.17.21",
4104
  "tslib": "^2.5.0"
 
4229
  "version": "4.6.0",
4230
  "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
4231
  "integrity": "sha512-QVb0dM5HvG+uaxitm8wONl7jltx8dqhfU33DcqtOZcLSVIKSDDLDi7+0LbAKiyI8hD9u42m2YxXSkMGWThaecQ==",
 
4232
  "engines": {
4233
  "iojs": ">= 1.0.0",
4234
  "node": ">= 0.12.0"
 
4238
  "version": "6.1.0",
4239
  "resolved": "https://registry.npmjs.org/co-body/-/co-body-6.1.0.tgz",
4240
  "integrity": "sha512-m7pOT6CdLN7FuXUcpuz/8lfQ/L77x8SchHCF4G0RBTJO20Wzmhn5Sp4/5WsKy8OSpifBSUrmg83qEqaDHdyFuQ==",
 
4241
  "dependencies": {
4242
  "inflation": "^2.0.0",
4243
  "qs": "^6.5.2",
 
4362
  "version": "0.9.1",
4363
  "resolved": "https://registry.npmjs.org/cookies/-/cookies-0.9.1.tgz",
4364
  "integrity": "sha512-TG2hpqe4ELx54QER/S3HQ9SRVnQnGBtKUz5bLQWtYAQ+o6GpgMs6sYUvaiJjVxb+UXwhRhAEP3m7LbsIZ77Hmw==",
 
4365
  "dependencies": {
4366
  "depd": "~2.0.0",
4367
  "keygrip": "~1.1.0"
 
4670
  "node_modules/deep-equal": {
4671
  "version": "1.0.1",
4672
  "resolved": "https://registry.npmjs.org/deep-equal/-/deep-equal-1.0.1.tgz",
4673
+ "integrity": "sha512-bHtC0iYvWhyaTzvV3CZgPeZQqCOBGyGsVV7v4eevpdkLHfiSrXUdBG+qAuSz4RI70sszvjQ1QSZ98An1yNwpSw=="
 
4674
  },
4675
  "node_modules/deep-extend": {
4676
  "version": "0.6.0",
 
6788
  "version": "1.5.0",
6789
  "resolved": "https://registry.npmjs.org/http-assert/-/http-assert-1.5.0.tgz",
6790
  "integrity": "sha512-uPpH7OKX4H25hBmU6G1jWNaqJGpTXxey+YOUizJUAgu0AjLUeC8D73hTrhvDS5D+GJN1DN1+hhc/eF/wpxtp0w==",
 
6791
  "dependencies": {
6792
  "deep-equal": "~1.0.1",
6793
  "http-errors": "~1.8.0"
 
6800
  "version": "1.1.2",
6801
  "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz",
6802
  "integrity": "sha512-7emPTl6Dpo6JRXOXjLRxck+FlLRX5847cLKEn00PLAgc3g2hTZZgr+e4c2v6QpSmLeFP3n5yUo7ft6avBK/5jQ==",
 
6803
  "engines": {
6804
  "node": ">= 0.6"
6805
  }
 
6808
  "version": "1.8.1",
6809
  "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.8.1.tgz",
6810
  "integrity": "sha512-Kpk9Sm7NmI+RHhnj6OIWDI1d6fIoFAtFt9RLaTMRlg/8w49juAStsrBgp0Dp4OdxdVbRIeKhtCUvoi/RuAhO4g==",
 
6811
  "dependencies": {
6812
  "depd": "~1.1.2",
6813
  "inherits": "2.0.4",
 
6823
  "version": "1.5.0",
6824
  "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz",
6825
  "integrity": "sha512-OpZ3zP+jT1PI7I8nemJX4AKmAX070ZkYPVWV/AaKTJl+tXCTGyVdC1a4SL8RUQYEwk/f34ZX8UTykN68FwrqAA==",
 
6826
  "engines": {
6827
  "node": ">= 0.6"
6828
  }
 
7023
  "version": "2.1.0",
7024
  "resolved": "https://registry.npmjs.org/inflation/-/inflation-2.1.0.tgz",
7025
  "integrity": "sha512-t54PPJHG1Pp7VQvxyVCJ9mBbjG3Hqryges9bXoOO6GExCPa+//i/d5GSuFtpx3ALLd7lgIAur6zrIlBQyJuMlQ==",
 
7026
  "engines": {
7027
  "node": ">= 0.8.0"
7028
  }
 
8398
  "version": "1.1.0",
8399
  "resolved": "https://registry.npmjs.org/keygrip/-/keygrip-1.1.0.tgz",
8400
  "integrity": "sha512-iYSchDJ+liQ8iwbSI2QqsQOvqv58eJCEanyJPJi+Khyu8smkcKSFUCbPwzFcL7YVtZ6eONjqRX/38caJ7QjRAQ==",
 
8401
  "dependencies": {
8402
  "tsscmp": "1.0.6"
8403
  },
 
8435
  }
8436
  },
8437
  "node_modules/koa": {
8438
+ "version": "2.16.0",
8439
+ "resolved": "https://registry.npmjs.org/koa/-/koa-2.16.0.tgz",
8440
+ "integrity": "sha512-Afhqq0Vq3W7C+/rW6IqHVBDLzqObwZ07JaUNUEF8yCQ6afiyFE3RAy+i7V0E46XOWlH7vPWn/x0vsZwNy6PWxw==",
8441
+ "license": "MIT",
8442
  "dependencies": {
8443
  "accepts": "^1.3.5",
8444
  "cache-content-type": "^1.0.0",
 
8485
  "node_modules/koa-compose": {
8486
  "version": "4.1.0",
8487
  "resolved": "https://registry.npmjs.org/koa-compose/-/koa-compose-4.1.0.tgz",
8488
+ "integrity": "sha512-8ODW8TrDuMYvXRwra/Kh7/rJo9BtOfPc6qO8eAfC80CnCvSjSl0bkRM24X6/XBBEyj0v1nRUQ1LyOy3dbqOWXw=="
 
8489
  },
8490
  "node_modules/koa-convert": {
8491
  "version": "2.0.0",
8492
  "resolved": "https://registry.npmjs.org/koa-convert/-/koa-convert-2.0.0.tgz",
8493
  "integrity": "sha512-asOvN6bFlSnxewce2e/DK3p4tltyfC4VM7ZwuTuepI7dEQVcvpyFuBcEARu1+Hxg8DIwytce2n7jrZtRlPrARA==",
 
8494
  "dependencies": {
8495
  "co": "^4.6.0",
8496
  "koa-compose": "^4.1.0"
 
8503
  "version": "1.8.1",
8504
  "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.8.1.tgz",
8505
  "integrity": "sha512-Kpk9Sm7NmI+RHhnj6OIWDI1d6fIoFAtFt9RLaTMRlg/8w49juAStsrBgp0Dp4OdxdVbRIeKhtCUvoi/RuAhO4g==",
 
8506
  "dependencies": {
8507
  "depd": "~1.1.2",
8508
  "inherits": "2.0.4",
 
8518
  "version": "1.1.2",
8519
  "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz",
8520
  "integrity": "sha512-7emPTl6Dpo6JRXOXjLRxck+FlLRX5847cLKEn00PLAgc3g2hTZZgr+e4c2v6QpSmLeFP3n5yUo7ft6avBK/5jQ==",
 
8521
  "engines": {
8522
  "node": ">= 0.6"
8523
  }
 
8526
  "version": "1.5.0",
8527
  "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz",
8528
  "integrity": "sha512-OpZ3zP+jT1PI7I8nemJX4AKmAX070ZkYPVWV/AaKTJl+tXCTGyVdC1a4SL8RUQYEwk/f34ZX8UTykN68FwrqAA==",
 
8529
  "engines": {
8530
  "node": ">= 0.6"
8531
  }
 
8720
  "node_modules/lodash.merge": {
8721
  "version": "4.6.2",
8722
  "resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz",
8723
+ "integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ=="
 
8724
  },
8725
  "node_modules/lodash.once": {
8726
  "version": "4.1.1",
 
9928
  "node_modules/only": {
9929
  "version": "0.0.2",
9930
  "resolved": "https://registry.npmjs.org/only/-/only-0.0.2.tgz",
9931
+ "integrity": "sha512-Fvw+Jemq5fjjyWz6CpKx6w9s7xxqo3+JCyM0WXWeCSOboZ8ABkyvP8ID4CZuChA/wxSx+XSJmdOm8rGVyJ1hdQ=="
 
9932
  },
9933
  "node_modules/openai": {
9934
  "version": "4.33.0",
 
10192
  }
10193
  },
10194
  "node_modules/pdfjs-dist": {
10195
+ "version": "4.10.38",
10196
+ "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-4.10.38.tgz",
10197
+ "integrity": "sha512-/Y3fcFrXEAsMjJXeL9J8+ZG9U01LbuWaYypvDW2ycW1jL269L3js3DVBjDJ0Up9Np1uqDXsDrRihHANhZOlwdQ==",
10198
+ "license": "Apache-2.0",
10199
  "engines": {
10200
+ "node": ">=20"
10201
  },
10202
  "optionalDependencies": {
10203
+ "@napi-rs/canvas": "^0.1.65"
 
10204
  }
10205
  },
10206
  "node_modules/peek-stream": {
 
12517
  "version": "1.0.6",
12518
  "resolved": "https://registry.npmjs.org/tsscmp/-/tsscmp-1.0.6.tgz",
12519
  "integrity": "sha512-LxhtAkPDTkVCMQjt2h6eBVY28KCjikZqZfMcC15YBeNjkgUpdCfBu5HoiOTDu86v6smE8yOjyEktJ8hlbANHQA==",
 
12520
  "engines": {
12521
  "node": ">=0.6.x"
12522
  }
 
13209
  "version": "1.4.0",
13210
  "resolved": "https://registry.npmjs.org/ylru/-/ylru-1.4.0.tgz",
13211
  "integrity": "sha512-2OQsPNEmBCvXuFlIni/a+Rn+R2pHW9INm0BxXJ4hVDA8TirqMj+J/Rp9ItLatT/5pZqWwefVrTQcHpixsxnVlA==",
 
13212
  "engines": {
13213
  "node": ">= 4.0.0"
13214
  }
package.json CHANGED
@@ -1,15 +1,84 @@
1
  {
2
- "name": "reader",
3
- "version": "1.0.0",
4
- "description": "### Prerequisite - Node v18 (The build fails for Node version >18) - Yarn - Firebase CLI (`npm install -g firebase-tools`)",
5
- "main": "index.js",
6
- "scripts": {
7
- "test": "echo \"Error: no test specified\" && exit 1"
8
- },
9
- "author": "",
10
- "license": "ISC",
11
- "devDependencies": {
12
- "firebase-tools": "^13.6.2",
13
- "typescript": "^5.1.6"
14
- }
15
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
2
+ "name": "reader",
3
+ "scripts": {
4
+ "lint": "eslint --ext .js,.ts .",
5
+ "build": "node ./integrity-check.cjs && tsc -p .",
6
+ "build:watch": "tsc --watch",
7
+ "build:clean": "rm -rf ./build",
8
+ "serve": "npm run build && npm run start",
9
+ "debug": "npm run build && npm run dev",
10
+ "start": "npm run shell"
11
+ },
12
+ "engines": {
13
+ "node": "20"
14
+ },
15
+ "main": "build/index.js",
16
+ "dependencies": {
17
+ "@esm2cjs/normalize-url": "^8.0.0",
18
+ "@google-cloud/translate": "^8.2.0",
19
+ "@koa/bodyparser": "^5.1.1",
20
+ "@mozilla/readability": "^0.5.0",
21
+ "@napi-rs/canvas": "^0.1.68",
22
+ "@types/turndown": "^5.0.4",
23
+ "@xmldom/xmldom": "^0.9.3",
24
+ "archiver": "^6.0.1",
25
+ "axios": "^1.3.3",
26
+ "bcrypt": "^5.1.0",
27
+ "busboy": "^1.6.0",
28
+ "civkit": "^0.8.4-32482a3",
29
+ "core-js": "^3.37.1",
30
+ "cors": "^2.8.5",
31
+ "dayjs": "^1.11.9",
32
+ "express": "^4.19.2",
33
+ "firebase-admin": "^12.1.0",
34
+ "firebase-functions": "^6.1.1",
35
+ "htmlparser2": "^9.0.0",
36
+ "jose": "^5.1.0",
37
+ "langdetect": "^0.2.1",
38
+ "linkedom": "^0.18.4",
39
+ "maxmind": "^4.3.18",
40
+ "minio": "^7.1.3",
41
+ "node-libcurl": "^4.1.0",
42
+ "openai": "^4.20.0",
43
+ "pdfjs-dist": "^4.10.38",
44
+ "puppeteer": "^23.3.0",
45
+ "puppeteer-extra": "^3.3.6",
46
+ "puppeteer-extra-plugin-block-resources": "^2.4.3",
47
+ "puppeteer-extra-plugin-page-proxy": "^1.3.1",
48
+ "puppeteer-page-proxy": "^1.3.0",
49
+ "robots-parser": "^3.0.1",
50
+ "set-cookie-parser": "^2.6.0",
51
+ "simple-zstd": "^1.4.2",
52
+ "stripe": "^11.11.0",
53
+ "tiktoken": "^1.0.16",
54
+ "tld-extract": "^2.1.0",
55
+ "turndown": "^7.1.3",
56
+ "turndown-plugin-gfm": "^1.0.2",
57
+ "undici": "^5.24.0"
58
+ },
59
+ "devDependencies": {
60
+ "@types/archiver": "^5.3.4",
61
+ "@types/bcrypt": "^5.0.0",
62
+ "@types/busboy": "^1.5.4",
63
+ "@types/cors": "^2.8.17",
64
+ "@types/generic-pool": "^3.8.1",
65
+ "@types/koa": "^2.15.0",
66
+ "@types/node": "^20.14.13",
67
+ "@types/set-cookie-parser": "^2.4.7",
68
+ "@types/xmldom": "^0.1.34",
69
+ "@typescript-eslint/eslint-plugin": "^5.12.0",
70
+ "@typescript-eslint/parser": "^5.12.0",
71
+ "eslint": "^8.9.0",
72
+ "eslint-config-google": "^0.14.0",
73
+ "eslint-plugin-import": "^2.25.4",
74
+ "firebase-functions-test": "^3.0.0",
75
+ "koa": "^2.16.0",
76
+ "pino-pretty": "^13.0.0",
77
+ "replicate": "^0.16.1",
78
+ "typescript": "^5.5.4"
79
+ },
80
+ "private": true,
81
+ "exports": {
82
+ ".": "./build/index.js"
83
+ }
84
+ }
{backend/functions/public β†’ public}/favicon.ico RENAMED
File without changes
{backend/functions/src/cloud-functions β†’ src/api}/crawler.ts RENAMED
@@ -1,30 +1,45 @@
1
- import {
2
- assignTransferProtocolMeta, marshalErrorLike,
3
- RPCHost, RPCReflection,
4
- AssertionFailureError, ParamValidationError, Defer,
5
- } from 'civkit';
6
  import { singleton } from 'tsyringe';
7
- import { AsyncContext, BudgetExceededError, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
8
- import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
9
- import _ from 'lodash';
10
- import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
11
- import { Request, Response } from 'express';
12
- const pNormalizeUrl = import("@esm2cjs/normalize-url");
13
- import { Crawled } from '../db/crawled';
14
  import { randomUUID } from 'crypto';
15
- import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
16
 
17
- import { countGPTToken as estimateToken } from '../shared/utils/openai';
18
- import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/scrapping-options';
19
- import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
 
 
 
 
 
 
 
 
 
 
20
  import { DomainBlockade } from '../db/domain-blockade';
21
  import { DomainProfile } from '../db/domain-profile';
22
- import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
 
 
23
  import { JSDomControl } from '../services/jsdom';
24
  import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
25
  import { CurlControl } from '../services/curl';
26
  import { LmControl } from '../services/lm';
27
  import { tryDecodeURIComponent } from '../utils/misc';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  export interface ExtraScrappingOptions extends ScrappingOptions {
30
  withIframe?: boolean | 'quoted';
@@ -33,6 +48,8 @@ export interface ExtraScrappingOptions extends ScrappingOptions {
33
  removeSelector?: string | string[];
34
  keepImgDataUrl?: boolean;
35
  engine?: string;
 
 
36
  }
37
 
38
  const indexProto = {
@@ -56,16 +73,18 @@ export class CrawlerHost extends RPCHost {
56
  domainProfileRetentionMs = 1000 * 3600 * 24 * 30;
57
 
58
  constructor(
59
- protected globalLogger: Logger,
60
  protected puppeteerControl: PuppeteerControl,
61
  protected curlControl: CurlControl,
 
 
62
  protected lmControl: LmControl,
63
  protected jsdomControl: JSDomControl,
64
  protected snapshotFormatter: SnapshotFormatter,
65
  protected firebaseObjectStorage: FirebaseStorageBucketControl,
66
  protected rateLimitControl: RateLimitControl,
67
- protected threadLocal: AsyncContext,
68
- protected fbHealthCheck: FirebaseRoundTripChecker,
69
  ) {
70
  super(...arguments);
71
 
@@ -73,7 +92,7 @@ export class CrawlerHost extends RPCHost {
73
  if (!snapshot.title?.trim() && !snapshot.pdfs?.length) {
74
  return;
75
  }
76
- if (options.cookies?.length) {
77
  // Potential privacy issue, dont cache if cookies are used
78
  return;
79
  }
@@ -84,9 +103,14 @@ export class CrawlerHost extends RPCHost {
84
  if (options.locale) {
85
  Reflect.set(snapshot, 'locale', options.locale);
86
  }
87
- await this.setToCache(options.url, snapshot);
88
 
89
- await this.exploreDirectEngine(snapshot).catch(() => undefined);
 
 
 
 
 
 
90
  });
91
 
92
  puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
@@ -108,12 +132,19 @@ export class CrawlerHost extends RPCHost {
108
  override async init() {
109
  await this.dependencyReady();
110
 
 
 
111
  this.emit('ready');
112
  }
113
 
114
- getIndex(user?: JinaEmbeddingsTokenAccount) {
115
  const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
116
-
 
 
 
 
 
117
  Object.assign(indexObject, {
118
  usage1: 'https://r.jina.ai/YOUR_URL',
119
  usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY',
@@ -121,71 +152,83 @@ export class CrawlerHost extends RPCHost {
121
  sourceCode: 'https://github.com/jina-ai/reader',
122
  });
123
 
124
- if (user) {
 
125
  indexObject[''] = undefined;
126
- indexObject.authenticatedAs = `${user.user_id} (${user.full_name})`;
127
- indexObject.balanceLeft = user.wallet.total_balance;
128
  }
129
 
130
  return indexObject;
131
  }
132
 
133
- @CloudHTTPv2({
134
- name: 'crawl2',
135
- runtime: {
136
- memory: '4GiB',
137
- timeoutSeconds: 300,
138
- concurrency: 22,
 
 
139
  },
140
- tags: ['Crawler'],
141
- httpMethod: ['get', 'post'],
142
- returnType: [String, OutputServerEventStream],
143
- exposeRoot: true,
144
  })
145
- @CloudHTTPv2({
146
- runtime: {
147
- memory: '4GiB',
148
- cpu: 2,
149
- timeoutSeconds: 300,
150
- concurrency: 10,
151
- maxInstances: 1000,
152
- minInstances: 1,
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  },
154
- tags: ['Crawler'],
155
- httpMethod: ['get', 'post'],
156
  returnType: [String, OutputServerEventStream],
157
- exposeRoot: true,
 
 
 
 
 
 
 
 
 
 
158
  })
159
  async crawl(
160
  @RPCReflect() rpcReflect: RPCReflection,
161
- @Ctx() ctx: {
162
- req: Request,
163
- res: Response,
164
- },
165
  auth: JinaEmbeddingsAuthDTO,
166
  crawlerOptionsHeaderOnly: CrawlerOptionsHeaderOnly,
167
  crawlerOptionsParamsAllowed: CrawlerOptions,
168
  ) {
169
  const uid = await auth.solveUID();
170
  let chargeAmount = 0;
171
- const crawlerOptions = ctx.req.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed;
172
 
173
- // Note req.url in express is actually unparsed `path`, e.g. `/some-path?abc`. Instead of a real url.
174
- const targetUrl = await this.getTargetUrl(tryDecodeURIComponent(ctx.req.url), crawlerOptions);
175
  if (!targetUrl) {
176
- const latestUser = uid ? await auth.assertUser() : undefined;
177
- if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
178
- return this.getIndex(latestUser);
179
- }
180
-
181
- return assignTransferProtocolMeta(`${this.getIndex(latestUser)}`,
182
- { contentType: 'text/plain', envelope: null }
183
- );
184
  }
185
 
186
  // Prevent circular crawling
187
  this.puppeteerControl.circuitBreakerHosts.add(
188
- ctx.req.hostname.toLowerCase()
189
  );
190
 
191
  if (uid) {
@@ -222,8 +265,8 @@ export class CrawlerHost extends RPCHost {
222
  apiRoll.chargeAmount = chargeAmount;
223
  }
224
  });
225
- } else if (ctx.req.ip) {
226
- const apiRoll = await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, [rpcReflect.name.toUpperCase()],
227
  [
228
  // 20 requests per minute
229
  new Date(Date.now() - 60 * 1000), 20
@@ -254,9 +297,12 @@ export class CrawlerHost extends RPCHost {
254
  }
255
  }
256
 
 
 
 
257
 
258
  const crawlOpts = await this.configure(crawlerOptions);
259
- if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
260
  const sseStream = new OutputServerEventStream();
261
  rpcReflect.return(sseStream);
262
 
@@ -265,8 +311,11 @@ export class CrawlerHost extends RPCHost {
265
  if (!scrapped) {
266
  continue;
267
  }
 
 
 
268
 
269
- const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
270
  chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
271
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
272
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@@ -293,17 +342,20 @@ export class CrawlerHost extends RPCHost {
293
  }
294
 
295
  let lastScrapped;
296
- if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
297
  for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
298
  lastScrapped = scrapped;
 
 
 
299
  if (!crawlerOptions.isEarlyReturnApplicable()) {
300
  continue;
301
  }
302
- if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
303
  continue;
304
  }
305
 
306
- const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
307
  chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
308
 
309
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
@@ -324,7 +376,7 @@ export class CrawlerHost extends RPCHost {
324
  throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
325
  }
326
 
327
- const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
328
  chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
329
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
330
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@@ -342,16 +394,18 @@ export class CrawlerHost extends RPCHost {
342
 
343
  for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
344
  lastScrapped = scrapped;
345
-
 
 
346
  if (!crawlerOptions.isEarlyReturnApplicable()) {
347
  continue;
348
  }
349
 
350
- if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
351
  continue;
352
  }
353
 
354
- const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
355
  chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
356
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
357
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@@ -370,7 +424,7 @@ export class CrawlerHost extends RPCHost {
370
  );
371
  }
372
 
373
- return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
374
  }
375
 
376
  if (!lastScrapped) {
@@ -380,7 +434,7 @@ export class CrawlerHost extends RPCHost {
380
  throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
381
  }
382
 
383
- const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
384
  chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
385
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
386
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@@ -399,7 +453,7 @@ export class CrawlerHost extends RPCHost {
399
  );
400
  }
401
 
402
- return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
403
 
404
  }
405
 
@@ -419,7 +473,7 @@ export class CrawlerHost extends RPCHost {
419
  }
420
 
421
  let result: URL;
422
- const normalizeUrl = (await pNormalizeUrl).default;
423
  try {
424
  result = new URL(
425
  normalizeUrl(
@@ -638,7 +692,25 @@ export class CrawlerHost extends RPCHost {
638
  }
639
 
640
  if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
641
- yield this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
642
  return;
643
  }
644
 
@@ -653,27 +725,69 @@ export class CrawlerHost extends RPCHost {
653
  (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
654
  (_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
655
  ) {
 
 
 
656
  yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
657
 
658
  return;
659
  }
660
 
661
- if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
662
- const { digest } = this.getDomainProfileUrlDigest(urlToCrawl);
663
- const domainProfile = await DomainProfile.fromFirestore(digest);
664
- if (domainProfile?.engine === ENGINE_TYPE.DIRECT) {
665
- try {
666
- const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
667
-
668
- // Expect downstream code to "break" here if it's satisfied with the direct engine
669
- yield snapshot;
670
- if (crawlOpts?.engine === ENGINE_TYPE.AUTO) {
671
- return;
672
  }
673
- } catch (err: any) {
674
- this.logger.warn(`Failed to scrap ${urlToCrawl} with direct engine`, { err: marshalErrorLike(err) });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
675
  }
676
  }
 
 
 
 
 
677
  }
678
 
679
  try {
@@ -782,6 +896,8 @@ export class CrawlerHost extends RPCHost {
782
  this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
783
  this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl);
784
  this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
 
 
785
  this.threadLocal.set('userAgent', opts.userAgent);
786
  if (opts.timeout) {
787
  this.threadLocal.set('timeout', opts.timeout * 1000);
@@ -804,6 +920,9 @@ export class CrawlerHost extends RPCHost {
804
  referer: opts.referer,
805
  viewport: opts.viewport,
806
  engine: opts.engine,
 
 
 
807
  };
808
 
809
  if (opts.locale) {
@@ -842,14 +961,15 @@ export class CrawlerHost extends RPCHost {
842
  return crawlOpts;
843
  }
844
 
845
- formatSnapshot(
846
  crawlerOptions: CrawlerOptions,
847
  snapshot: PageSnapshot & {
848
  screenshotUrl?: string;
849
  pageshotUrl?: string;
850
  },
851
  nominalUrl?: URL,
852
- urlValidMs?: number
 
853
  ) {
854
  const presumedURL = crawlerOptions.base === 'final' ? new URL(snapshot.href) : nominalUrl;
855
 
@@ -870,7 +990,29 @@ export class CrawlerHost extends RPCHost {
870
  return output;
871
  }
872
 
873
- return this.snapshotFormatter.formatSnapshot(respondWith, snapshot, presumedURL, urlValidMs);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
874
  }
875
 
876
  async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
@@ -967,6 +1109,26 @@ export class CrawlerHost extends RPCHost {
967
  return;
968
  }
969
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
970
  getDomainProfileUrlDigest(url: URL) {
971
  const pathname = url.pathname;
972
  const pathVec = pathname.split('/');
@@ -981,4 +1143,29 @@ export class CrawlerHost extends RPCHost {
981
  path: finalPath,
982
  };
983
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
984
  }
 
 
 
 
 
 
1
  import { singleton } from 'tsyringe';
2
+ import { pathToFileURL } from 'url';
 
 
 
 
 
 
3
  import { randomUUID } from 'crypto';
4
+ import _ from 'lodash';
5
 
6
+ import {
7
+ assignTransferProtocolMeta, RPCHost, RPCReflection,
8
+ AssertionFailureError, ParamValidationError,
9
+ RawString,
10
+ ApplicationError,
11
+ } from 'civkit/civ-rpc';
12
+ import { marshalErrorLike } from 'civkit/lang';
13
+ import { Defer } from 'civkit/defer';
14
+ import { retryWith } from 'civkit/decorators';
15
+
16
+ import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/crawler-options';
17
+
18
+ import { Crawled } from '../db/crawled';
19
  import { DomainBlockade } from '../db/domain-blockade';
20
  import { DomainProfile } from '../db/domain-profile';
21
+ import { OutputServerEventStream } from '../lib/transform-server-event-stream';
22
+
23
+ import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
24
  import { JSDomControl } from '../services/jsdom';
25
  import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
26
  import { CurlControl } from '../services/curl';
27
  import { LmControl } from '../services/lm';
28
  import { tryDecodeURIComponent } from '../utils/misc';
29
+ import { CFBrowserRendering } from '../services/cf-browser-rendering';
30
+
31
+ import { GlobalLogger } from '../services/logger';
32
+ import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
33
+ import { AsyncLocalContext } from '../services/async-context';
34
+ import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
35
+ import { BudgetExceededError, InsufficientBalanceError, SecurityCompromiseError } from '../services/errors';
36
+
37
+ import { countGPTToken as estimateToken } from '../shared/utils/openai';
38
+ import { ProxyProvider } from '../shared/services/proxy-provider';
39
+ import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
40
+ import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
41
+ import { RobotsTxtService } from '../services/robots-text';
42
+ import { ServiceBadAttemptError } from '../shared/lib/errors';
43
 
44
  export interface ExtraScrappingOptions extends ScrappingOptions {
45
  withIframe?: boolean | 'quoted';
 
48
  removeSelector?: string | string[];
49
  keepImgDataUrl?: boolean;
50
  engine?: string;
51
+ allocProxy?: string;
52
+ private?: boolean;
53
  }
54
 
55
  const indexProto = {
 
73
  domainProfileRetentionMs = 1000 * 3600 * 24 * 30;
74
 
75
  constructor(
76
+ protected globalLogger: GlobalLogger,
77
  protected puppeteerControl: PuppeteerControl,
78
  protected curlControl: CurlControl,
79
+ protected cfBrowserRendering: CFBrowserRendering,
80
+ protected proxyProvider: ProxyProvider,
81
  protected lmControl: LmControl,
82
  protected jsdomControl: JSDomControl,
83
  protected snapshotFormatter: SnapshotFormatter,
84
  protected firebaseObjectStorage: FirebaseStorageBucketControl,
85
  protected rateLimitControl: RateLimitControl,
86
+ protected threadLocal: AsyncLocalContext,
87
+ protected robotsTxtService: RobotsTxtService,
88
  ) {
89
  super(...arguments);
90
 
 
92
  if (!snapshot.title?.trim() && !snapshot.pdfs?.length) {
93
  return;
94
  }
95
+ if (options.cookies?.length || options.private) {
96
  // Potential privacy issue, dont cache if cookies are used
97
  return;
98
  }
 
103
  if (options.locale) {
104
  Reflect.set(snapshot, 'locale', options.locale);
105
  }
 
106
 
107
+ const analyzed = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html);
108
+ if (analyzed.tokens < 200) {
109
+ // Does not contain enough content
110
+ return;
111
+ }
112
+
113
+ await this.setToCache(options.url, snapshot);
114
  });
115
 
116
  puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
 
132
  override async init() {
133
  await this.dependencyReady();
134
 
135
+ this.curlControl.impersonateChrome(this.puppeteerControl.ua.replace(/Headless/i, ''));
136
+
137
  this.emit('ready');
138
  }
139
 
140
+ async getIndex(auth?: JinaEmbeddingsAuthDTO) {
141
  const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
142
+ // Object.assign(indexObject, {
143
+ // usage1: `${ctx.origin}/YOUR_URL`,
144
+ // usage2: `${ctx.origin}/search/YOUR_SEARCH_QUERY`,
145
+ // homepage: 'https://jina.ai/reader',
146
+ // sourceCode: 'https://github.com/jina-ai/reader',
147
+ // });
148
  Object.assign(indexObject, {
149
  usage1: 'https://r.jina.ai/YOUR_URL',
150
  usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY',
 
152
  sourceCode: 'https://github.com/jina-ai/reader',
153
  });
154
 
155
+ await auth?.solveUID();
156
+ if (auth && auth.user) {
157
  indexObject[''] = undefined;
158
+ indexObject.authenticatedAs = `${auth.user.user_id} (${auth.user.full_name})`;
159
+ indexObject.balanceLeft = auth.user.wallet.total_balance;
160
  }
161
 
162
  return indexObject;
163
  }
164
 
165
+ @Method({
166
+ name: 'getIndex',
167
+ description: 'Index of the service',
168
+ proto: {
169
+ http: {
170
+ action: 'get',
171
+ path: '/',
172
+ }
173
  },
174
+ tags: ['misc', 'crawl'],
175
+ returnType: [String, Object],
 
 
176
  })
177
+ async getIndexCtrl(@Ctx() ctx: Context, @Param({ required: false }) auth?: JinaEmbeddingsAuthDTO) {
178
+ const indexObject = await this.getIndex(auth);
179
+
180
+ if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
181
+ return indexObject;
182
+ }
183
+
184
+ return assignTransferProtocolMeta(`${indexObject}`,
185
+ { contentType: 'text/plain; charset=utf-8', envelope: null }
186
+ );
187
+ }
188
+
189
+
190
+ @Method({
191
+ name: 'crawlByPostingToIndex',
192
+ description: 'Crawl any url into markdown',
193
+ proto: {
194
+ http: {
195
+ action: 'POST',
196
+ path: '/',
197
+ }
198
  },
199
+ tags: ['crawl'],
 
200
  returnType: [String, OutputServerEventStream],
201
+ })
202
+ @Method({
203
+ description: 'Crawl any url into markdown',
204
+ proto: {
205
+ http: {
206
+ action: ['GET', 'POST'],
207
+ path: '::url',
208
+ }
209
+ },
210
+ tags: ['crawl'],
211
+ returnType: [String, OutputServerEventStream, RawString],
212
  })
213
  async crawl(
214
  @RPCReflect() rpcReflect: RPCReflection,
215
+ @Ctx() ctx: Context,
 
 
 
216
  auth: JinaEmbeddingsAuthDTO,
217
  crawlerOptionsHeaderOnly: CrawlerOptionsHeaderOnly,
218
  crawlerOptionsParamsAllowed: CrawlerOptions,
219
  ) {
220
  const uid = await auth.solveUID();
221
  let chargeAmount = 0;
222
+ const crawlerOptions = ctx.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed;
223
 
224
+ const targetUrl = await this.getTargetUrl(tryDecodeURIComponent(ctx.path), crawlerOptions);
 
225
  if (!targetUrl) {
226
+ return await this.getIndex(auth);
 
 
 
 
 
 
 
227
  }
228
 
229
  // Prevent circular crawling
230
  this.puppeteerControl.circuitBreakerHosts.add(
231
+ ctx.hostname.toLowerCase()
232
  );
233
 
234
  if (uid) {
 
265
  apiRoll.chargeAmount = chargeAmount;
266
  }
267
  });
268
+ } else if (ctx.ip) {
269
+ const apiRoll = await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.ip, [rpcReflect.name.toUpperCase()],
270
  [
271
  // 20 requests per minute
272
  new Date(Date.now() - 60 * 1000), 20
 
297
  }
298
  }
299
 
300
+ if (crawlerOptions.robotsTxt) {
301
+ await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
302
+ }
303
 
304
  const crawlOpts = await this.configure(crawlerOptions);
305
+ if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
306
  const sseStream = new OutputServerEventStream();
307
  rpcReflect.return(sseStream);
308
 
 
311
  if (!scrapped) {
312
  continue;
313
  }
314
+ if (rpcReflect.signal.aborted) {
315
+ break;
316
+ }
317
 
318
+ const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
319
  chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
320
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
321
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
 
342
  }
343
 
344
  let lastScrapped;
345
+ if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
346
  for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
347
  lastScrapped = scrapped;
348
+ if (rpcReflect.signal.aborted) {
349
+ break;
350
+ }
351
  if (!crawlerOptions.isEarlyReturnApplicable()) {
352
  continue;
353
  }
354
+ if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) {
355
  continue;
356
  }
357
 
358
+ const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
359
  chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
360
 
361
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
 
376
  throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
377
  }
378
 
379
+ const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
380
  chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
381
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
382
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
 
394
 
395
  for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
396
  lastScrapped = scrapped;
397
+ if (rpcReflect.signal.aborted) {
398
+ break;
399
+ }
400
  if (!crawlerOptions.isEarlyReturnApplicable()) {
401
  continue;
402
  }
403
 
404
+ if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) {
405
  continue;
406
  }
407
 
408
+ const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
409
  chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
410
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
411
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
 
424
  );
425
  }
426
 
427
+ return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
428
  }
429
 
430
  if (!lastScrapped) {
 
434
  throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
435
  }
436
 
437
+ const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
438
  chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
439
  if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
440
  throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
 
453
  );
454
  }
455
 
456
+ return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
457
 
458
  }
459
 
 
473
  }
474
 
475
  let result: URL;
476
+ const normalizeUrl = require('@esm2cjs/normalize-url').default;
477
  try {
478
  result = new URL(
479
  normalizeUrl(
 
692
  }
693
 
694
  if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
695
+ const sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
696
+ await this.sideLoadWithAllocatedProxy(urlToCrawl, crawlOpts) :
697
+ await this.curlControl.sideLoad(urlToCrawl, crawlOpts);
698
+ if (!sideLoaded.file) {
699
+ throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
700
+ }
701
+ const draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName);
702
+ yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
703
+ return;
704
+ }
705
+ if (crawlOpts?.engine === ENGINE_TYPE.CF_BROWSER_RENDERING) {
706
+ const html = await this.cfBrowserRendering.fetchContent(urlToCrawl.href);
707
+ const snapshot = {
708
+ href: urlToCrawl.toString(),
709
+ html,
710
+ title: '',
711
+ text: '',
712
+ } as PageSnapshot;
713
+ yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
714
  return;
715
  }
716
 
 
725
  (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
726
  (_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
727
  ) {
728
+ if (cache.snapshot) {
729
+ cache.snapshot.isFromCache = true;
730
+ }
731
  yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
732
 
733
  return;
734
  }
735
 
736
+ try {
737
+ const altOpts = { ...crawlOpts };
738
+ let sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
739
+ await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts) :
740
+ await this.curlControl.sideLoad(urlToCrawl, altOpts).catch((err) => {
741
+ this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
742
+
743
+ if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
744
+ return Promise.reject(err);
 
 
745
  }
746
+
747
+ return this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
748
+ });
749
+ if (!sideLoaded.file) {
750
+ throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
751
+ }
752
+ let draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName);
753
+ if (sideLoaded.status == 200 && !sideLoaded.contentType.startsWith('text/html')) {
754
+ yield draftSnapshot;
755
+ return;
756
+ }
757
+
758
+ let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
759
+ draftSnapshot.title ??= analyzed.title;
760
+ let fallbackProxyIsUsed = false;
761
+ if ((!crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) && (analyzed.tokens < 42 || sideLoaded.status !== 200)) {
762
+ const proxyLoaded = await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
763
+ if (!proxyLoaded.file) {
764
+ throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
765
+ }
766
+ const proxySnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, proxyLoaded.file, proxyLoaded.contentType, proxyLoaded.fileName);
767
+ analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
768
+ if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
769
+ draftSnapshot = proxySnapshot;
770
+ sideLoaded = proxyLoaded;
771
+ fallbackProxyIsUsed = true;
772
+ }
773
+ }
774
+
775
+ if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
776
+ yield draftSnapshot;
777
+ }
778
+
779
+ if (crawlOpts && (sideLoaded.status === 200 || analyzed.tokens >= 200 || crawlOpts.allocProxy)) {
780
+ this.logger.info(`Side load seems to work, applying to crawler.`, { url: urlToCrawl.href });
781
+ crawlOpts.sideLoad ??= sideLoaded.sideLoadOpts;
782
+ if (fallbackProxyIsUsed) {
783
+ this.logger.info(`Proxy seems to salvage the page`, { url: urlToCrawl.href });
784
  }
785
  }
786
+ } catch (err: any) {
787
+ this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
788
+ if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
789
+ throw err;
790
+ }
791
  }
792
 
793
  try {
 
896
  this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
897
  this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl);
898
  this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
899
+ this.threadLocal.set('withIframe', opts.withIframe);
900
+ this.threadLocal.set('withShadowDom', opts.withShadowDom);
901
  this.threadLocal.set('userAgent', opts.userAgent);
902
  if (opts.timeout) {
903
  this.threadLocal.set('timeout', opts.timeout * 1000);
 
920
  referer: opts.referer,
921
  viewport: opts.viewport,
922
  engine: opts.engine,
923
+ allocProxy: opts.proxy?.endsWith('+') ? opts.proxy.slice(0, -1) : opts.proxy,
924
+ proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false,
925
+ private: Boolean(opts.doNotTrack),
926
  };
927
 
928
  if (opts.locale) {
 
961
  return crawlOpts;
962
  }
963
 
964
+ protected async formatSnapshot(
965
  crawlerOptions: CrawlerOptions,
966
  snapshot: PageSnapshot & {
967
  screenshotUrl?: string;
968
  pageshotUrl?: string;
969
  },
970
  nominalUrl?: URL,
971
+ urlValidMs?: number,
972
+ scrappingOptions?: ScrappingOptions
973
  ) {
974
  const presumedURL = crawlerOptions.base === 'final' ? new URL(snapshot.href) : nominalUrl;
975
 
 
990
  return output;
991
  }
992
 
993
+ return this.formatSnapshotWithPDFSideLoad(respondWith, snapshot, presumedURL, urlValidMs, scrappingOptions);
994
+ }
995
+
996
/**
 * Formats a snapshot, first trying to swap its leading PDF link for a local file URL.
 *
 * The snapshot is deep-cloned up front, so the caller's object is never mutated.
 * Only the first entry of `pdfs` is considered, and only when it starts with `http`.
 * A body already present in `scrappingOptions.sideLoad.impersonate` for that URL is
 * preferred; otherwise the PDF is fetched through `curlControl.sideLoad`.
 *
 * @param mode - Output mode handed to the snapshot formatter.
 * @param snapshot - Snapshot to format; left untouched.
 * @param nominalUrl - Forwarded to the snapshot formatter.
 * @param urlValidMs - Forwarded to the snapshot formatter.
 * @param scrappingOptions - Supplies pre-loaded bodies and the options for the fallback fetch.
 * @returns Whatever `snapshotFormatter.formatSnapshot` produces for the cloned snapshot.
 */
async formatSnapshotWithPDFSideLoad(mode: string, snapshot: PageSnapshot, nominalUrl?: URL, urlValidMs?: number, scrappingOptions?: ScrappingOptions) {
    const workingCopy = _.cloneDeep(snapshot);
    const render = () => this.snapshotFormatter.formatSnapshot(mode, workingCopy, nominalUrl, urlValidMs);

    const pdfList = workingCopy.pdfs;
    if (!pdfList?.length) {
        return render();
    }

    const remotePdf = pdfList[0];
    if (!remotePdf.startsWith('http')) {
        return render();
    }

    // Prefer a body that was already captured for this exact URL.
    const preloaded = scrappingOptions?.sideLoad?.impersonate[remotePdf];
    if (preloaded?.body) {
        pdfList[0] = pathToFileURL(await preloaded.body.filePath).href;

        return render();
    }

    // Nothing pre-loaded: fetch the PDF, keeping the remote URL if no file comes back.
    const fetched = await this.curlControl.sideLoad(new URL(remotePdf), scrappingOptions);
    if (fetched.file) {
        pdfList[0] = pathToFileURL(await fetched.file.filePath).href;
    }

    return render();
}
1017
 
1018
  async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
 
1109
  return;
1110
  }
1111
 
1112
/**
 * Decides whether a snapshot is too thin to be accepted as-is.
 *
 * Checks run in this order:
 *  1. any PDF link present  -> good enough (resolves `false`)
 *  2. no title              -> not good enough (resolves `true`)
 *  3. parsed content exists -> good enough
 *  4. no HTML to inspect    -> good enough
 *  5. otherwise the HTML is analyzed and fewer than 200 tokens counts as not good enough.
 *
 * @param snapshot - Snapshot to evaluate.
 * @returns `true` when the snapshot should be treated as insufficient.
 */
async snapshotNotGoodEnough(snapshot: PageSnapshot) {
    if (snapshot.pdfs?.length) {
        return false;
    }
    if (!snapshot.title) {
        return true;
    }
    if (snapshot.parsed?.content || !snapshot.html) {
        return false;
    }

    const { tokens } = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html);

    return tokens < 200;
}
1131
+
1132
  getDomainProfileUrlDigest(url: URL) {
1133
  const pathname = url.pathname;
1134
  const pathVec = pathname.split('/');
 
1143
  path: finalPath,
1144
  };
1145
  }
1146
+
1147
/**
 * Side-loads a URL through a freshly allocated proxy.
 *
 * Retry policy passed to `retryWith` (with a count of 3): a `ServiceBadAttemptError`
 * asks for another attempt, any other `ApplicationError` stops retrying, and every
 * other error yields `undefined`, leaving the decision to the decorator.
 *
 * When the caller asked for an allocated proxy (`opts.allocProxy`), the proxy that
 * was used is written back to `opts.proxyUrl` unless one is already set.
 *
 * @param url - Target to side-load.
 * @param opts - Scrapping options; may be mutated as described above.
 * @returns The side-load result with the allocated proxy attached as `proxy`.
 */
@retryWith((err) => {
    if (err instanceof ServiceBadAttemptError) {
        // Keep trying
        return true;
    }

    // An ApplicationError ends the retries; anything else is left undecided.
    return err instanceof ApplicationError ? false : undefined;
}, 3)
async sideLoadWithAllocatedProxy(url: URL, opts?: ExtraScrappingOptions) {
    const allocated = await this.proxyProvider.alloc(opts?.allocProxy);
    const proxyHref = allocated.href;

    const loaded = await this.curlControl.sideLoad(url, { ...opts, proxyUrl: proxyHref });

    if (opts?.allocProxy) {
        opts.proxyUrl ??= proxyHref;
    }

    return { ...loaded, proxy: allocated };
}
1171
  }
{backend/functions/src/cloud-functions → src/api}/searcher-serper.ts RENAMED
@@ -1,21 +1,25 @@
1
- import {
2
- assignTransferProtocolMeta, marshalErrorLike,
3
- RPCHost, RPCReflection,
4
- AssertionFailureError,
5
- objHashMd5B64Of,
6
- assignMeta,
7
- } from 'civkit';
8
  import { singleton } from 'tsyringe';
9
- import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, Param, RPCReflect } from '../shared';
10
- import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
 
 
 
11
  import _ from 'lodash';
12
- import { Request, Response } from 'express';
13
- import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
 
14
  import { CrawlerHost, ExtraScrappingOptions } from './crawler';
15
  import { SerperSearchResult } from '../db/searched';
16
- import { CrawlerOptions } from '../dto/scrapping-options';
17
  import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
18
  import { GoogleSearchExplicitOperatorsDto, SerperSearchService } from '../services/serper-search';
 
 
 
 
 
 
 
19
  import { SerperSearchQueryParams, SerperSearchResponse, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
20
 
21
  const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES);
@@ -33,9 +37,9 @@ export class SearcherHost extends RPCHost {
33
  targetResultCount = 5;
34
 
35
  constructor(
36
- protected globalLogger: Logger,
37
  protected rateLimitControl: RateLimitControl,
38
- protected threadLocal: AsyncContext,
39
  protected serperSearchService: SerperSearchService,
40
  protected crawler: CrawlerHost,
41
  protected snapshotFormatter: SnapshotFormatter,
@@ -49,39 +53,30 @@ export class SearcherHost extends RPCHost {
49
  this.emit('ready');
50
  }
51
 
52
- @CloudHTTPv2({
53
- name: 'search2',
54
- runtime: {
55
- cpu: 4,
56
- memory: '4GiB',
57
- timeoutSeconds: 300,
58
- concurrency: 4,
59
  },
60
- tags: ['Searcher'],
61
- httpMethod: ['get', 'post'],
62
  returnType: [String, OutputServerEventStream],
63
- exposeRoot: true,
64
  })
65
- @CloudHTTPv2({
66
- runtime: {
67
- cpu: 4,
68
- memory: '16GiB',
69
- timeoutSeconds: 300,
70
- concurrency: 4,
71
- maxInstances: 200,
72
- minInstances: 1,
73
  },
74
- tags: ['Searcher'],
75
- httpMethod: ['get', 'post'],
76
- returnType: [String, OutputServerEventStream],
77
- exposeRoot: true,
78
  })
79
  async search(
80
  @RPCReflect() rpcReflect: RPCReflection,
81
- @Ctx() ctx: {
82
- req: Request,
83
- res: Response,
84
- },
85
  auth: JinaEmbeddingsAuthDTO,
86
  crawlerOptions: CrawlerOptions,
87
  searchExplicitOperators: GoogleSearchExplicitOperatorsDto,
@@ -102,19 +97,17 @@ export class SearcherHost extends RPCHost {
102
 
103
  const uid = await auth.solveUID();
104
  // Return content by default
105
- const respondWith = ctx.req.get('X-Respond-With') ?? 'content';
106
- const crawlWithoutContent = respondWith.includes('no-content');
107
- const withFavicon = ctx.req.get('X-With-Favicons') === 'true';
108
 
109
  let chargeAmount = 0;
110
- const noSlashPath = decodeURIComponent(ctx.req.path).slice(1);
111
  if (!noSlashPath && !q) {
112
- const latestUser = uid ? await auth.assertUser() : undefined;
113
- const index = this.crawler.getIndex(latestUser);
114
  if (!uid) {
115
  index.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';
116
  }
117
- if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
118
 
119
  return index;
120
  }
@@ -189,7 +182,7 @@ export class SearcherHost extends RPCHost {
189
  chargeAmount = 10000;
190
  }
191
  this.assignTokenUsage(lastScrapped, chargeAmount, crawlWithoutContent);
192
- if ((!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) || count === 0) {
193
  return lastScrapped;
194
  }
195
  return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
@@ -201,7 +194,7 @@ export class SearcherHost extends RPCHost {
201
  withFavicon
202
  );
203
 
204
- if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
205
  const sseStream = new OutputServerEventStream();
206
  rpcReflect.return(sseStream);
207
 
@@ -210,6 +203,9 @@ export class SearcherHost extends RPCHost {
210
  if (!scrapped) {
211
  continue;
212
  }
 
 
 
213
 
214
  chargeAmount = this.assignChargeAmount(scrapped);
215
  sseStream.write({
@@ -233,7 +229,7 @@ export class SearcherHost extends RPCHost {
233
  }
234
 
235
  let earlyReturn = false;
236
- if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
237
  let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
238
  const setEarlyReturnTimer = () => {
239
  if (earlyReturnTimer) {
@@ -251,6 +247,9 @@ export class SearcherHost extends RPCHost {
251
 
252
  for await (const scrapped of it) {
253
  lastScrapped = scrapped;
 
 
 
254
  if (_.some(scrapped, (x) => this.pageQualified(x))) {
255
  setEarlyReturnTimer();
256
  }
@@ -299,7 +298,9 @@ export class SearcherHost extends RPCHost {
299
 
300
  for await (const scrapped of it) {
301
  lastScrapped = scrapped;
302
-
 
 
303
  if (_.some(scrapped, (x) => this.pageQualified(x))) {
304
  setEarlyReturnTimer();
305
  }
@@ -367,8 +368,8 @@ export class SearcherHost extends RPCHost {
367
  const dataItems = [
368
  { key: 'title', label: 'Title' },
369
  { key: 'url', label: 'URL Source' },
370
- { key: 'description', label: 'Description'},
371
- ]
372
 
373
  if (withContent) {
374
  result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : '';
@@ -386,7 +387,7 @@ export class SearcherHost extends RPCHost {
386
  result.toString = function () {
387
  const self = this as any;
388
  return dataItems.map((x) => `[${index + 1}] ${x.label}: ${self[x.key]}`).join('\n') + '\n';
389
- }
390
  return result;
391
  }));
392
 
@@ -408,7 +409,6 @@ export class SearcherHost extends RPCHost {
408
  if (!searchResults) {
409
  return;
410
  }
411
-
412
  const urls = searchResults.map((x) => new URL(x.link));
413
  const snapshotMap = new WeakMap();
414
  for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
@@ -427,7 +427,7 @@ export class SearcherHost extends RPCHost {
427
  if (snapshotMap.has(x)) {
428
  return snapshotMap.get(x);
429
  }
430
- return this.snapshotFormatter.formatSnapshot(mode, x, urls[i]).then((r) => {
431
  r.title ??= upstreamSearchResult.title;
432
  r.description = upstreamSearchResult.snippet;
433
  snapshotMap.set(x, r);
 
 
 
 
 
 
 
 
1
  import { singleton } from 'tsyringe';
2
+ import {
3
+ assignTransferProtocolMeta, RPCHost, RPCReflection, AssertionFailureError, assignMeta, RawString,
4
+ } from 'civkit/civ-rpc';
5
+ import { marshalErrorLike } from 'civkit/lang';
6
+ import { objHashMd5B64Of } from 'civkit/hash';
7
  import _ from 'lodash';
8
+
9
+ import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
10
+
11
  import { CrawlerHost, ExtraScrappingOptions } from './crawler';
12
  import { SerperSearchResult } from '../db/searched';
13
+ import { CrawlerOptions } from '../dto/crawler-options';
14
  import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
15
  import { GoogleSearchExplicitOperatorsDto, SerperSearchService } from '../services/serper-search';
16
+
17
+ import { GlobalLogger } from '../services/logger';
18
+ import { AsyncLocalContext } from '../services/async-context';
19
+ import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
20
+ import { OutputServerEventStream } from '../lib/transform-server-event-stream';
21
+ import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
22
+ import { InsufficientBalanceError } from '../services/errors';
23
  import { SerperSearchQueryParams, SerperSearchResponse, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
24
 
25
  const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES);
 
37
  targetResultCount = 5;
38
 
39
  constructor(
40
+ protected globalLogger: GlobalLogger,
41
  protected rateLimitControl: RateLimitControl,
42
+ protected threadLocal: AsyncLocalContext,
43
  protected serperSearchService: SerperSearchService,
44
  protected crawler: CrawlerHost,
45
  protected snapshotFormatter: SnapshotFormatter,
 
53
  this.emit('ready');
54
  }
55
 
56
+ @Method({
57
+ name: 'searchIndex',
58
+ ext: {
59
+ http: {
60
+ action: ['get', 'post'],
61
+ path: '/search'
62
+ }
63
  },
64
+ tags: ['search'],
 
65
  returnType: [String, OutputServerEventStream],
 
66
  })
67
+ @Method({
68
+ ext: {
69
+ http: {
70
+ action: ['get', 'post'],
71
+ path: '::q'
72
+ }
 
 
73
  },
74
+ tags: ['search'],
75
+ returnType: [String, OutputServerEventStream, RawString],
 
 
76
  })
77
  async search(
78
  @RPCReflect() rpcReflect: RPCReflection,
79
+ @Ctx() ctx: Context,
 
 
 
80
  auth: JinaEmbeddingsAuthDTO,
81
  crawlerOptions: CrawlerOptions,
82
  searchExplicitOperators: GoogleSearchExplicitOperatorsDto,
 
97
 
98
  const uid = await auth.solveUID();
99
  // Return content by default
100
+ const crawlWithoutContent = crawlerOptions.respondWith.includes('no-content');
101
+ const withFavicon = Boolean(ctx.get('X-With-Favicons'));
 
102
 
103
  let chargeAmount = 0;
104
+ const noSlashPath = decodeURIComponent(ctx.path).slice(1);
105
  if (!noSlashPath && !q) {
106
+ const index = await this.crawler.getIndex(auth);
 
107
  if (!uid) {
108
  index.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';
109
  }
110
+ if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
111
 
112
  return index;
113
  }
 
182
  chargeAmount = 10000;
183
  }
184
  this.assignTokenUsage(lastScrapped, chargeAmount, crawlWithoutContent);
185
+ if ((!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) || count === 0) {
186
  return lastScrapped;
187
  }
188
  return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
 
194
  withFavicon
195
  );
196
 
197
+ if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
198
  const sseStream = new OutputServerEventStream();
199
  rpcReflect.return(sseStream);
200
 
 
203
  if (!scrapped) {
204
  continue;
205
  }
206
+ if (rpcReflect.signal.aborted) {
207
+ break;
208
+ }
209
 
210
  chargeAmount = this.assignChargeAmount(scrapped);
211
  sseStream.write({
 
229
  }
230
 
231
  let earlyReturn = false;
232
+ if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
233
  let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
234
  const setEarlyReturnTimer = () => {
235
  if (earlyReturnTimer) {
 
247
 
248
  for await (const scrapped of it) {
249
  lastScrapped = scrapped;
250
+ if (rpcReflect.signal.aborted) {
251
+ break;
252
+ }
253
  if (_.some(scrapped, (x) => this.pageQualified(x))) {
254
  setEarlyReturnTimer();
255
  }
 
298
 
299
  for await (const scrapped of it) {
300
  lastScrapped = scrapped;
301
+ if (rpcReflect.signal.aborted) {
302
+ break;
303
+ }
304
  if (_.some(scrapped, (x) => this.pageQualified(x))) {
305
  setEarlyReturnTimer();
306
  }
 
368
  const dataItems = [
369
  { key: 'title', label: 'Title' },
370
  { key: 'url', label: 'URL Source' },
371
+ { key: 'description', label: 'Description' },
372
+ ];
373
 
374
  if (withContent) {
375
  result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : '';
 
387
  result.toString = function () {
388
  const self = this as any;
389
  return dataItems.map((x) => `[${index + 1}] ${x.label}: ${self[x.key]}`).join('\n') + '\n';
390
+ };
391
  return result;
392
  }));
393
 
 
409
  if (!searchResults) {
410
  return;
411
  }
 
412
  const urls = searchResults.map((x) => new URL(x.link));
413
  const snapshotMap = new WeakMap();
414
  for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
 
427
  if (snapshotMap.has(x)) {
428
  return snapshotMap.get(x);
429
  }
430
+ return this.crawler.formatSnapshotWithPDFSideLoad(mode, x, urls[i], undefined, options).then((r) => {
431
  r.title ??= upstreamSearchResult.title;
432
  r.description = upstreamSearchResult.snippet;
433
  snapshotMap.set(x, r);
{backend/functions/src/cloud-functions → src/api}/searcher.ts RENAMED
@@ -1,22 +1,30 @@
 
 
 
1
  import {
2
- assignTransferProtocolMeta, marshalErrorLike,
3
- RPCHost, RPCReflection,
4
  AssertionFailureError,
5
- objHashMd5B64Of,
6
- } from 'civkit';
7
- import { singleton } from 'tsyringe';
8
- import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, Param, RPCReflect } from '../shared';
 
9
  import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
10
- import _ from 'lodash';
11
- import { Request, Response } from 'express';
12
- import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
13
- import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search';
14
- import { CrawlerHost, ExtraScrappingOptions } from './crawler';
15
  import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
 
 
16
  import { SearchResult } from '../db/searched';
17
- import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
18
- import { CrawlerOptions } from '../dto/scrapping-options';
 
 
19
  import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
 
 
 
 
 
20
 
21
 
22
  @singleton()
@@ -32,9 +40,9 @@ export class SearcherHost extends RPCHost {
32
  targetResultCount = 5;
33
 
34
  constructor(
35
- protected globalLogger: Logger,
36
  protected rateLimitControl: RateLimitControl,
37
- protected threadLocal: AsyncContext,
38
  protected braveSearchService: BraveSearchService,
39
  protected crawler: CrawlerHost,
40
  protected snapshotFormatter: SnapshotFormatter,
@@ -48,39 +56,30 @@ export class SearcherHost extends RPCHost {
48
  this.emit('ready');
49
  }
50
 
51
- @CloudHTTPv2({
52
- name: 'search2',
53
- runtime: {
54
- cpu: 4,
55
- memory: '4GiB',
56
- timeoutSeconds: 300,
57
- concurrency: 4,
58
  },
59
- tags: ['Searcher'],
60
- httpMethod: ['get', 'post'],
61
  returnType: [String, OutputServerEventStream],
62
- exposeRoot: true,
63
  })
64
- @CloudHTTPv2({
65
- runtime: {
66
- cpu: 4,
67
- memory: '16GiB',
68
- timeoutSeconds: 300,
69
- concurrency: 4,
70
- maxInstances: 200,
71
- minInstances: 1,
72
  },
73
- tags: ['Searcher'],
74
- httpMethod: ['get', 'post'],
75
- returnType: [String, OutputServerEventStream],
76
- exposeRoot: true,
77
  })
78
  async search(
79
  @RPCReflect() rpcReflect: RPCReflection,
80
- @Ctx() ctx: {
81
- req: Request,
82
- res: Response,
83
- },
84
  auth: JinaEmbeddingsAuthDTO,
85
  @Param('count', { default: 5, validate: (v) => v >= 0 && v <= 10 })
86
  count: number,
@@ -90,14 +89,13 @@ export class SearcherHost extends RPCHost {
90
  ) {
91
  const uid = await auth.solveUID();
92
  let chargeAmount = 0;
93
- const noSlashPath = decodeURIComponent(ctx.req.path).slice(1);
94
  if (!noSlashPath && !q) {
95
- const latestUser = uid ? await auth.assertUser() : undefined;
96
- const index = this.crawler.getIndex(latestUser);
97
  if (!uid) {
98
  index.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';
99
  }
100
- if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
101
 
102
  return index;
103
  }
@@ -160,7 +158,7 @@ export class SearcherHost extends RPCHost {
160
  count,
161
  );
162
 
163
- if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
164
  const sseStream = new OutputServerEventStream();
165
  rpcReflect.return(sseStream);
166
 
@@ -193,7 +191,7 @@ export class SearcherHost extends RPCHost {
193
 
194
  let lastScrapped: any[] | undefined;
195
  let earlyReturn = false;
196
- if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
197
  let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
198
  const setEarlyReturnTimer = () => {
199
  if (earlyReturnTimer) {
 
1
+ import { singleton } from 'tsyringe';
2
+ import _ from 'lodash';
3
+
4
  import {
5
+ assignTransferProtocolMeta, RPCHost, RPCReflection,
 
6
  AssertionFailureError,
7
+ RawString,
8
+ } from 'civkit/civ-rpc';
9
+ import { marshalErrorLike } from 'civkit/lang';
10
+ import { objHashMd5B64Of } from 'civkit/hash';
11
+
12
  import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
13
+ import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
 
 
 
 
14
  import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
15
+
16
+ import { CrawlerHost, ExtraScrappingOptions } from './crawler';
17
  import { SearchResult } from '../db/searched';
18
+ import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
19
+ import { CrawlerOptions } from '../dto/crawler-options';
20
+ import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search';
21
+
22
  import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
23
+ import { GlobalLogger } from '../services/logger';
24
+ import { AsyncLocalContext } from '../services/async-context';
25
+ import { OutputServerEventStream } from '../lib/transform-server-event-stream';
26
+ import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
27
+ import { InsufficientBalanceError } from '../services/errors';
28
 
29
 
30
  @singleton()
 
40
  targetResultCount = 5;
41
 
42
  constructor(
43
+ protected globalLogger: GlobalLogger,
44
  protected rateLimitControl: RateLimitControl,
45
+ protected threadLocal: AsyncLocalContext,
46
  protected braveSearchService: BraveSearchService,
47
  protected crawler: CrawlerHost,
48
  protected snapshotFormatter: SnapshotFormatter,
 
56
  this.emit('ready');
57
  }
58
 
59
+ @Method({
60
+ name: 'searchIndex',
61
+ ext: {
62
+ http: {
63
+ action: ['get', 'post'],
64
+ path: '/search'
65
+ }
66
  },
67
+ tags: ['search'],
 
68
  returnType: [String, OutputServerEventStream],
 
69
  })
70
+ @Method({
71
+ ext: {
72
+ http: {
73
+ action: ['get', 'post'],
74
+ path: '::q'
75
+ }
 
 
76
  },
77
+ tags: ['search'],
78
+ returnType: [String, OutputServerEventStream, RawString],
 
 
79
  })
80
  async search(
81
  @RPCReflect() rpcReflect: RPCReflection,
82
+ @Ctx() ctx: Context,
 
 
 
83
  auth: JinaEmbeddingsAuthDTO,
84
  @Param('count', { default: 5, validate: (v) => v >= 0 && v <= 10 })
85
  count: number,
 
89
  ) {
90
  const uid = await auth.solveUID();
91
  let chargeAmount = 0;
92
+ const noSlashPath = decodeURIComponent(ctx.path).slice(1);
93
  if (!noSlashPath && !q) {
94
+ const index = await this.crawler.getIndex(auth);
 
95
  if (!uid) {
96
  index.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';
97
  }
98
+ if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
99
 
100
  return index;
101
  }
 
158
  count,
159
  );
160
 
161
+ if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
162
  const sseStream = new OutputServerEventStream();
163
  rpcReflect.return(sseStream);
164
 
 
191
 
192
  let lastScrapped: any[] | undefined;
193
  let earlyReturn = false;
194
+ if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
195
  let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
196
  const setEarlyReturnTimer = () => {
197
  if (earlyReturnTimer) {
{backend/functions/src → src}/cloud-functions/adaptive-crawler.ts RENAMED
@@ -14,7 +14,7 @@ import robotsParser from 'robots-parser';
14
  import { DOMParser } from '@xmldom/xmldom';
15
 
16
  import { AdaptiveCrawlerOptions } from '../dto/adaptive-crawler-options';
17
- import { CrawlerOptions } from '../dto/scrapping-options';
18
  import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
19
  import { AdaptiveCrawlTask, AdaptiveCrawlTaskStatus } from '../db/adaptive-crawl-task';
20
  import { getFunctions } from 'firebase-admin/functions';
 
14
  import { DOMParser } from '@xmldom/xmldom';
15
 
16
  import { AdaptiveCrawlerOptions } from '../dto/adaptive-crawler-options';
17
+ import { CrawlerOptions } from '../dto/crawler-options';
18
  import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
19
  import { AdaptiveCrawlTask, AdaptiveCrawlTaskStatus } from '../db/adaptive-crawl-task';
20
  import { getFunctions } from 'firebase-admin/functions';
{backend/functions/src → src}/cloud-functions/data-crunching.ts RENAMED
@@ -9,7 +9,7 @@ import {
9
  FirebaseStorageBucketControl, Logger, Param, TempFileManager
10
  } from '../shared';
11
  import _ from 'lodash';
12
- import { CrawlerHost } from './crawler';
13
 
14
  import { Crawled } from '../db/crawled';
15
  import dayjs from 'dayjs';
 
9
  FirebaseStorageBucketControl, Logger, Param, TempFileManager
10
  } from '../shared';
11
  import _ from 'lodash';
12
+ import { CrawlerHost } from '../api/crawler';
13
 
14
  import { Crawled } from '../db/crawled';
15
  import dayjs from 'dayjs';
{backend/functions/src → src}/db/adaptive-crawl-task.ts RENAMED
File without changes
{backend/functions/src → src}/db/crawled.ts RENAMED
File without changes
{backend/functions/src → src}/db/domain-blockade.ts RENAMED
File without changes
{backend/functions/src → src}/db/domain-profile.ts RENAMED
@@ -1,6 +1,6 @@
1
  import { Also, Prop } from 'civkit';
2
  import { FirestoreRecord } from '../shared/lib/firestore';
3
- import { ENGINE_TYPE } from '../dto/scrapping-options';
4
 
5
  @Also({
6
  dictOf: Object
 
1
  import { Also, Prop } from 'civkit';
2
  import { FirestoreRecord } from '../shared/lib/firestore';
3
+ import { ENGINE_TYPE } from '../dto/crawler-options';
4
 
5
  @Also({
6
  dictOf: Object
{backend/functions/src → src}/db/img-alt.ts RENAMED
File without changes
{backend/functions/src → src}/db/pdf.ts RENAMED
File without changes
{backend/functions/src → src}/db/searched.ts RENAMED
File without changes
{backend/functions/src → src}/dto/adaptive-crawler-options.ts RENAMED
File without changes
backend/functions/src/dto/scrapping-options.ts → src/dto/crawler-options.ts RENAMED
@@ -1,6 +1,6 @@
1
  import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
2
- import type { Request, Response } from 'express';
3
  import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
 
4
 
5
  export enum CONTENT_FORMAT {
6
  CONTENT = 'content',
@@ -19,6 +19,7 @@ export enum ENGINE_TYPE {
19
  DIRECT = 'direct',
20
  VLM = 'vlm',
21
  READER_LM = 'readerlm-v2',
 
22
  }
23
 
24
  const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
@@ -125,6 +126,11 @@ class Viewport extends AutoCastable {
125
  in: 'header',
126
  schema: { type: 'string' }
127
  },
 
 
 
 
 
128
  'X-Set-Cookie': {
129
  description: `Sets cookie(s) to the headless browser for your request. \n\n` +
130
  `Syntax is the same with standard Set-Cookie`,
@@ -297,6 +303,9 @@ export class CrawlerOptions extends AutoCastable {
297
  @Prop()
298
  proxyUrl?: string;
299
 
 
 
 
300
  @Prop()
301
  userAgent?: string;
302
 
@@ -338,15 +347,18 @@ export class CrawlerOptions extends AutoCastable {
338
  @Prop()
339
  jsonSchema?: object;
340
 
 
 
 
 
 
 
341
  static override from(input: any) {
342
  const instance = super.from(input) as CrawlerOptions;
343
- const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
344
- req: Request,
345
- res: Response,
346
- } | undefined;
347
 
348
- const customMode = ctx?.req.get('x-respond-with') || ctx?.req.get('x-return-format');
349
- if (customMode !== undefined) {
350
  instance.respondWith = customMode;
351
  }
352
  if (instance.respondWith) {
@@ -361,74 +373,74 @@ export class CrawlerOptions extends AutoCastable {
361
  }
362
  }
363
 
364
- const locale = ctx?.req.get('x-locale');
365
- if (locale !== undefined) {
366
  instance.locale = locale;
367
  }
368
 
369
- const referer = ctx?.req.get('x-referer');
370
- if (referer !== undefined) {
371
  instance.referer = referer;
372
  }
373
 
374
- const withGeneratedAlt = ctx?.req.get('x-with-generated-alt');
375
- if (withGeneratedAlt !== undefined) {
376
  instance.withGeneratedAlt = Boolean(withGeneratedAlt);
377
  }
378
- const withLinksSummary = ctx?.req.get('x-with-links-summary');
379
- if (withLinksSummary !== undefined) {
380
  if (withLinksSummary === 'all') {
381
  instance.withLinksSummary = withLinksSummary;
382
  } else {
383
  instance.withLinksSummary = Boolean(withLinksSummary);
384
  }
385
  }
386
- const withImagesSummary = ctx?.req.get('x-with-images-summary');
387
- if (withImagesSummary !== undefined) {
388
  instance.withImagesSummary = Boolean(withImagesSummary);
389
  }
390
- const retainImages = ctx?.req.get('x-retain-images');
391
  if (retainImages && IMAGE_RETENTION_MODE_VALUES.has(retainImages)) {
392
  instance.retainImages = retainImages as any;
393
  }
394
  if (instance.withGeneratedAlt) {
395
  instance.retainImages = 'all_p';
396
  }
397
- const noCache = ctx?.req.get('x-no-cache');
398
- if (noCache !== undefined) {
399
  instance.noCache = Boolean(noCache);
400
  }
401
  if (instance.noCache && instance.cacheTolerance === undefined) {
402
  instance.cacheTolerance = 0;
403
  }
404
- let cacheTolerance = parseInt(ctx?.req.get('x-cache-tolerance') || '');
405
  if (!isNaN(cacheTolerance)) {
406
  instance.cacheTolerance = cacheTolerance;
407
  }
408
 
409
- const noGfm = ctx?.req.get('x-no-gfm');
410
  if (noGfm) {
411
  instance.noGfm = noGfm === 'table' ? noGfm : Boolean(noGfm);
412
  }
413
 
414
- let timeoutSeconds = parseInt(ctx?.req.get('x-timeout') || '');
415
  if (!isNaN(timeoutSeconds) && timeoutSeconds > 0) {
416
  instance.timeout = timeoutSeconds <= 180 ? timeoutSeconds : 180;
417
- } else if (ctx?.req.get('x-timeout')) {
418
  instance.timeout = null;
419
  }
420
 
421
- const removeSelector = ctx?.req.get('x-remove-selector')?.split(', ');
422
- instance.removeSelector ??= removeSelector;
423
- const targetSelector = ctx?.req.get('x-target-selector')?.split(', ');
424
- instance.targetSelector ??= targetSelector;
425
- const waitForSelector = ctx?.req.get('x-wait-for-selector')?.split(', ');
426
- instance.waitForSelector ??= waitForSelector || instance.targetSelector;
427
  instance.targetSelector = filterSelector(instance.targetSelector);
428
- const overrideUserAgent = ctx?.req.get('x-user-agent');
429
  instance.userAgent ??= overrideUserAgent;
430
 
431
- const engine = ctx?.req.get('x-engine');
432
  if (engine) {
433
  instance.engine = engine;
434
  }
@@ -443,18 +455,18 @@ export class CrawlerOptions extends AutoCastable {
443
  instance.respondWith = CONTENT_FORMAT.READER_LM;
444
  }
445
 
446
- const keepImgDataUrl = ctx?.req.get('x-keep-img-data-url');
447
- if (keepImgDataUrl !== undefined) {
448
  instance.keepImgDataUrl = Boolean(keepImgDataUrl);
449
  }
450
- const withIframe = ctx?.req.get('x-with-iframe');
451
- if (withIframe !== undefined) {
452
  instance.withIframe = withIframe.toLowerCase() === 'quoted' ? 'quoted' : Boolean(withIframe);
453
  }
454
  if (instance.withIframe) {
455
  instance.timeout ??= null;
456
  }
457
- const withShadowDom = ctx?.req.get('x-with-shadow-dom');
458
  if (withShadowDom) {
459
  instance.withShadowDom = Boolean(withShadowDom);
460
  }
@@ -463,7 +475,7 @@ export class CrawlerOptions extends AutoCastable {
463
  }
464
 
465
  const cookies: Cookie[] = [];
466
- const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
467
  if (Array.isArray(setCookieHeaders)) {
468
  for (const setCookie of setCookieHeaders) {
469
  cookies.push({
@@ -477,21 +489,24 @@ export class CrawlerOptions extends AutoCastable {
477
  }
478
  instance.setCookies = cookies;
479
 
480
- const proxyUrl = ctx?.req.get('x-proxy-url');
481
- instance.proxyUrl ??= proxyUrl;
 
 
 
 
482
 
483
- if (instance.cacheTolerance) {
484
- instance.cacheTolerance = instance.cacheTolerance * 1000;
485
- }
486
-
487
- const tokenBudget = ctx?.req.get('x-token-budget') || undefined;
488
  instance.tokenBudget ??= parseInt(tokenBudget || '') || undefined;
489
 
490
- const baseMode = ctx?.req.get('x-base') || undefined;
491
  if (baseMode) {
492
  instance.base = baseMode as any;
493
  }
494
 
 
 
 
495
  if (instance.cacheTolerance) {
496
  instance.cacheTolerance = instance.cacheTolerance * 1000;
497
  }
 
1
  import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
 
2
  import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
3
+ import { Context } from '../services/registry';
4
 
5
  export enum CONTENT_FORMAT {
6
  CONTENT = 'content',
 
19
  DIRECT = 'direct',
20
  VLM = 'vlm',
21
  READER_LM = 'readerlm-v2',
22
+ CF_BROWSER_RENDERING = 'cf-browser-rendering',
23
  }
24
 
25
  const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
 
126
  in: 'header',
127
  schema: { type: 'string' }
128
  },
129
+ 'X-Proxy': {
130
+ description: `Use a proxy server provided by Jina AI.\n\nOptionally specify two-letter country code.`,
131
+ in: 'header',
132
+ schema: { type: 'string' }
133
+ },
134
  'X-Set-Cookie': {
135
  description: `Sets cookie(s) to the headless browser for your request. \n\n` +
136
  `Syntax is the same with standard Set-Cookie`,
 
303
  @Prop()
304
  proxyUrl?: string;
305
 
306
+ @Prop()
307
+ proxy?: string;
308
+
309
  @Prop()
310
  userAgent?: string;
311
 
 
347
  @Prop()
348
  jsonSchema?: object;
349
 
350
+ @Prop()
351
+ robotsTxt?: string;
352
+
353
+ @Prop()
354
+ doNotTrack?: number | null;
355
+
356
  static override from(input: any) {
357
  const instance = super.from(input) as CrawlerOptions;
358
+ const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
 
 
 
359
 
360
+ const customMode = ctx?.get('x-respond-with') || ctx?.get('x-return-format');
361
+ if (customMode) {
362
  instance.respondWith = customMode;
363
  }
364
  if (instance.respondWith) {
 
373
  }
374
  }
375
 
376
+ const locale = ctx?.get('x-locale');
377
+ if (locale) {
378
  instance.locale = locale;
379
  }
380
 
381
+ const referer = ctx?.get('x-referer');
382
+ if (referer) {
383
  instance.referer = referer;
384
  }
385
 
386
+ const withGeneratedAlt = ctx?.get('x-with-generated-alt');
387
+ if (withGeneratedAlt) {
388
  instance.withGeneratedAlt = Boolean(withGeneratedAlt);
389
  }
390
+ const withLinksSummary = ctx?.get('x-with-links-summary');
391
+ if (withLinksSummary) {
392
  if (withLinksSummary === 'all') {
393
  instance.withLinksSummary = withLinksSummary;
394
  } else {
395
  instance.withLinksSummary = Boolean(withLinksSummary);
396
  }
397
  }
398
+ const withImagesSummary = ctx?.get('x-with-images-summary');
399
+ if (withImagesSummary) {
400
  instance.withImagesSummary = Boolean(withImagesSummary);
401
  }
402
+ const retainImages = ctx?.get('x-retain-images');
403
  if (retainImages && IMAGE_RETENTION_MODE_VALUES.has(retainImages)) {
404
  instance.retainImages = retainImages as any;
405
  }
406
  if (instance.withGeneratedAlt) {
407
  instance.retainImages = 'all_p';
408
  }
409
+ const noCache = ctx?.get('x-no-cache');
410
+ if (noCache) {
411
  instance.noCache = Boolean(noCache);
412
  }
413
  if (instance.noCache && instance.cacheTolerance === undefined) {
414
  instance.cacheTolerance = 0;
415
  }
416
+ let cacheTolerance = parseInt(ctx?.get('x-cache-tolerance') || '');
417
  if (!isNaN(cacheTolerance)) {
418
  instance.cacheTolerance = cacheTolerance;
419
  }
420
 
421
+ const noGfm = ctx?.get('x-no-gfm');
422
  if (noGfm) {
423
  instance.noGfm = noGfm === 'table' ? noGfm : Boolean(noGfm);
424
  }
425
 
426
+ let timeoutSeconds = parseInt(ctx?.get('x-timeout') || '');
427
  if (!isNaN(timeoutSeconds) && timeoutSeconds > 0) {
428
  instance.timeout = timeoutSeconds <= 180 ? timeoutSeconds : 180;
429
+ } else if (ctx?.get('x-timeout')) {
430
  instance.timeout = null;
431
  }
432
 
433
+ const removeSelector = ctx?.get('x-remove-selector')?.split(', ').filter(Boolean);
434
+ instance.removeSelector ??= removeSelector?.length ? removeSelector : undefined;
435
+ const targetSelector = ctx?.get('x-target-selector')?.split(', ').filter(Boolean);
436
+ instance.targetSelector ??= targetSelector?.length ? targetSelector : undefined;
437
+ const waitForSelector = ctx?.get('x-wait-for-selector')?.split(', ').filter(Boolean);
438
+ instance.waitForSelector ??= (waitForSelector?.length ? waitForSelector : undefined) || instance.targetSelector;
439
  instance.targetSelector = filterSelector(instance.targetSelector);
440
+ const overrideUserAgent = ctx?.get('x-user-agent') || undefined;
441
  instance.userAgent ??= overrideUserAgent;
442
 
443
+ const engine = ctx?.get('x-engine');
444
  if (engine) {
445
  instance.engine = engine;
446
  }
 
455
  instance.respondWith = CONTENT_FORMAT.READER_LM;
456
  }
457
 
458
+ const keepImgDataUrl = ctx?.get('x-keep-img-data-url');
459
+ if (keepImgDataUrl) {
460
  instance.keepImgDataUrl = Boolean(keepImgDataUrl);
461
  }
462
+ const withIframe = ctx?.get('x-with-iframe');
463
+ if (withIframe) {
464
  instance.withIframe = withIframe.toLowerCase() === 'quoted' ? 'quoted' : Boolean(withIframe);
465
  }
466
  if (instance.withIframe) {
467
  instance.timeout ??= null;
468
  }
469
+ const withShadowDom = ctx?.get('x-with-shadow-dom');
470
  if (withShadowDom) {
471
  instance.withShadowDom = Boolean(withShadowDom);
472
  }
 
475
  }
476
 
477
  const cookies: Cookie[] = [];
478
+ const setCookieHeaders = (ctx?.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[])).filter(Boolean);
479
  if (Array.isArray(setCookieHeaders)) {
480
  for (const setCookie of setCookieHeaders) {
481
  cookies.push({
 
489
  }
490
  instance.setCookies = cookies;
491
 
492
+ const proxyUrl = ctx?.get('x-proxy-url');
493
+ instance.proxyUrl ??= proxyUrl || undefined;
494
+ const proxy = ctx?.get('x-proxy');
495
+ instance.proxy ??= proxy || undefined;
496
+ const robotsTxt = ctx?.get('x-robots-txt');
497
+ instance.robotsTxt ??= robotsTxt || undefined;
498
 
499
+ const tokenBudget = ctx?.get('x-token-budget');
 
 
 
 
500
  instance.tokenBudget ??= parseInt(tokenBudget || '') || undefined;
501
 
502
+ const baseMode = ctx?.get('x-base');
503
  if (baseMode) {
504
  instance.base = baseMode as any;
505
  }
506
 
507
+ const dnt = ctx?.get('dnt');
508
+ instance.doNotTrack ??= (parseInt(dnt || '') || null);
509
+
510
  if (instance.cacheTolerance) {
511
  instance.cacheTolerance = instance.cacheTolerance * 1000;
512
  }
src/dto/jina-embeddings-auth.ts ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import _ from 'lodash';
2
+ import {
3
+ Also, AuthenticationFailedError, AuthenticationRequiredError,
4
+ DownstreamServiceFailureError, RPC_CALL_ENVIRONMENT,
5
+ AutoCastable,
6
+ } from 'civkit/civ-rpc';
7
+ import { htmlEscape } from 'civkit/escape';
8
+ import { marshalErrorLike } from 'civkit/lang';
9
+
10
+ import type { Context } from 'koa';
11
+
12
+ import logger from '../services/logger';
13
+ import { InjectProperty } from '../services/registry';
14
+ import { AsyncLocalContext } from '../services/async-context';
15
+
16
+ import envConfig from '../shared/services/secrets';
17
+ import { JinaEmbeddingsDashboardHTTP } from '../shared/3rd-party/jina-embeddings';
18
+ import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
19
+
20
+
21
// Logger scoped to this DTO so its warnings can be told apart in the logs.
const authDtoLogger = logger.child({ service: 'JinaAuthDTO' });

// A single dashboard client instance shared by every DTO created in this process.
const THE_VERY_SAME_JINA_EMBEDDINGS_CLIENT = new JinaEmbeddingsDashboardHTTP(envConfig.JINA_EMBEDDINGS_DASHBOARD_API_KEY);

/**
 * Authentication DTO built from an RPC input.
 *
 * The bearer token is taken from the `authorization` request header, falling
 * back to the `_token` input field. The account behind the token is resolved
 * lazily through {@link JinaEmbeddingsAuthDTO.getBrief}.
 */
@Also({
    openapi: {
        operation: {
            parameters: {
                'Authorization': {
                    description: htmlEscape`Jina Token for authentication.\n\n` +
                        htmlEscape`- Member of <JinaEmbeddingsAuthDTO>\n\n` +
                        `- Authorization: Bearer {YOUR_JINA_TOKEN}`
                    ,
                    in: 'header',
                    schema: {
                        anyOf: [
                            { type: 'string', format: 'token' }
                        ]
                    }
                }
            }
        }
    }
})
export class JinaEmbeddingsAuthDTO extends AutoCastable {
    uid?: string;
    bearerToken?: string;
    user?: JinaEmbeddingsTokenAccount;

    @InjectProperty(AsyncLocalContext)
    ctxMgr!: AsyncLocalContext;

    jinaEmbeddingsDashboard = THE_VERY_SAME_JINA_EMBEDDINGS_CLIENT;

    /**
     * Builds the DTO and extracts the bearer token.
     *
     * A header of the form `<scheme> <token>` yields `<token>`; a header
     * without a space is used verbatim. `input._token` is only consulted when
     * the header produced nothing.
     */
    static override from(input: any) {
        const dto = super.from(input) as JinaEmbeddingsAuthDTO;

        const ctx = input[RPC_CALL_ENVIRONMENT] as Context;
        const headerValue = ctx ? ctx.get('authorization') : undefined;
        if (headerValue) {
            dto.bearerToken = headerValue.split(' ')[1] || headerValue;
        }

        if (!dto.bearerToken && input._token) {
            dto.bearerToken = input._token;
        }

        return dto;
    }

    /**
     * Resolves the account that belongs to the bearer token.
     *
     * A stored account synced less than 180 seconds ago is returned directly
     * unless `ignoreCache` is truthy. Otherwise the token is validated against
     * the dashboard and the merged account is saved back.
     *
     * @param ignoreCache When truthy, always re-validate against the dashboard.
     * @throws AuthenticationRequiredError when no bearer token is present.
     * @throws AuthenticationFailedError when validation fails with status 401.
     * @throws DownstreamServiceFailureError when validation fails otherwise and
     *         no stored account is available as a fallback.
     */
    async getBrief(ignoreCache?: boolean | string) {
        if (!this.bearerToken) {
            throw new AuthenticationRequiredError({
                message: 'Jina API key is required to authenticate. Please get one from https://jina.ai'
            });
        }

        // Records the resolved account on this DTO and hands it back.
        const adopt = <T extends JinaEmbeddingsTokenAccount>(resolved: T): T => {
            this.user = resolved;
            this.uid = this.user?.user_id;

            return resolved;
        };

        let account;
        try {
            account = await JinaEmbeddingsTokenAccount.fromFirestore(this.bearerToken);
        } catch {
            // The store may reject arbitrary strings as a document id and throw;
            // that is treated the same as "nothing stored for this token".
        }

        const age = account?.lastSyncedAt ? Date.now() - account.lastSyncedAt.getTime() : Infinity;

        if (account && !ignoreCache && age < 180_000) {
            return adopt(account);
        }

        try {
            const validation = await this.jinaEmbeddingsDashboard.validateToken(this.bearerToken);
            const draftAccount = JinaEmbeddingsTokenAccount.from({
                ...account, ...validation.data, _id: this.bearerToken,
                lastSyncedAt: new Date()
            });
            await JinaEmbeddingsTokenAccount.save(draftAccount.degradeForFireStore(), undefined, { merge: true });

            return adopt(draftAccount);
        } catch (err: any) {
            authDtoLogger.warn(`Failed to get user brief: ${err}`, { err: marshalErrorLike(err) });

            if (err?.status === 401) {
                throw new AuthenticationFailedError({
                    message: 'Invalid API key, please get a new one from https://jina.ai'
                });
            }

            // Validation is unavailable: fall back to whatever was stored, even if stale.
            if (account) {
                return adopt(account);
            }

            throw new DownstreamServiceFailureError(`Failed to authenticate: ${err}`);
        }
    }

    /**
     * Reports token consumption to the dashboard.
     *
     * The in-memory balance is decremented up front and restored if the report
     * fails. After a successful report the stored balance is decremented too,
     * without waiting for that update. Report failures are logged, not thrown;
     * the returned promise then resolves to `undefined`.
     *
     * @param tokenCount Number of tokens consumed.
     * @param mdl Model name reported as `model_name`.
     * @param endpoint API endpoint reported as `api_endpoint`.
     */
    async reportUsage(tokenCount: number, mdl: string, endpoint: string = '/encode') {
        const user = await this.assertUser();
        const uid = user.user_id;
        user.wallet.total_balance -= tokenCount;

        const pendingReport = this.jinaEmbeddingsDashboard.reportUsage(this.bearerToken!, {
            model_name: mdl,
            api_endpoint: endpoint,
            consumer: {
                id: uid,
                user_id: uid,
            },
            usage: {
                total_tokens: tokenCount
            },
            labels: {
                model_name: mdl
            }
        });

        try {
            const reported = await pendingReport;

            // Fire and forget: a failed cache update is only worth a warning.
            JinaEmbeddingsTokenAccount.COLLECTION.doc(this.bearerToken!)
                .update({ 'wallet.total_balance': JinaEmbeddingsTokenAccount.OPS.increment(-tokenCount) })
                .catch((err) => {
                    authDtoLogger.warn(`Failed to update cache for ${uid}: ${err}`, { err: marshalErrorLike(err) });
                });

            return reported;
        } catch (err: any) {
            user.wallet.total_balance += tokenCount;
            authDtoLogger.warn(`Failed to report usage for ${uid}: ${err}`, { err: marshalErrorLike(err) });
        }
    }

    /**
     * Returns the user id, resolving it from the bearer token when needed, and
     * records it on the async context under `uid`.
     *
     * @returns The uid, or `undefined` when there is neither a uid nor a token.
     */
    async solveUID() {
        if (!this.uid) {
            if (!this.bearerToken) {
                return undefined;
            }
            await this.getBrief();
        }

        this.ctxMgr.set('uid', this.uid);

        return this.uid;
    }

    /**
     * Same as {@link solveUID} but refuses to return an empty uid.
     *
     * @throws AuthenticationRequiredError when no uid could be resolved.
     */
    async assertUID() {
        const uid = await this.solveUID();

        if (!uid) {
            throw new AuthenticationRequiredError('Authentication failed');
        }

        return uid;
    }

    /** Returns the already-resolved account, resolving it first if necessary. */
    async assertUser() {
        if (!this.user) {
            await this.getBrief();
        }

        return this.user!;
    }

    /**
     * Collects the effective custom rate limits of the resolved user for the
     * given tags.
     *
     * @returns The effective descriptors, or `undefined` when there are none.
     */
    getRateLimits(...tags: string[]) {
        const effective = tags
            .flatMap((tag) => this.user?.customRateLimits?.[tag] || [])
            .filter((desc) => desc.isEffective());

        return effective.length ? effective : undefined;
    }
}
{backend/functions/src β†’ src}/fetch.d.ts RENAMED
File without changes
{backend/functions/src β†’ src}/index.ts RENAMED
File without changes
src/lib/transform-server-event-stream.ts ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { TPM, parseJSONText } from 'civkit';
2
+ import { Transform, TransformCallback, TransformOptions } from 'stream';
3
+
4
/**
 * Transform stream that parses `text/event-stream` text into event objects.
 *
 * The writable side takes raw text or bytes; the readable side is in object
 * mode and yields one object per event block (a run of lines terminated by a
 * blank line). An event whose `data` parses to a JSON object or array carries
 * the parsed value instead of the raw string.
 */
export class InputServerEventStream extends Transform {
    /** Text received so far that is not yet terminated by a blank line. */
    cache: string[] = [];

    /**
     * Streaming decoder, so a multi-byte UTF-8 character split across two
     * chunks is reassembled instead of being decoded as two broken halves.
     */
    protected decoder = new TextDecoder('utf-8');

    constructor(options?: TransformOptions) {
        super({
            ...options,
            readableObjectMode: true
        });
    }

    /**
     * Parses every complete event block currently buffered and pushes the
     * resulting objects downstream. A trailing incomplete block stays buffered.
     */
    decodeRoutine() {
        if (!this.cache.length) {
            return;
        }

        const vecs = this.cache.join('').split(/\r?\n\r?\n/);
        this.cache.length = 0;
        // Whatever follows the last blank line is not terminated yet; keep it.
        const lastVec = vecs.pop();
        if (lastVec) {
            this.cache.push(lastVec);
        }

        for (const x of vecs) {
            const lines: string[] = x.split(/\r?\n/);

            const event: {
                event?: string;
                data?: string;
                id?: string;
                retry?: number;
            } = {};

            for (const l of lines) {
                const columnPos = l.indexOf(':');
                // No colon, or a leading colon (comment line): nothing to record.
                if (columnPos <= 0) {
                    continue;
                }
                const key = l.substring(0, columnPos);
                const rawValue = l.substring(columnPos + 1);
                // Only a single leading space is part of the field syntax.
                const value = rawValue.startsWith(' ') ? rawValue.slice(1) : rawValue;
                if (key === 'data') {
                    // Consecutive data lines form one payload joined by newlines.
                    event.data = event.data === undefined ? value : `${event.data}\n${value}`;
                } else if (key === 'retry') {
                    event.retry = parseInt(value, 10);
                } else {
                    Reflect.set(event, key, value);
                }
            }

            if (event.data) {
                const parsed = parseJSONText(event.data);
                if (parsed && typeof parsed === 'object') {
                    event.data = parsed;
                }
            }

            if (Object.keys(event).length) {
                this.push(event);
            }
        }
    }

    override _transform(chunk: any, encoding: BufferEncoding, callback: TransformCallback): void {
        if (chunk === null) {
            // Nothing to decode; signal end of stream and stop here.
            this.push(null);
            callback();
            return;
        }

        const text = typeof chunk === 'string' ? chunk : this.decoder.decode(chunk, { stream: true });
        if (text) {
            this.cache.push(text);
        }
        this.decodeRoutine();

        callback();
    }

    override _final(callback: (error?: Error | null | undefined) => void): void {
        // Flush bytes the decoder was still holding back for an unfinished character.
        const tail = this.decoder.decode();
        if (tail) {
            this.cache.push(tail);
        }
        this.decodeRoutine();
        callback();
    }
}
89
+
90
/**
 * Transform stream that serialises events into `text/event-stream` text.
 *
 * The writable side is in object mode and accepts either a plain string
 * (emitted as `data:` lines) or an object with optional `event`, `data`, `id`
 * and `retry` members. Every written chunk becomes one event block terminated
 * by a blank line.
 */
@TPM({
    contentType: 'text/event-stream',
})
export class OutputServerEventStream extends Transform {
    /** Number of event blocks emitted so far. */
    n: number = 0;

    constructor(options?: TransformOptions) {
        super({
            ...options, writableObjectMode: true, encoding: 'utf-8'
        });
    }

    /**
     * Serialises one chunk and pushes it downstream.
     *
     * String data is split on line breaks into one `data:` line per line;
     * any other data value is JSON-encoded onto a single `data:` line. An
     * object that produces no line at all is JSON-encoded as a whole.
     * Chunks that are neither strings nor objects are ignored.
     */
    encodeRoutine(chunk: {
        event?: string;
        data?: any;
        id?: string;
        retry?: number;
    } | string) {
        if (typeof chunk === 'string') {
            this.push(chunk.split(/\r?\n/).map((x) => `data: ${x}`).join('\n'));
            this.push('\n\n');
            this.n++;

            return;
        }

        if (typeof chunk !== 'object' || chunk === null) {
            return;
        }

        const lines: string[] = [];

        if (chunk.event) {
            lines.push(`event: ${chunk.event}`);
        }
        // Only a missing payload is skipped: 0, false and '' are real payloads.
        if (chunk.data !== undefined && chunk.data !== null) {
            if (typeof chunk.data === 'string') {
                for (const x of chunk.data.split(/\r?\n/)) {
                    lines.push(`data: ${x}`);
                }
            } else {
                lines.push(`data: ${JSON.stringify(chunk.data)}`);
            }
        }
        if (chunk.id) {
            lines.push(`id: ${chunk.id}`);
        }
        if (chunk.retry) {
            lines.push(`retry: ${chunk.retry}`);
        }
        if (!lines.length) {
            lines.push(`data: ${JSON.stringify(chunk)}`);
        }
        this.push(lines.join('\n'));
        this.push('\n\n');
        this.n++;
    }

    override _transform(chunk: any, encoding: BufferEncoding, callback: TransformCallback): void {
        if (chunk === null) {
            // Nothing to encode; signal end of stream and stop here.
            this.push(null);
            callback();
            return;
        }

        this.encodeRoutine(chunk);

        callback();
    }
}

/** Narrows `write` so event-shaped chunks are type-checked at call sites. */
export interface OutputServerEventStream extends Transform {
    write(chunk: string | {
        event?: string;
        data?: any;
        id?: string;
        retry?: number;
    }, callback?: (error: Error | null | undefined) => void): boolean;
    write(chunk: any, callback?: (error: Error | null | undefined) => void): boolean;
    write(chunk: any, encoding: BufferEncoding, callback?: (error: Error | null | undefined) => void): boolean;
}
{backend/functions/src β†’ src}/services/alt-text.ts RENAMED
File without changes
src/services/async-context.ts ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import { GlobalAsyncContext } from 'civkit/async-context';
2
+ import { container, singleton } from 'tsyringe';
3
+
4
/**
 * Container-managed singleton flavour of `GlobalAsyncContext`; it adds no
 * members of its own.
 */
@singleton()
export class AsyncLocalContext extends GlobalAsyncContext { }

// Resolve the shared instance once and also expose it on `process`
// under the `asyncLocalContext` property.
const sharedAsyncLocalContext = container.resolve(AsyncLocalContext);
Reflect.set(process, 'asyncLocalContext', sharedAsyncLocalContext);

export default sharedAsyncLocalContext;
src/services/blackhole-detector.ts ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { singleton } from 'tsyringe';
2
+ import { AsyncService } from 'civkit/async-service';
3
+ import { GlobalLogger } from './logger';
4
+
5
+
6
/**
 * Watchdog that notices when requests keep arriving but nothing has
 * succeeded for a while.
 *
 * Callers report activity through `incomingRequest`, `doneWithRequest` and
 * `itWorked`. When `NODE_ENV` starts with `prod`, `routine` runs every 15
 * seconds; each detection adds a strike, and from three strikes on an
 * `error` event is emitted on every run until `itWorked` resets the count.
 */
@singleton()
export class BlackHoleDetector extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });
    /** Time of the last reported success (seeded by the first incoming request). */
    lastWorkedTs?: number;
    lastDoneRequestTs?: number;
    lastIncomingRequestTs?: number;

    /** Base tolerated silence in milliseconds; scaled by `strikes + 1`. */
    maxDelay = 1000 * 30;
    concurrentRequests = 0;

    strikes = 0;

    constructor(protected globalLogger: GlobalLogger) {
        super(...arguments);

        if (process.env.NODE_ENV?.startsWith('prod')) {
            // unref() so this timer alone never keeps the process alive.
            setInterval(() => {
                this.routine();
            }, 1000 * 15).unref();
        }
    }

    override async init() {
        await this.dependencyReady();
        this.logger.debug('BlackHoleDetector started');
        this.emit('ready');
    }

    /**
     * One watchdog pass. Adds a strike when requests are in flight, the most
     * recent request is not older than the last success, and the time since
     * that success exceeds `maxDelay * (strikes + 1)`.
     */
    routine() {
        const now = Date.now();
        const lastWorked = this.lastWorkedTs;
        if (!lastWorked) {
            return;
        }
        const dt = (now - lastWorked);
        const lastIncoming = this.lastIncomingRequestTs;
        if (this.concurrentRequests > 0 &&
            lastIncoming &&
            lastIncoming >= lastWorked &&
            (dt > (this.maxDelay * (this.strikes + 1)))
        ) {
            this.logger.warn(`BlackHole detected, last worked: ${Math.ceil(dt / 1000)}s ago, concurrentRequests: ${this.concurrentRequests}`);
            this.strikes += 1;
        }

        if (this.strikes >= 3) {
            const report = `BlackHole detected for ${this.strikes} strikes, last worked: ${Math.ceil(dt / 1000)}s ago, concurrentRequests: ${this.concurrentRequests}`;
            this.logger.error(report);
            this.emit('error', new Error(report));
        }
    }

    /** Records the arrival of a request. */
    incomingRequest() {
        // Use one timestamp for both fields. Sampling the clock twice could
        // leave lastWorkedTs a millisecond later than lastIncomingRequestTs,
        // which makes routine() treat the very first request as older than
        // the last success and never flag it.
        const now = Date.now();
        this.lastIncomingRequestTs = now;
        this.lastWorkedTs ??= now;
        this.concurrentRequests++;
    }

    /** Records that a request finished, whatever its outcome. */
    doneWithRequest() {
        this.concurrentRequests--;
        this.lastDoneRequestTs = Date.now();
    }

    /** Records a success and clears accumulated strikes. */
    itWorked() {
        this.lastWorkedTs = Date.now();
        this.strikes = 0;
    }

}
{backend/functions/src β†’ src}/services/brave-search.ts RENAMED
@@ -7,6 +7,7 @@ import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip';
7
  import { AsyncContext } from '../shared';
8
  import { WebSearchOptionalHeaderOptions } from '../shared/3rd-party/brave-types';
9
  import type { Request, Response } from 'express';
 
10
 
11
  @singleton()
12
  export class BraveSearchService extends AsyncService {
@@ -20,6 +21,7 @@ export class BraveSearchService extends AsyncService {
20
  protected secretExposer: SecretExposer,
21
  protected geoipControl: GeoIPService,
22
  protected threadLocal: AsyncContext,
 
23
  ) {
24
  super(...arguments);
25
  }
@@ -69,6 +71,7 @@ export class BraveSearchService extends AsyncService {
69
  while (maxTries--) {
70
  try {
71
  const r = await this.braveSearchHTTP.webSearch(encoded, { headers: extraHeaders as Record<string, string> });
 
72
 
73
  return r.parsed;
74
  } catch (err: any) {
 
7
  import { AsyncContext } from '../shared';
8
  import { WebSearchOptionalHeaderOptions } from '../shared/3rd-party/brave-types';
9
  import type { Request, Response } from 'express';
10
+ import { BlackHoleDetector } from './blackhole-detector';
11
 
12
  @singleton()
13
  export class BraveSearchService extends AsyncService {
 
21
  protected secretExposer: SecretExposer,
22
  protected geoipControl: GeoIPService,
23
  protected threadLocal: AsyncContext,
24
+ protected blackHoleDetector: BlackHoleDetector,
25
  ) {
26
  super(...arguments);
27
  }
 
71
  while (maxTries--) {
72
  try {
73
  const r = await this.braveSearchHTTP.webSearch(encoded, { headers: extraHeaders as Record<string, string> });
74
+ this.blackHoleDetector.itWorked();
75
 
76
  return r.parsed;
77
  } catch (err: any) {
src/services/cf-browser-rendering.ts ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { container, singleton } from 'tsyringe';
2
+ import { AsyncService } from 'civkit/async-service';
3
+ import { Logger, SecretExposer } from '../shared';
4
+ import { CloudFlareHTTP } from '../shared/3rd-party/cloud-flare';
5
+
6
/**
 * Thin service around `CloudFlareHTTP` for fetching browser-rendered HTML.
 *
 * The client is created during `init` from the `CLOUD_FLARE_API_KEY` secret,
 * which is split on `:` into the two constructor arguments.
 */
@singleton()
export class CFBrowserRendering extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });
    client!: CloudFlareHTTP;

    constructor(
        protected globalLogger: Logger,
        protected secretExposer: SecretExposer,
    ) {
        super(...arguments);
    }


    /**
     * Waits for dependencies, then builds the client.
     *
     * @throws Error when `CLOUD_FLARE_API_KEY` is not configured. Previously
     *         this surfaced as an opaque "not iterable" TypeError from
     *         destructuring `undefined`.
     */
    override async init() {
        await this.dependencyReady();

        const credential = this.secretExposer.CLOUD_FLARE_API_KEY;
        if (credential == null) {
            throw new Error('CLOUD_FLARE_API_KEY is not configured; expected the form "<account>:<key>"');
        }

        const [account, key] = credential.split(':');
        this.client = new CloudFlareHTTP(account, key);

        this.emit('ready');
    }

    /**
     * Requests the rendered HTML for `url`.
     *
     * @returns The `result` member of the parsed response.
     */
    async fetchContent(url: string) {
        const r = await this.client.fetchBrowserRenderedHTML({ url });

        return r.parsed.result;
    }

}

// Shared instance resolved from the container at module load.
const instance = container.resolve(CFBrowserRendering);

export default instance;
src/services/curl.ts ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { marshalErrorLike } from 'civkit/lang';
2
+ import { AsyncService } from 'civkit/async-service';
3
+ import { singleton } from 'tsyringe';
4
+
5
+ import { Curl, CurlCode, CurlFeature, HeaderInfo } from 'node-libcurl';
6
+ import { parseString as parseSetCookieString } from 'set-cookie-parser';
7
+
8
+ import { ScrappingOptions } from './puppeteer';
9
+ import { Logger } from '../shared/services/logger';
10
+ import { AssertionFailureError, FancyFile } from 'civkit';
11
+ import { ServiceBadAttemptError, TempFileManager } from '../shared';
12
+ import { createBrotliDecompress, createInflate, createGunzip } from 'zlib';
13
+ import { ZSTDDecompress } from 'simple-zstd';
14
+ import _ from 'lodash';
15
+ import { Readable } from 'stream';
16
+ import { AsyncLocalContext } from './async-context';
17
+
18
/** Scrapping options plus the request-level settings only the curl path uses. */
export interface CURLScrappingOptions extends ScrappingOptions {
    /** HTTP method; upper-cased and applied as a custom request method when set. */
    method?: string;
    /** Request body; converted with `toString()` and sent as post fields when set. */
    body?: Buffer | string;
}
22
+
23
+ @singleton()
24
+ export class CurlControl extends AsyncService {
25
+
26
+ logger = this.globalLogger.child({ service: this.constructor.name });
27
+
28
+ chromeVersion: string = `132`;
29
+ safariVersion: string = `537.36`;
30
+ platform: string = `Linux`;
31
+ ua: string = `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/${this.safariVersion} (KHTML, like Gecko) Chrome/${this.chromeVersion}.0.0.0 Safari/${this.safariVersion}`;
32
+
33
+ lifeCycleTrack = new WeakMap();
34
+
35
/**
 * Receives the logger, temp-file manager and async-local context as
 * constructor parameter properties and forwards every argument to the
 * base service.
 */
constructor(
    protected globalLogger: Logger,
    protected tempFileManager: TempFileManager,
    protected asyncLocalContext: AsyncLocalContext,
) {
    super(...arguments);
}
42
+
43
/**
 * Waits for dependencies, then sets the platform label used for the
 * `sec-ch-ua-platform` header from the host OS. The label is overwritten
 * only on macOS and Windows; every other platform keeps the current value.
 */
override async init() {
    await this.dependencyReady();

    switch (process.platform) {
        case 'darwin':
            this.platform = `macOS`;
            break;
        case 'win32':
            this.platform = `Windows`;
            break;
        default:
            break;
    }

    this.emit('ready');
}
54
+
55
/**
 * Adopts `ua` as the User-Agent to send and derives the Chrome and
 * AppleWebKit versions from it.
 *
 * Both versions are extracted before any state is touched, so a string that
 * lacks either token leaves the instance unchanged instead of half-updated.
 *
 * @param ua A Chrome User-Agent string containing `Chrome/<major>` and
 *           `AppleWebKit/<version>`.
 * @throws TypeError when either token is missing from `ua`.
 */
impersonateChrome(ua: string) {
    const chromeVersion = /Chrome\/(\d+)/.exec(ua)?.[1];
    const safariVersion = /AppleWebKit\/([\d\.]+)/.exec(ua)?.[1];
    if (!chromeVersion || !safariVersion) {
        throw new TypeError(`Cannot impersonate Chrome: unrecognized User-Agent "${ua}"`);
    }

    this.chromeVersion = chromeVersion;
    this.safariVersion = safariVersion;
    this.ua = ua;
}
60
+
61
/**
 * Sets the request headers on `curl`: a Chrome-like default set merged with
 * the caller's headers.
 *
 * A caller header whose name matches a default (ignoring case) replaces that
 * default when its value is non-empty; all other caller headers are appended
 * under the name given. Caller headers whose value is `undefined` or `null`
 * are skipped rather than sent as the text "undefined". Array values produce
 * one header line per element.
 *
 * @param curl The handle to configure.
 * @param headers Optional extra or overriding headers, keyed by name.
 * @returns The same `curl` handle.
 */
curlImpersonateHeader(curl: Curl, headers?: object) {
    const mixinHeaders: Record<string, string> = {
        // Client-hint values are quoted strings, as a browser sends them.
        'sec-ch-ua': `"Not A(Brand";v="8", "Chromium";v="${this.chromeVersion}", "Google Chrome";v="${this.chromeVersion}"`,
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': `"${this.platform}"`,
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': this.ua,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'Accept-Language': 'en-US,en;q=0.9',
    };

    // Lower-cased default name -> the spelling used above.
    const canonicalNames = new Map<string, string>();
    for (const k of Object.keys(mixinHeaders)) {
        canonicalNames.set(k.toLowerCase(), k);
    }

    const callerHeaders: Record<string, string | undefined> = { ...headers };
    for (const [name, value] of Object.entries(callerHeaders)) {
        if (value == null) {
            continue;
        }
        const canonical = canonicalNames.get(name.toLowerCase());
        if (canonical && value) {
            // Override in place so the header is sent once, in its default position.
            mixinHeaders[canonical] = value;
        } else {
            mixinHeaders[name] = value;
        }
    }

    curl.setOpt(Curl.option.HTTPHEADER, Object.entries(mixinHeaders).flatMap(([k, v]) => {
        if (Array.isArray(v) && v.length) {
            return v.map((v2) => `${k}: ${v2}`);
        }
        return [`${k}: ${v}`];
    }));

    return curl;
}
95
+
96
+ urlToFile1Shot(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
97
+ return new Promise<{
98
+ statusCode: number,
99
+ data?: FancyFile,
100
+ headers: HeaderInfo[],
101
+ }>((resolve, reject) => {
102
+ let contentType = '';
103
+ const curl = new Curl();
104
+ curl.enable(CurlFeature.StreamResponse);
105
+ curl.setOpt('URL', urlToCrawl.toString());
106
+ curl.setOpt(Curl.option.FOLLOWLOCATION, false);
107
+ curl.setOpt(Curl.option.SSL_VERIFYPEER, false);
108
+ curl.setOpt(Curl.option.TIMEOUT_MS, Math.min(30_000, crawlOpts?.timeoutMs || 30_000));
109
+ curl.setOpt(Curl.option.CONNECTTIMEOUT_MS, 3_000);
110
+ if (crawlOpts?.method) {
111
+ curl.setOpt(Curl.option.CUSTOMREQUEST, crawlOpts.method.toUpperCase());
112
+ }
113
+ if (crawlOpts?.body) {
114
+ curl.setOpt(Curl.option.POSTFIELDS, crawlOpts.body.toString());
115
+ }
116
+
117
+ const headersToSet = { ...crawlOpts?.extraHeaders };
118
+ if (crawlOpts?.cookies?.length) {
119
+ const cookieChunks = crawlOpts.cookies.map((cookie) => `${cookie.name}=${encodeURIComponent(cookie.value)}`);
120
+ headersToSet.cookie ??= cookieChunks.join('; ');
121
+ }
122
+ if (crawlOpts?.referer) {
123
+ headersToSet.referer ??= crawlOpts.referer;
124
+ }
125
+ if (crawlOpts?.overrideUserAgent) {
126
+ headersToSet['user-agent'] ??= crawlOpts.overrideUserAgent;
127
+ }
128
+
129
+ this.curlImpersonateHeader(curl, headersToSet);
130
+
131
+ if (crawlOpts?.proxyUrl) {
132
+ const proxyUrlCopy = new URL(crawlOpts.proxyUrl);
133
+ curl.setOpt(Curl.option.PROXY, proxyUrlCopy.href);
134
+ }
135
+
136
+ let curlStream: Readable | undefined;
137
+ curl.on('error', (err, errCode) => {
138
+ curl.close();
139
+ this.logger.warn(`Curl ${urlToCrawl.origin}: ${err}`, { err: marshalErrorLike(err), urlToCrawl });
140
+ if (curlStream) {
141
+ // For some reason, manually emitting error event is required for curlStream.
142
+ curlStream.emit('error', err);
143
+ curlStream.destroy(err);
144
+ }
145
+ const err2 = this.digestCurlCode(errCode, err.message);
146
+ if (err2) {
147
+ reject(err2);
148
+ return;
149
+ }
150
+ reject(new AssertionFailureError(`Failed to access ${urlToCrawl.origin}: ${err.message}`));
151
+ });
152
+ curl.setOpt(Curl.option.MAXFILESIZE, 4 * 1024 * 1024 * 1024); // 4GB
153
+ let status = -1;
154
+ let contentEncoding = '';
155
+ curl.once('end', () => {
156
+ if (curlStream) {
157
+ curlStream.once('end', () => curl.close());
158
+ return;
159
+ }
160
+ curl.close();
161
+ });
162
+ curl.on('stream', (stream, statusCode, headers) => {
163
+ this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl.origin}`, { statusCode });
164
+ status = statusCode;
165
+ curlStream = stream;
166
+ for (const headerSet of (headers as HeaderInfo[])) {
167
+ for (const [k, v] of Object.entries(headerSet)) {
168
+ if (k.trim().endsWith(':')) {
169
+ Reflect.set(headerSet, k.slice(0, k.indexOf(':')), v || '');
170
+ Reflect.deleteProperty(headerSet, k);
171
+ continue;
172
+ }
173
+ if (v === undefined) {
174
+ Reflect.set(headerSet, k, '');
175
+ continue;
176
+ }
177
+ if (k.toLowerCase() === 'content-type' && typeof v === 'string') {
178
+ contentType = v.toLowerCase();
179
+ }
180
+ }
181
+ }
182
+ const lastResHeaders = headers[headers.length - 1];
183
+ for (const [k, v] of Object.entries(lastResHeaders)) {
184
+ const kl = k.toLowerCase();
185
+ if (kl === 'content-type') {
186
+ contentType = v.toLowerCase();
187
+ }
188
+ if (kl === 'content-encoding') {
189
+ contentEncoding = v.toLowerCase();
190
+ }
191
+ if (contentType && contentEncoding) {
192
+ break;
193
+ }
194
+ }
195
+
196
+ if ([301, 302, 307, 308].includes(statusCode)) {
197
+ if (stream) {
198
+ stream.resume();
199
+ }
200
+ resolve({
201
+ statusCode: status,
202
+ data: undefined,
203
+ headers: headers as HeaderInfo[],
204
+ });
205
+ return;
206
+ }
207
+
208
+ if (!stream) {
209
+ resolve({
210
+ statusCode: status,
211
+ data: undefined,
212
+ headers: headers as HeaderInfo[],
213
+ });
214
+ return;
215
+ }
216
+
217
+ switch (contentEncoding) {
218
+ case 'gzip': {
219
+ const decompressed = createGunzip();
220
+ stream.pipe(decompressed);
221
+ stream.once('error', (err) => {
222
+ decompressed.destroy(err);
223
+ });
224
+ stream = decompressed;
225
+ break;
226
+ }
227
+ case 'deflate': {
228
+ const decompressed = createInflate();
229
+ stream.pipe(decompressed);
230
+ stream.once('error', (err) => {
231
+ decompressed.destroy(err);
232
+ });
233
+ stream = decompressed;
234
+ break;
235
+ }
236
+ case 'br': {
237
+ const decompressed = createBrotliDecompress();
238
+ stream.pipe(decompressed);
239
+ stream.once('error', (err) => {
240
+ decompressed.destroy(err);
241
+ });
242
+ stream = decompressed;
243
+ break;
244
+ }
245
+ case 'zstd': {
246
+ const decompressed = ZSTDDecompress();
247
+ stream.pipe(decompressed);
248
+ stream.once('error', (err) => {
249
+ decompressed.destroy(err);
250
+ });
251
+ stream = decompressed;
252
+ break;
253
+ }
254
+ default: {
255
+ break;
256
+ }
257
+ }
258
+
259
+ const fpath = this.tempFileManager.alloc();
260
+ const fancyFile = FancyFile.auto(stream, fpath);
261
+ this.tempFileManager.bindPathTo(fancyFile, fpath);
262
+ resolve({
263
+ statusCode: status,
264
+ data: fancyFile,
265
+ headers: headers as HeaderInfo[],
266
+ });
267
+ });
268
+
269
+ curl.perform();
270
+ });
271
+ }
272
+
273
/**
 * Fetch `urlToCrawl` with curl, following HTTP redirects by hand.
 *
 * Every hop is executed by `urlToFile1Shot`. When a hop answers with
 * 301/302/307/308, its `Location` header is resolved against the URL of that
 * hop, any `Set-Cookie` values are parsed and appended to the cookies used
 * for subsequent hops, and the hop's header blocks are remembered. At most
 * 10 hops are attempted.
 *
 * @param urlToCrawl - URL of the first request.
 * @param crawlOpts - Options passed to every hop. A shallow copy is used, so
 *   cookies collected along the way are not assigned onto the caller's object.
 * @returns The status code and data of the last hop, with `headers` holding
 *   the header blocks of all hops in request order.
 * @throws AssertionFailureError when a redirect carries no `Location` header
 *   or when the hop budget is used up.
 */
async urlToFile(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
    const hopOptions = { ...crawlOpts };
    const collectedHeaders: HeaderInfo[] = [];
    let currentUrl = urlToCrawl;

    for (let hopsLeft = 10; hopsLeft > 0; hopsLeft -= 1) {
        const hop = await this.urlToFile1Shot(currentUrl, hopOptions);
        const isRedirect = [301, 302, 307, 308].includes(hop.statusCode);

        if (!isRedirect) {
            // Terminal response: prepend the header blocks of the hops that led here.
            return {
                statusCode: hop.statusCode,
                data: hop.data,
                headers: collectedHeaders.concat(hop.headers),
            };
        }

        const redirectHeaders = hop.headers[hop.headers.length - 1];
        const location = redirectHeaders.Location || redirectHeaders.location;
        if (!location) {
            throw new AssertionFailureError(`Failed to access ${urlToCrawl}: Bad redirection from ${currentUrl}`);
        }

        // Carry cookies issued by the redirecting response over to the next hop.
        const rawSetCookie = redirectHeaders['Set-Cookie'] || redirectHeaders['set-cookie'];
        if (rawSetCookie) {
            const assignments = Array.isArray(rawSetCookie) ? rawSetCookie : [rawSetCookie];
            const newCookies = assignments
                .filter(Boolean)
                .map((x) => parseSetCookieString(x, { decodeValues: true }));
            if (newCookies.length) {
                hopOptions.cookies = [...(hopOptions.cookies || []), ...newCookies];
            }
        }

        // `Location` may be relative, so resolve it against the hop that issued it.
        currentUrl = new URL(location, currentUrl);
        collectedHeaders.push(...hop.headers);
    }

    throw new AssertionFailureError(`Failed to access ${urlToCrawl}: Too many redirections.`);
}
312
+
313
/**
 * Download `targetUrl` through `urlToFile` and describe the whole redirect
 * chain as side-load options.
 *
 * For each header block curl returned, an `impersonate` entry is stored under
 * the URL that block belongs to, holding the status code (`-1` when the block
 * has no result code), the headers without the `result` entry, and the
 * content type. When `crawlOpts.proxyUrl` is given it is recorded per origin
 * in `proxyOrigin`. Header blocks with status 301/302/307/308 advance
 * `finalURL` through their `Location` header.
 *
 * @param targetUrl - URL to download.
 * @param crawlOpts - Options forwarded to `urlToFile`.
 * @returns The final URL, the assembled side-load options, the header chain,
 *   the final status and headers, the resolved content type, the
 *   `Content-Disposition` value, a best-effort file name, and the downloaded
 *   file (if any).
 * @throws Error when a redirecting header block has no `Location` header.
 */
async sideLoad(targetUrl: URL, crawlOpts?: CURLScrappingOptions) {
    const curlResult = await this.urlToFile(targetUrl, crawlOpts);

    let finalURL = targetUrl;
    const sideLoadOpts: CURLScrappingOptions['sideLoad'] = {
        impersonate: {},
        proxyOrigin: {},
    };
    for (const headers of curlResult.headers) {
        sideLoadOpts.impersonate[finalURL.href] = {
            status: headers.result?.code || -1,
            headers: _.omit(headers, 'result'),
            contentType: headers['Content-Type'] || headers['content-type'],
        };
        if (crawlOpts?.proxyUrl) {
            sideLoadOpts.proxyOrigin[finalURL.origin] = crawlOpts.proxyUrl;
        }
        if (headers.result?.code && [301, 302, 307, 308].includes(headers.result.code)) {
            const location = headers.Location || headers.location;
            if (!location) {
                throw new Error(`Bad redirection: ${curlResult.headers.length} times`);
            }
            // The next header block belongs to the redirect target.
            finalURL = new URL(location, finalURL);
        }
    }
    const lastHeaders = curlResult.headers[curlResult.headers.length - 1];
    // A response may carry no Content-Type header at all. Optional chaining keeps
    // the sniffed-mime-type and octet-stream fallbacks reachable instead of
    // throwing a TypeError on `undefined.toLowerCase()`.
    const contentType = (lastHeaders['Content-Type'] || lastHeaders['content-type'])?.toLowerCase()
        || (await curlResult.data?.mimeType)
        || 'application/octet-stream';
    const contentDisposition = lastHeaders['Content-Disposition'] || lastHeaders['content-disposition'];
    // Prefer the quoted filename from Content-Disposition; otherwise use the last path segment.
    const fileName = contentDisposition?.match(/filename="([^"]+)"/i)?.[1] || finalURL.pathname.split('/').pop();

    // Attach the body to the final entry only when something was actually downloaded.
    if (sideLoadOpts.impersonate[finalURL.href] && (await curlResult.data?.size)) {
        sideLoadOpts.impersonate[finalURL.href].body = curlResult.data;
    }

    // This should keep the file from being garbage collected and deleted until this asyncContext/request is done.
    this.lifeCycleTrack.set(this.asyncLocalContext.ctx, curlResult.data);

    return {
        finalURL,
        sideLoadOpts,
        chain: curlResult.headers,
        status: curlResult.statusCode,
        headers: lastHeaders,
        contentType,
        contentDisposition,
        fileName,
        file: curlResult.data
    };
}
362
+
363
/**
 * Map a libcurl error code onto an application-level error.
 *
 * @param code - The libcurl result code reported for the failed transfer.
 * @param msg - Message to carry on the produced error.
 * @returns An `AssertionFailureError` for codes treated as user errors, a
 *   `ServiceBadAttemptError` for codes treated as retryable, or `undefined`
 *   when the code is not specifically handled.
 */
digestCurlCode(code: CurlCode, msg: string) {
    // 400 User errors
    const userErrorCodes = [
        CurlCode.CURLE_GOT_NOTHING,
        CurlCode.CURLE_COULDNT_RESOLVE_HOST,
        CurlCode.CURLE_REMOTE_ACCESS_DENIED,
    ];
    if (userErrorCodes.includes(code)) {
        return new AssertionFailureError(msg);
    }

    // Retryable errors
    const retryableCodes = [
        CurlCode.CURLE_SSL_CONNECT_ERROR,
        CurlCode.CURLE_QUIC_CONNECT_ERROR,
        CurlCode.CURLE_COULDNT_RESOLVE_PROXY,
        CurlCode.CURLE_COULDNT_CONNECT,
        CurlCode.CURLE_PARTIAL_FILE,
        CurlCode.CURLE_OPERATION_TIMEDOUT,
    ];
    if (retryableCodes.includes(code)) {
        return new ServiceBadAttemptError(msg);
    }

    // Anything else is left for the caller to classify.
    return undefined;
}
387
+ }
src/services/errors.ts ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { ApplicationError, Prop, RPC_TRANSFER_PROTOCOL_META_SYMBOL, StatusCode } from 'civkit/civ-rpc';
2
+ import _ from 'lodash';
3
+ import dayjs from 'dayjs';
4
+ import utc from 'dayjs/plugin/utc';
5
+
6
+ dayjs.extend(utc);
7
+
8
// Application error types. Each one is an empty ApplicationError subclass
// tagged with a numeric code through the @StatusCode decorator.
// NOTE(review): the leading three digits look like an HTTP status
// (503, 401, 402, 409, 451, 412) followed by a two-digit sub-code;
// confirm against civkit's StatusCode semantics.

// 503xx: service-side availability errors.
+ @StatusCode(50301)
9
+ export class ServiceDisabledError extends ApplicationError { }
10
+
11
+ @StatusCode(50302)
12
+ export class ServiceCrashedError extends ApplicationError { }
13
+
14
+ @StatusCode(50303)
15
+ export class ServiceNodeResourceDrainError extends ApplicationError { }
16
+
17
// 401xx / 402xx: presumably account and billing related; verify against callers.
+ @StatusCode(40104)
18
+ export class EmailUnverifiedError extends ApplicationError { }
19
+
20
+ @StatusCode(40201)
21
+ export class InsufficientCreditsError extends ApplicationError { }
22
+
23
+ @StatusCode(40202)
24
+ export class FreeFeatureLimitError extends ApplicationError { }
25
+
26
+ @StatusCode(40203)
27
+ export class InsufficientBalanceError extends ApplicationError { }
28
+
29
// 409xx: conflict-style errors.
+ @StatusCode(40903)
30
+ export class LockConflictError extends ApplicationError { }
31
+
32
+ @StatusCode(40904)
33
+ export class BudgetExceededError extends ApplicationError { }
34
+
35
// 451xx: content and security rejections.
+ @StatusCode(45101)
36
+ export class HarmfulContentError extends ApplicationError { }
37
+
38
+ @StatusCode(45102)
39
+ export class SecurityCompromiseError extends ApplicationError { }
40
+
41
// 412xx: request precondition errors.
+ @StatusCode(41201)
42
+ export class BatchSizeTooLargeError extends ApplicationError { }
43
+
44
+
45
/**
 * Error carrying status code 42903 that can tell the client when to retry.
 *
 * When either `retryAfter` or `retryAfterDate` is set, the transfer-protocol
 * metadata of the error gains a `Retry-After` header: a number is emitted
 * as-is, a `Date` is rendered as a UTC timestamp in the form
 * `ddd, DD MMM YYYY HH:mm:ss GMT`. A truthy `retryAfter` takes precedence
 * over `retryAfterDate`.
 */
@StatusCode(42903)
export class RateLimitTriggeredError extends ApplicationError {

    @Prop({
        desc: 'Retry after seconds',
    })
    retryAfter?: number;

    @Prop({
        desc: 'Retry after date',
    })
    retryAfterDate?: Date;

    /**
     * Transfer-protocol metadata of the parent class, extended with a
     * `Retry-After` header when a retry hint is available. The parent's
     * metadata is deep-cloned before merging, so it is never mutated.
     */
    protected override get [RPC_TRANSFER_PROTOCOL_META_SYMBOL]() {
        const inheritedMeta = super[RPC_TRANSFER_PROTOCOL_META_SYMBOL];
        const retryHint = this.retryAfter || this.retryAfterDate;
        if (!retryHint) {
            return inheritedMeta;
        }

        let headerValue: string;
        if (retryHint instanceof Date) {
            headerValue = dayjs(retryHint).utc().format('ddd, DD MMM YYYY HH:mm:ss [GMT]');
        } else {
            headerValue = `${retryHint}`;
        }

        return _.merge(_.cloneDeep(inheritedMeta), {
            headers: {
                'Retry-After': headerValue,
            }
        });
    }
}
src/services/finalizer.ts ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { AbstractFinalizerService } from 'civkit/finalizer';
2
+ import { container, singleton } from 'tsyringe';
3
+ import { isMainThread } from 'worker_threads';
4
+ import { GlobalLogger } from './logger';
5
+
6
// Finalizer service for this application: a tsyringe singleton that extends
// civkit's AbstractFinalizerService and supplies the container and logger fields.
+ @singleton()
7
+ export class FinalizerService extends AbstractFinalizerService {
8
+
9
// The tsyringe root container imported at the top of this module.
+ container = container;
10
// Child logger tagged with this class's name. It reads `globalLogger`, the
// constructor parameter property declared below.
// NOTE(review): this relies on parameter properties being assigned before field
// initialisers run — confirm the compiler settings (useDefineForClassFields).
+ logger = this.globalLogger.child({ service: this.constructor.name });
11
+
12
+ constructor(protected globalLogger: GlobalLogger) {
13
// Forward every constructor argument to the base class unchanged.
+ super(...arguments);
14
+ }
15
+
16
+ }
17
+
18
// Resolve the singleton at module load so its decorators can be exported.
+ const instance = container.resolve(FinalizerService);
19
// `Finalizer` is taken from the object returned by instance.decorators().
+ export const { Finalizer } = instance.decorators();
20
+ export default instance;
21
+
22
// serviceReady() is only called on the main thread; worker threads skip it.
+ if (isMainThread) {
23
+ instance.serviceReady();
24
+ }
{backend/functions/src → src}/services/geoip.ts RENAMED
File without changes