Spaces:
Build error
Build error
deployment: dedicated server script for cloud-run (#1139)
Browse files* refactor: domain profile and attempt direct engine
* fix: direct engine
* fix: abuse in background phase
* fix
* wip
* use curl-impersonate in custom image
* local pdf for curl
* listen port from env
* fix
* fix
* fix
* fix: ditch http2
* cd: using gh action
* ci: token for thinapps-shared
* ci: setup node lock file path
* ci: tweak
* ci: mmdb
* ci: docker build
* fix: ci
* fix: ci
- .github/workflows/cd.yml +76 -0
- backend/functions/.dockerignore +1 -0
- backend/functions/Dockerfile +37 -0
- backend/functions/package-lock.json +4 -4
- backend/functions/package.json +1 -1
- backend/functions/public/favicon.ico +0 -0
- backend/functions/src/cloud-functions/crawler.ts +69 -35
- backend/functions/src/db/domain-profile.ts +1 -4
- backend/functions/src/dto/scrapping-options.ts +8 -6
- backend/functions/src/services/curl.ts +104 -23
- backend/functions/src/services/pdf-extract.ts +9 -9
- backend/functions/src/services/puppeteer.ts +26 -4
- backend/functions/src/services/snapshot-formatter.ts +2 -1
- backend/functions/src/stand-alone/crawl.ts +151 -0
- backend/functions/src/stand-alone/search.ts +151 -0
.github/workflows/cd.yml
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
run-name: Build push and deploy (CD)
|
| 2 |
+
on:
|
| 3 |
+
push:
|
| 4 |
+
branches:
|
| 5 |
+
- main
|
| 6 |
+
- ci-debug
|
| 7 |
+
tags:
|
| 8 |
+
- '*'
|
| 9 |
+
|
| 10 |
+
jobs:
|
| 11 |
+
build-and-push-to-gcr:
|
| 12 |
+
runs-on: ubuntu-latest
|
| 13 |
+
concurrency:
|
| 14 |
+
group: ${{ github.ref_type == 'branch' && github.ref }}
|
| 15 |
+
cancel-in-progress: true
|
| 16 |
+
defaults:
|
| 17 |
+
run:
|
| 18 |
+
working-directory: backend/functions
|
| 19 |
+
permissions:
|
| 20 |
+
contents: read
|
| 21 |
+
steps:
|
| 22 |
+
- uses: actions/checkout@v4
|
| 23 |
+
with:
|
| 24 |
+
lfs: true
|
| 25 |
+
submodules: true
|
| 26 |
+
token: ${{ secrets.THINAPPS_SHARED_READ_TOKEN }}
|
| 27 |
+
- uses: 'google-github-actions/auth@v2'
|
| 28 |
+
with:
|
| 29 |
+
credentials_json: '${{ secrets.GCLOUD_SERVICE_ACCOUNT_SECRET_JSON }}'
|
| 30 |
+
- name: 'Set up Cloud SDK'
|
| 31 |
+
uses: 'google-github-actions/setup-gcloud@v2'
|
| 32 |
+
- name: "Docker auth"
|
| 33 |
+
run: |-
|
| 34 |
+
gcloud auth configure-docker us-docker.pkg.dev --quiet
|
| 35 |
+
- name: Set controller release version
|
| 36 |
+
run: echo "RELEASE_VERSION=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
|
| 37 |
+
- name: Set up Node.js
|
| 38 |
+
uses: actions/setup-node@v4
|
| 39 |
+
with:
|
| 40 |
+
node-version: 22.12.0
|
| 41 |
+
cache: npm
|
| 42 |
+
cache-dependency-path: backend/functions/package-lock.json
|
| 43 |
+
|
| 44 |
+
- name: npm install
|
| 45 |
+
run: npm ci
|
| 46 |
+
- name: get maxmind mmdb
|
| 47 |
+
run: mkdir -p licensed && curl -o licensed/GeoLite2-City.mmdb https://github.com/P3TERX/GeoLite.mmdb/raw/download/GeoLite2-City.mmdb
|
| 48 |
+
- name: build application
|
| 49 |
+
run: npm run build
|
| 50 |
+
- name: Set package version
|
| 51 |
+
run: npm version --no-git-tag-version ${{ env.RELEASE_VERSION }}
|
| 52 |
+
if: github.ref_type == 'tag'
|
| 53 |
+
- name: Docker meta
|
| 54 |
+
id: meta
|
| 55 |
+
uses: docker/metadata-action@v5
|
| 56 |
+
with:
|
| 57 |
+
images: |
|
| 58 |
+
us-docker.pkg.dev/reader-6b7dc/jina-reader/reader
|
| 59 |
+
- name: Set up QEMU
|
| 60 |
+
uses: docker/setup-qemu-action@v3
|
| 61 |
+
- name: Set up Docker Buildx
|
| 62 |
+
uses: docker/setup-buildx-action@v3
|
| 63 |
+
- name: Build and push
|
| 64 |
+
id: container
|
| 65 |
+
uses: docker/build-push-action@v6
|
| 66 |
+
with:
|
| 67 |
+
context: backend/functions
|
| 68 |
+
push: true
|
| 69 |
+
tags: ${{ steps.meta.outputs.tags }}
|
| 70 |
+
labels: ${{ steps.meta.outputs.labels }}
|
| 71 |
+
- name: Deploy CRAWL with Tag
|
| 72 |
+
run: |
|
| 73 |
+
gcloud run deploy crawl --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region us-central1 --async --min-instances 0
|
| 74 |
+
- name: Deploy SEARCH with Tag
|
| 75 |
+
run: |
|
| 76 |
+
gcloud run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0
|
backend/functions/.dockerignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
node_modules/
|
backend/functions/Dockerfile
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# syntax=docker/dockerfile:1
|
| 2 |
+
FROM lwthiker/curl-impersonate:0.6-chrome-slim-bullseye
|
| 3 |
+
|
| 4 |
+
FROM node:20
|
| 5 |
+
|
| 6 |
+
RUN apt-get update \
|
| 7 |
+
&& apt-get install -y wget gnupg \
|
| 8 |
+
&& wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
|
| 9 |
+
&& sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
|
| 10 |
+
&& apt-get update \
|
| 11 |
+
&& apt-get install -y google-chrome-stable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-freefont-ttf libxss1 \
|
| 12 |
+
--no-install-recommends \
|
| 13 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 14 |
+
|
| 15 |
+
COPY --from=0 /usr/local/lib/libcurl-impersonate.so /usr/local/lib/libcurl-impersonate.so
|
| 16 |
+
|
| 17 |
+
RUN groupadd -r jina
|
| 18 |
+
RUN useradd -g jina -G audio,video -m jina
|
| 19 |
+
USER jina
|
| 20 |
+
|
| 21 |
+
WORKDIR /app
|
| 22 |
+
|
| 23 |
+
COPY package.json package-lock.json ./
|
| 24 |
+
RUN npm ci
|
| 25 |
+
|
| 26 |
+
COPY build ./build
|
| 27 |
+
COPY public ./public
|
| 28 |
+
COPY licensed ./licensed
|
| 29 |
+
|
| 30 |
+
RUN rm -rf ~/.config/chromium && mkdir -p ~/.config/chromium
|
| 31 |
+
|
| 32 |
+
ENV LD_PRELOAD=/usr/local/lib/libcurl-impersonate.so CURL_IMPERSONATE=chrome116 CURL_IMPERSONATE_HEADERS=no
|
| 33 |
+
ENV PORT=8080
|
| 34 |
+
|
| 35 |
+
EXPOSE 3000 3001 8080 8081
|
| 36 |
+
ENTRYPOINT ["node"]
|
| 37 |
+
CMD [ "build/stand-alone/crawl.js" ]
|
backend/functions/package-lock.json
CHANGED
|
@@ -16,7 +16,7 @@
|
|
| 16 |
"axios": "^1.3.3",
|
| 17 |
"bcrypt": "^5.1.0",
|
| 18 |
"busboy": "^1.6.0",
|
| 19 |
-
"civkit": "^0.8.2-
|
| 20 |
"core-js": "^3.37.1",
|
| 21 |
"cors": "^2.8.5",
|
| 22 |
"dayjs": "^1.11.9",
|
|
@@ -3979,9 +3979,9 @@
|
|
| 3979 |
}
|
| 3980 |
},
|
| 3981 |
"node_modules/civkit": {
|
| 3982 |
-
"version": "0.8.2-
|
| 3983 |
-
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-
|
| 3984 |
-
"integrity": "sha512-
|
| 3985 |
"license": "AGPL",
|
| 3986 |
"dependencies": {
|
| 3987 |
"lodash": "^4.17.21",
|
|
|
|
| 16 |
"axios": "^1.3.3",
|
| 17 |
"bcrypt": "^5.1.0",
|
| 18 |
"busboy": "^1.6.0",
|
| 19 |
+
"civkit": "^0.8.2-03243fe",
|
| 20 |
"core-js": "^3.37.1",
|
| 21 |
"cors": "^2.8.5",
|
| 22 |
"dayjs": "^1.11.9",
|
|
|
|
| 3979 |
}
|
| 3980 |
},
|
| 3981 |
"node_modules/civkit": {
|
| 3982 |
+
"version": "0.8.2-03243fe",
|
| 3983 |
+
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-03243fe.tgz",
|
| 3984 |
+
"integrity": "sha512-hoTxGeGdD27iOCDi51cVY0PHlRN3OSC640QRJ1YSmD42o+LP7mZtbdy8dN7j/FSkPP/5yLuB2ch9BMSOp54POQ==",
|
| 3985 |
"license": "AGPL",
|
| 3986 |
"dependencies": {
|
| 3987 |
"lodash": "^4.17.21",
|
backend/functions/package.json
CHANGED
|
@@ -36,7 +36,7 @@
|
|
| 36 |
"axios": "^1.3.3",
|
| 37 |
"bcrypt": "^5.1.0",
|
| 38 |
"busboy": "^1.6.0",
|
| 39 |
-
"civkit": "^0.8.2-
|
| 40 |
"core-js": "^3.37.1",
|
| 41 |
"cors": "^2.8.5",
|
| 42 |
"dayjs": "^1.11.9",
|
|
|
|
| 36 |
"axios": "^1.3.3",
|
| 37 |
"bcrypt": "^5.1.0",
|
| 38 |
"busboy": "^1.6.0",
|
| 39 |
+
"civkit": "^0.8.2-03243fe",
|
| 40 |
"core-js": "^3.37.1",
|
| 41 |
"cors": "^2.8.5",
|
| 42 |
"dayjs": "^1.11.9",
|
backend/functions/public/favicon.ico
ADDED
|
|
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -84,6 +84,8 @@ export class CrawlerHost extends RPCHost {
|
|
| 84 |
Reflect.set(snapshot, 'locale', options.locale);
|
| 85 |
}
|
| 86 |
await this.setToCache(options.url, snapshot);
|
|
|
|
|
|
|
| 87 |
});
|
| 88 |
|
| 89 |
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
|
|
@@ -581,9 +583,14 @@ export class CrawlerHost extends RPCHost {
|
|
| 581 |
|
| 582 |
if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.READER_LM)) {
|
| 583 |
const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
|
| 584 |
-
...crawlOpts,
|
|
|
|
| 585 |
}, crawlerOpts);
|
| 586 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 587 |
if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
|
| 588 |
const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
|
| 589 |
yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot);
|
|
@@ -628,18 +635,9 @@ export class CrawlerHost extends RPCHost {
|
|
| 628 |
return;
|
| 629 |
}
|
| 630 |
|
| 631 |
-
if (crawlOpts?.engine
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
|
| 635 |
-
yield snapshot;
|
| 636 |
-
|
| 637 |
-
return;
|
| 638 |
-
} catch (err) {
|
| 639 |
-
if (!engine.endsWith('?')) {
|
| 640 |
-
throw err;
|
| 641 |
-
}
|
| 642 |
-
}
|
| 643 |
}
|
| 644 |
|
| 645 |
let cache;
|
|
@@ -658,6 +656,24 @@ export class CrawlerHost extends RPCHost {
|
|
| 658 |
return;
|
| 659 |
}
|
| 660 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 661 |
try {
|
| 662 |
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe || crawlOpts?.withShadowDom) {
|
| 663 |
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
|
|
@@ -855,7 +871,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 855 |
}
|
| 856 |
|
| 857 |
async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
|
| 858 |
-
const it = this.cachedScrap(url,
|
| 859 |
|
| 860 |
let lastSnapshot;
|
| 861 |
let lastError;
|
|
@@ -912,36 +928,54 @@ export class CrawlerHost extends RPCHost {
|
|
| 912 |
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
|
| 913 |
}
|
| 914 |
|
| 915 |
-
async exploreDirectEngine(
|
| 916 |
-
const snapshot = await this.curlControl.urlToSnapshot(targetUrl, crawlerOptions, true);
|
| 917 |
-
|
| 918 |
-
const thisFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', snapshot);
|
| 919 |
-
const knownFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', knownSnapshot);
|
| 920 |
-
|
| 921 |
-
let engine = ENGINE_TYPE.DIRECT;
|
| 922 |
-
if (!(thisFormatted.content && knownFormatted.content &&
|
| 923 |
-
thisFormatted.content.trim() === knownFormatted.content.trim())) {
|
| 924 |
-
engine = ENGINE_TYPE.BROWSER;
|
| 925 |
-
}
|
| 926 |
-
|
| 927 |
const realUrl = new URL(knownSnapshot.href);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 928 |
|
| 929 |
-
|
| 930 |
-
|
| 931 |
-
.where('domain', '==', targetUrl.origin.toLowerCase())
|
| 932 |
-
.limit(1)
|
| 933 |
-
))[0] || new DomainProfile();
|
| 934 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 935 |
|
| 936 |
profile.origin = realUrl.origin.toLowerCase();
|
| 937 |
-
profile.triggerReason ??= 'Auto Explore';
|
| 938 |
profile.triggerUrl = realUrl.href;
|
| 939 |
-
profile.
|
| 940 |
-
profile.
|
| 941 |
profile.expireAt = new Date(Date.now() + this.domainProfileRetentionMs);
|
| 942 |
|
| 943 |
await DomainProfile.save(profile);
|
| 944 |
|
| 945 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 946 |
}
|
| 947 |
}
|
|
|
|
| 84 |
Reflect.set(snapshot, 'locale', options.locale);
|
| 85 |
}
|
| 86 |
await this.setToCache(options.url, snapshot);
|
| 87 |
+
|
| 88 |
+
await this.exploreDirectEngine(snapshot).catch(() => undefined);
|
| 89 |
});
|
| 90 |
|
| 91 |
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
|
|
|
|
| 583 |
|
| 584 |
if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.READER_LM)) {
|
| 585 |
const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
|
| 586 |
+
...crawlOpts,
|
| 587 |
+
engine: crawlOpts?.engine || ENGINE_TYPE.AUTO,
|
| 588 |
}, crawlerOpts);
|
| 589 |
|
| 590 |
+
if (!finalAutoSnapshot?.html) {
|
| 591 |
+
throw new AssertionFailureError(`Unexpected non HTML content for ReaderLM: ${urlToCrawl}`);
|
| 592 |
+
}
|
| 593 |
+
|
| 594 |
if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
|
| 595 |
const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
|
| 596 |
yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot);
|
|
|
|
| 635 |
return;
|
| 636 |
}
|
| 637 |
|
| 638 |
+
if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
|
| 639 |
+
yield this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
|
| 640 |
+
return;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 641 |
}
|
| 642 |
|
| 643 |
let cache;
|
|
|
|
| 656 |
return;
|
| 657 |
}
|
| 658 |
|
| 659 |
+
if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
|
| 660 |
+
const { digest } = this.getDomainProfileUrlDigest(urlToCrawl);
|
| 661 |
+
const domainProfile = await DomainProfile.fromFirestore(digest);
|
| 662 |
+
if (domainProfile?.engine === ENGINE_TYPE.DIRECT) {
|
| 663 |
+
try {
|
| 664 |
+
const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
|
| 665 |
+
|
| 666 |
+
// Expect downstream code to "break" here if it's satisfied with the direct engine
|
| 667 |
+
yield snapshot;
|
| 668 |
+
if (crawlOpts?.engine === ENGINE_TYPE.AUTO) {
|
| 669 |
+
return;
|
| 670 |
+
}
|
| 671 |
+
} catch (err: any) {
|
| 672 |
+
this.logger.warn(`Failed to scrap ${urlToCrawl} with direct engine`, { err: marshalErrorLike(err) });
|
| 673 |
+
}
|
| 674 |
+
}
|
| 675 |
+
}
|
| 676 |
+
|
| 677 |
try {
|
| 678 |
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe || crawlOpts?.withShadowDom) {
|
| 679 |
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
|
|
|
|
| 871 |
}
|
| 872 |
|
| 873 |
async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
|
| 874 |
+
const it = this.cachedScrap(url, opts, crawlerOptions);
|
| 875 |
|
| 876 |
let lastSnapshot;
|
| 877 |
let lastError;
|
|
|
|
| 928 |
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
|
| 929 |
}
|
| 930 |
|
| 931 |
+
async exploreDirectEngine(knownSnapshot: PageSnapshot) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 932 |
const realUrl = new URL(knownSnapshot.href);
|
| 933 |
+
const { digest, path } = this.getDomainProfileUrlDigest(realUrl);
|
| 934 |
+
const profile = await DomainProfile.fromFirestore(digest);
|
| 935 |
+
|
| 936 |
+
if (!profile) {
|
| 937 |
+
const record = DomainProfile.from({
|
| 938 |
+
_id: digest,
|
| 939 |
+
origin: realUrl.origin.toLowerCase(),
|
| 940 |
+
path,
|
| 941 |
+
triggerUrl: realUrl.href,
|
| 942 |
+
engine: knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT,
|
| 943 |
+
createdAt: new Date(),
|
| 944 |
+
expireAt: new Date(Date.now() + this.domainProfileRetentionMs),
|
| 945 |
+
});
|
| 946 |
+
await DomainProfile.save(record);
|
| 947 |
|
| 948 |
+
return;
|
| 949 |
+
}
|
|
|
|
|
|
|
|
|
|
| 950 |
|
| 951 |
+
if (profile.engine === ENGINE_TYPE.BROWSER) {
|
| 952 |
+
// Mixed engine, always use browser
|
| 953 |
+
return;
|
| 954 |
+
}
|
| 955 |
|
| 956 |
profile.origin = realUrl.origin.toLowerCase();
|
|
|
|
| 957 |
profile.triggerUrl = realUrl.href;
|
| 958 |
+
profile.path = path;
|
| 959 |
+
profile.engine = knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT;
|
| 960 |
profile.expireAt = new Date(Date.now() + this.domainProfileRetentionMs);
|
| 961 |
|
| 962 |
await DomainProfile.save(profile);
|
| 963 |
|
| 964 |
+
return;
|
| 965 |
+
}
|
| 966 |
+
|
| 967 |
+
getDomainProfileUrlDigest(url: URL) {
|
| 968 |
+
const pathname = url.pathname;
|
| 969 |
+
const pathVec = pathname.split('/');
|
| 970 |
+
const parentPath = pathVec.slice(0, -1).join('/');
|
| 971 |
+
|
| 972 |
+
const finalPath = parentPath || pathname;
|
| 973 |
+
|
| 974 |
+
const key = url.origin.toLocaleLowerCase() + finalPath;
|
| 975 |
+
|
| 976 |
+
return {
|
| 977 |
+
digest: md5Hasher.hash(key),
|
| 978 |
+
path: finalPath,
|
| 979 |
+
};
|
| 980 |
}
|
| 981 |
}
|
backend/functions/src/db/domain-profile.ts
CHANGED
|
@@ -13,10 +13,7 @@ export class DomainProfile extends FirestoreRecord {
|
|
| 13 |
@Prop({
|
| 14 |
required: true
|
| 15 |
})
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
@Prop({ required: true })
|
| 19 |
-
triggerReason!: string;
|
| 20 |
|
| 21 |
@Prop()
|
| 22 |
triggerUrl?: string;
|
|
|
|
| 13 |
@Prop({
|
| 14 |
required: true
|
| 15 |
})
|
| 16 |
+
path!: string;
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
@Prop()
|
| 19 |
triggerUrl?: string;
|
backend/functions/src/dto/scrapping-options.ts
CHANGED
|
@@ -439,7 +439,7 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 439 |
instance.engine = ENGINE_TYPE.BROWSER;
|
| 440 |
instance.respondWith = CONTENT_FORMAT.VLM;
|
| 441 |
} else if (instance.engine === ENGINE_TYPE.READER_LM) {
|
| 442 |
-
instance.engine =
|
| 443 |
instance.respondWith = CONTENT_FORMAT.READER_LM;
|
| 444 |
}
|
| 445 |
|
|
@@ -496,10 +496,6 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 496 |
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
| 497 |
}
|
| 498 |
|
| 499 |
-
if (instance.noCache || !instance.isTypicalRequest()) {
|
| 500 |
-
instance.engine ??= ENGINE_TYPE.BROWSER + '?';
|
| 501 |
-
}
|
| 502 |
-
|
| 503 |
return instance;
|
| 504 |
}
|
| 505 |
|
|
@@ -544,13 +540,19 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 544 |
return !CONTENT_FORMAT_VALUES.has(this.respondWith);
|
| 545 |
}
|
| 546 |
|
| 547 |
-
|
| 548 |
if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
|
| 549 |
return false;
|
| 550 |
}
|
| 551 |
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
| 552 |
return false;
|
| 553 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 554 |
if (this.viewport) {
|
| 555 |
return false;
|
| 556 |
}
|
|
|
|
| 439 |
instance.engine = ENGINE_TYPE.BROWSER;
|
| 440 |
instance.respondWith = CONTENT_FORMAT.VLM;
|
| 441 |
} else if (instance.engine === ENGINE_TYPE.READER_LM) {
|
| 442 |
+
instance.engine = ENGINE_TYPE.AUTO;
|
| 443 |
instance.respondWith = CONTENT_FORMAT.READER_LM;
|
| 444 |
}
|
| 445 |
|
|
|
|
| 496 |
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
| 497 |
}
|
| 498 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 499 |
return instance;
|
| 500 |
}
|
| 501 |
|
|
|
|
| 540 |
return !CONTENT_FORMAT_VALUES.has(this.respondWith);
|
| 541 |
}
|
| 542 |
|
| 543 |
+
browserIsNotRequired() {
|
| 544 |
if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
|
| 545 |
return false;
|
| 546 |
}
|
| 547 |
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
| 548 |
return false;
|
| 549 |
}
|
| 550 |
+
if (this.waitForSelector?.length) {
|
| 551 |
+
return false;
|
| 552 |
+
}
|
| 553 |
+
if (this.withIframe || this.withShadowDom) {
|
| 554 |
+
return false;
|
| 555 |
+
}
|
| 556 |
if (this.viewport) {
|
| 557 |
return false;
|
| 558 |
}
|
backend/functions/src/services/curl.ts
CHANGED
|
@@ -2,11 +2,14 @@ import { marshalErrorLike } from 'civkit/lang';
|
|
| 2 |
import { AsyncService } from 'civkit/async-service';
|
| 3 |
import { singleton } from 'tsyringe';
|
| 4 |
|
| 5 |
-
import { Curl, HeaderInfo } from 'node-libcurl';
|
| 6 |
import { PageSnapshot, ScrappingOptions } from './puppeteer';
|
| 7 |
import { Logger } from '../shared/services/logger';
|
| 8 |
import { JSDomControl } from './jsdom';
|
| 9 |
-
import { AssertionFailureError } from 'civkit';
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
@singleton()
|
| 12 |
export class CurlControl extends AsyncService {
|
|
@@ -16,6 +19,7 @@ export class CurlControl extends AsyncService {
|
|
| 16 |
constructor(
|
| 17 |
protected globalLogger: Logger,
|
| 18 |
protected jsdomControl: JSDomControl,
|
|
|
|
| 19 |
) {
|
| 20 |
super(...arguments);
|
| 21 |
}
|
|
@@ -26,25 +30,55 @@ export class CurlControl extends AsyncService {
|
|
| 26 |
this.emit('ready');
|
| 27 |
}
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions, throwOnNon200 = false): Promise<PageSnapshot> {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
const result = await new Promise<{
|
| 31 |
statusCode: number,
|
| 32 |
-
data:
|
| 33 |
headers: Buffer | HeaderInfo[],
|
| 34 |
}>((resolve, reject) => {
|
| 35 |
const curl = new Curl();
|
|
|
|
| 36 |
curl.setOpt('URL', urlToCrawl.toString());
|
| 37 |
curl.setOpt(Curl.option.FOLLOWLOCATION, true);
|
| 38 |
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
}
|
| 42 |
if (crawlOpts?.overrideUserAgent) {
|
| 43 |
curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent);
|
| 44 |
}
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
| 48 |
if (crawlOpts?.proxyUrl) {
|
| 49 |
curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl);
|
| 50 |
}
|
|
@@ -56,35 +90,82 @@ export class CurlControl extends AsyncService {
|
|
| 56 |
curl.setOpt(Curl.option.REFERER, crawlOpts.referer);
|
| 57 |
}
|
| 58 |
|
| 59 |
-
curl.on('end', (statusCode,
|
| 60 |
this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl}`, { statusCode, headers });
|
| 61 |
-
resolve({
|
| 62 |
-
statusCode,
|
| 63 |
-
data: data.toString(),
|
| 64 |
-
headers,
|
| 65 |
-
});
|
| 66 |
curl.close();
|
| 67 |
});
|
| 68 |
|
| 69 |
curl.on('error', (err) => {
|
| 70 |
-
this.logger.warn(`Failed to curl ${urlToCrawl}`, { err: marshalErrorLike(err) });
|
| 71 |
curl.close();
|
|
|
|
| 72 |
reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: ${err.message}`));
|
| 73 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
curl.perform();
|
| 76 |
});
|
| 77 |
|
| 78 |
if (throwOnNon200 && result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
|
| 79 |
-
throw new AssertionFailureError(`Failed to
|
| 80 |
}
|
| 81 |
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
const curlSnapshot = await this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
|
| 90 |
|
|
|
|
| 2 |
import { AsyncService } from 'civkit/async-service';
|
| 3 |
import { singleton } from 'tsyringe';
|
| 4 |
|
| 5 |
+
import { Curl, CurlFeature, HeaderInfo } from 'node-libcurl';
|
| 6 |
import { PageSnapshot, ScrappingOptions } from './puppeteer';
|
| 7 |
import { Logger } from '../shared/services/logger';
|
| 8 |
import { JSDomControl } from './jsdom';
|
| 9 |
+
import { AssertionFailureError, FancyFile } from 'civkit';
|
| 10 |
+
import { TempFileManager } from '../shared';
|
| 11 |
+
import { readFile } from 'fs/promises';
|
| 12 |
+
import { pathToFileURL } from 'url';
|
| 13 |
|
| 14 |
@singleton()
|
| 15 |
export class CurlControl extends AsyncService {
|
|
|
|
| 19 |
constructor(
|
| 20 |
protected globalLogger: Logger,
|
| 21 |
protected jsdomControl: JSDomControl,
|
| 22 |
+
protected tempFileManager: TempFileManager,
|
| 23 |
) {
|
| 24 |
super(...arguments);
|
| 25 |
}
|
|
|
|
| 30 |
this.emit('ready');
|
| 31 |
}
|
| 32 |
|
| 33 |
+
curlImpersonateHeader(curl: Curl, headers?: object, chromeVersion: number = 132) {
|
| 34 |
+
const mixinHeaders = {
|
| 35 |
+
'sch-ch-ua': `Not A(Brand";v="8", "Chromium";v="${chromeVersion}", "Google Chrome";v="${chromeVersion}"`,
|
| 36 |
+
'sec-ch-ua-mobile': '?0',
|
| 37 |
+
'sec-ch-ua-platform': 'Windows',
|
| 38 |
+
'Upgrade-Insecure-Requests': '1',
|
| 39 |
+
'User-Agent': `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${chromeVersion}.0.0.0 Safari/537.36`,
|
| 40 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
| 41 |
+
'Sec-Fetch-Site': 'none',
|
| 42 |
+
'Sec-Fetch-Mode': 'navigate',
|
| 43 |
+
'Sec-Fetch-User': '?1',
|
| 44 |
+
'Sec-Fetch-Dest': 'document',
|
| 45 |
+
'Accept-Encoding': 'gzip, deflate, br',
|
| 46 |
+
'Accept-Language': 'en-US,en;q=0.9',
|
| 47 |
+
};
|
| 48 |
+
|
| 49 |
+
curl.setOpt(Curl.option.HTTPHEADER, Object.entries({ ...mixinHeaders, ...headers }).map(([k, v]) => `${k}: ${v}`));
|
| 50 |
+
|
| 51 |
+
return curl;
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions, throwOnNon200 = false): Promise<PageSnapshot> {
|
| 55 |
+
const snapshot = {
|
| 56 |
+
href: urlToCrawl.toString(),
|
| 57 |
+
html: '',
|
| 58 |
+
title: '',
|
| 59 |
+
text: '',
|
| 60 |
+
} as PageSnapshot;
|
| 61 |
+
|
| 62 |
const result = await new Promise<{
|
| 63 |
statusCode: number,
|
| 64 |
+
data?: FancyFile,
|
| 65 |
headers: Buffer | HeaderInfo[],
|
| 66 |
}>((resolve, reject) => {
|
| 67 |
const curl = new Curl();
|
| 68 |
+
curl.enable(CurlFeature.StreamResponse);
|
| 69 |
curl.setOpt('URL', urlToCrawl.toString());
|
| 70 |
curl.setOpt(Curl.option.FOLLOWLOCATION, true);
|
| 71 |
|
| 72 |
+
curl.setOpt(Curl.option.TIMEOUT_MS, Math.min(10_000, crawlOpts?.timeoutMs || 10_000));
|
| 73 |
+
|
|
|
|
| 74 |
if (crawlOpts?.overrideUserAgent) {
|
| 75 |
curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent);
|
| 76 |
}
|
| 77 |
+
|
| 78 |
+
this.curlImpersonateHeader(curl, crawlOpts?.extraHeaders);
|
| 79 |
+
// if (crawlOpts?.extraHeaders) {
|
| 80 |
+
// curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`));
|
| 81 |
+
// }
|
| 82 |
if (crawlOpts?.proxyUrl) {
|
| 83 |
curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl);
|
| 84 |
}
|
|
|
|
| 90 |
curl.setOpt(Curl.option.REFERER, crawlOpts.referer);
|
| 91 |
}
|
| 92 |
|
| 93 |
+
curl.on('end', (statusCode, _data, headers) => {
|
| 94 |
this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl}`, { statusCode, headers });
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
curl.close();
|
| 96 |
});
|
| 97 |
|
| 98 |
curl.on('error', (err) => {
|
|
|
|
| 99 |
curl.close();
|
| 100 |
+
this.logger.warn(`Curl ${urlToCrawl}: ${err} (Not necessarily an error)`, { err: marshalErrorLike(err) });
|
| 101 |
reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: ${err.message}`));
|
| 102 |
});
|
| 103 |
+
curl.setOpt(Curl.option.MAXFILESIZE, 1024 * 1024 * 1024); // 1GB
|
| 104 |
+
let status = -1;
|
| 105 |
+
let contentType = '';
|
| 106 |
+
curl.on('stream', (stream, statusCode, headers) => {
|
| 107 |
+
status = statusCode;
|
| 108 |
+
outerLoop:
|
| 109 |
+
for (const headerVec of headers) {
|
| 110 |
+
for (const [k, v] of Object.entries(headerVec)) {
|
| 111 |
+
if (k.toLowerCase() === 'content-type') {
|
| 112 |
+
contentType = v.toLowerCase();
|
| 113 |
+
break outerLoop;
|
| 114 |
+
}
|
| 115 |
+
}
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
if (!contentType) {
|
| 119 |
+
reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: no content-type`));
|
| 120 |
+
stream.destroy();
|
| 121 |
+
return;
|
| 122 |
+
}
|
| 123 |
+
if (contentType.startsWith('image/')) {
|
| 124 |
+
snapshot.html = `<html style="height: 100%;"><head><meta name="viewport" content="width=device-width, minimum-scale=0.1"><title>${urlToCrawl.origin}${urlToCrawl.pathname}</title></head><body style="margin: 0px; height: 100%; background-color: rgb(14, 14, 14);"><img style="display: block;-webkit-user-select: none;margin: auto;background-color: hsl(0, 0%, 90%);transition: background-color 300ms;" src="${urlToCrawl.href}"></body></html>`;
|
| 125 |
+
stream.destroy();
|
| 126 |
+
resolve({
|
| 127 |
+
statusCode: status,
|
| 128 |
+
headers,
|
| 129 |
+
});
|
| 130 |
+
return;
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
const fpath = this.tempFileManager.alloc();
|
| 134 |
+
const fancyFile = FancyFile.auto(stream, fpath);
|
| 135 |
+
this.tempFileManager.bindPathTo(fancyFile, fpath);
|
| 136 |
+
resolve({
|
| 137 |
+
statusCode: status,
|
| 138 |
+
data: fancyFile,
|
| 139 |
+
headers,
|
| 140 |
+
});
|
| 141 |
+
});
|
| 142 |
|
| 143 |
curl.perform();
|
| 144 |
});
|
| 145 |
|
| 146 |
if (throwOnNon200 && result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
|
| 147 |
+
throw new AssertionFailureError(`Failed to access ${urlToCrawl}: HTTP ${result.statusCode}`);
|
| 148 |
}
|
| 149 |
|
| 150 |
+
if (result.data) {
|
| 151 |
+
const mimeType: string = await result.data.mimeType;
|
| 152 |
+
if (mimeType.startsWith('text/html')) {
|
| 153 |
+
if ((await result.data.size) > 1024 * 1024 * 32) {
|
| 154 |
+
throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
|
| 155 |
+
}
|
| 156 |
+
snapshot.html = await readFile(await result.data.filePath, { encoding: 'utf-8' });
|
| 157 |
+
} else if (mimeType.startsWith('text/') || mimeType.startsWith('application/json')) {
|
| 158 |
+
if ((await result.data.size) > 1024 * 1024 * 32) {
|
| 159 |
+
throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
|
| 160 |
+
}
|
| 161 |
+
snapshot.text = await readFile(await result.data.filePath, { encoding: 'utf-8' });
|
| 162 |
+
snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
|
| 163 |
+
} else if (mimeType.startsWith('application/pdf')) {
|
| 164 |
+
snapshot.pdfs = [pathToFileURL(await result.data.filePath).href];
|
| 165 |
+
} else {
|
| 166 |
+
throw new AssertionFailureError(`Failed to access ${urlToCrawl}: unexpected type ${mimeType}`);
|
| 167 |
+
}
|
| 168 |
+
}
|
| 169 |
|
| 170 |
const curlSnapshot = await this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
|
| 171 |
|
backend/functions/src/services/pdf-extract.ts
CHANGED
|
@@ -266,12 +266,12 @@ export class PDFExtractor extends AsyncService {
|
|
| 266 |
return { meta: meta.info as Record<string, any>, content: mdChunks.join(''), text: rawChunks.join('') };
|
| 267 |
}
|
| 268 |
|
| 269 |
-
async cachedExtract(url: string | URL, cacheTolerance: number = 1000 * 3600 * 24) {
|
| 270 |
if (!url) {
|
| 271 |
return undefined;
|
| 272 |
}
|
| 273 |
-
|
| 274 |
-
const digest = md5Hasher.hash(
|
| 275 |
|
| 276 |
const data = url;
|
| 277 |
if (typeof url === 'string' && this.isDataUrl(url)) {
|
|
@@ -283,8 +283,8 @@ export class PDFExtractor extends AsyncService {
|
|
| 283 |
if (cache) {
|
| 284 |
const age = Date.now() - cache?.createdAt.valueOf();
|
| 285 |
const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
|
| 286 |
-
this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for PDF ${url}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
|
| 287 |
-
url, digest, age, stale, cacheTolerance
|
| 288 |
});
|
| 289 |
|
| 290 |
if (!stale) {
|
|
@@ -306,7 +306,7 @@ export class PDFExtractor extends AsyncService {
|
|
| 306 |
text: cached.text
|
| 307 |
};
|
| 308 |
} catch (err) {
|
| 309 |
-
this.logger.warn(`Unable to load cached content for ${url}`, { err });
|
| 310 |
|
| 311 |
return undefined;
|
| 312 |
}
|
|
@@ -324,17 +324,17 @@ export class PDFExtractor extends AsyncService {
|
|
| 324 |
PDFContent.save(
|
| 325 |
PDFContent.from({
|
| 326 |
_id: theID,
|
| 327 |
-
src: url.toString(),
|
| 328 |
meta: extracted?.meta || {},
|
| 329 |
urlDigest: digest,
|
| 330 |
createdAt: new Date(),
|
| 331 |
expireAt: new Date(Date.now() + this.cacheRetentionMs)
|
| 332 |
}).degradeForFireStore()
|
| 333 |
).catch((r) => {
|
| 334 |
-
this.logger.warn(`Unable to cache PDF content for ${url}`, { err: r });
|
| 335 |
});
|
| 336 |
} catch (err) {
|
| 337 |
-
this.logger.warn(`Unable to extract from pdf ${url}`, { err });
|
| 338 |
}
|
| 339 |
|
| 340 |
return extracted;
|
|
|
|
| 266 |
return { meta: meta.info as Record<string, any>, content: mdChunks.join(''), text: rawChunks.join('') };
|
| 267 |
}
|
| 268 |
|
| 269 |
+
async cachedExtract(url: string | URL, cacheTolerance: number = 1000 * 3600 * 24, alternativeUrl?: string) {
|
| 270 |
if (!url) {
|
| 271 |
return undefined;
|
| 272 |
}
|
| 273 |
+
const nameUrl = alternativeUrl || url.toString();
|
| 274 |
+
const digest = md5Hasher.hash(nameUrl);
|
| 275 |
|
| 276 |
const data = url;
|
| 277 |
if (typeof url === 'string' && this.isDataUrl(url)) {
|
|
|
|
| 283 |
if (cache) {
|
| 284 |
const age = Date.now() - cache?.createdAt.valueOf();
|
| 285 |
const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
|
| 286 |
+
this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for PDF ${nameUrl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
|
| 287 |
+
data: url, url: nameUrl, digest, age, stale, cacheTolerance
|
| 288 |
});
|
| 289 |
|
| 290 |
if (!stale) {
|
|
|
|
| 306 |
text: cached.text
|
| 307 |
};
|
| 308 |
} catch (err) {
|
| 309 |
+
this.logger.warn(`Unable to load cached content for ${nameUrl}`, { err });
|
| 310 |
|
| 311 |
return undefined;
|
| 312 |
}
|
|
|
|
| 324 |
PDFContent.save(
|
| 325 |
PDFContent.from({
|
| 326 |
_id: theID,
|
| 327 |
+
src: nameUrl,
|
| 328 |
meta: extracted?.meta || {},
|
| 329 |
urlDigest: digest,
|
| 330 |
createdAt: new Date(),
|
| 331 |
expireAt: new Date(Date.now() + this.cacheRetentionMs)
|
| 332 |
}).degradeForFireStore()
|
| 333 |
).catch((r) => {
|
| 334 |
+
this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
|
| 335 |
});
|
| 336 |
} catch (err) {
|
| 337 |
+
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err });
|
| 338 |
}
|
| 339 |
|
| 340 |
return extracted;
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -48,6 +48,7 @@ export interface PageSnapshot {
|
|
| 48 |
href: string;
|
| 49 |
rebase?: string;
|
| 50 |
html: string;
|
|
|
|
| 51 |
shadowExpanded?: string;
|
| 52 |
text: string;
|
| 53 |
status?: number;
|
|
@@ -369,7 +370,9 @@ function shadowDomPresent(rootElement = document.documentElement) {
|
|
| 369 |
return false;
|
| 370 |
}
|
| 371 |
|
|
|
|
| 372 |
function giveSnapshot(stopActiveSnapshot) {
|
|
|
|
| 373 |
if (stopActiveSnapshot) {
|
| 374 |
window.haltSnapshot = true;
|
| 375 |
}
|
|
@@ -385,6 +388,7 @@ function giveSnapshot(stopActiveSnapshot) {
|
|
| 385 |
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
|
| 386 |
href: document.location.href,
|
| 387 |
html: document.documentElement?.outerHTML,
|
|
|
|
| 388 |
text: document.body?.innerText,
|
| 389 |
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
|
| 390 |
parsed: parsed,
|
|
@@ -392,6 +396,9 @@ function giveSnapshot(stopActiveSnapshot) {
|
|
| 392 |
maxElemDepth: domAnalysis.maxDepth,
|
| 393 |
elemCount: domAnalysis.elementCount,
|
| 394 |
};
|
|
|
|
|
|
|
|
|
|
| 395 |
if (document.baseURI !== r.href) {
|
| 396 |
r.rebase = document.baseURI;
|
| 397 |
}
|
|
@@ -448,6 +455,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 448 |
finalizerMap = new WeakMap<Page, ReturnType<typeof setTimeout>>();
|
| 449 |
snMap = new WeakMap<Page, number>();
|
| 450 |
livePages = new Set<Page>();
|
|
|
|
| 451 |
lastPageCratedAt: number = 0;
|
| 452 |
|
| 453 |
rpsCap: number = 500;
|
|
@@ -491,7 +499,8 @@ export class PuppeteerControl extends AsyncService {
|
|
| 491 |
}
|
| 492 |
}
|
| 493 |
this.browser = await puppeteer.launch({
|
| 494 |
-
timeout: 10_000
|
|
|
|
| 495 |
}).catch((err: any) => {
|
| 496 |
this.logger.error(`Unknown firebase issue, just die fast.`, { err });
|
| 497 |
process.nextTick(() => {
|
|
@@ -611,7 +620,14 @@ export class PuppeteerControl extends AsyncService {
|
|
| 611 |
const dt = Math.ceil((Date.now() - t0) / 1000);
|
| 612 |
const rps = reqCounter / dt;
|
| 613 |
// console.log(`rps: ${rps}`);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 614 |
|
|
|
|
|
|
|
|
|
|
| 615 |
if (reqCounter > 1000) {
|
| 616 |
if (rps > 60 || reqCounter > 2000) {
|
| 617 |
page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many requests` });
|
|
@@ -676,6 +692,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 676 |
this.logger.info(`Page ${sn} created.`);
|
| 677 |
this.lastPageCratedAt = Date.now();
|
| 678 |
this.livePages.add(page);
|
|
|
|
| 679 |
|
| 680 |
return page;
|
| 681 |
}
|
|
@@ -717,7 +734,6 @@ export class PuppeteerControl extends AsyncService {
|
|
| 717 |
}
|
| 718 |
const sn = this.snMap.get(page);
|
| 719 |
this.logger.info(`Closing page ${sn}`);
|
| 720 |
-
this.livePages.delete(page);
|
| 721 |
await Promise.race([
|
| 722 |
(async () => {
|
| 723 |
const ctx = page.browserContext();
|
|
@@ -731,6 +747,8 @@ export class PuppeteerControl extends AsyncService {
|
|
| 731 |
]).catch((err) => {
|
| 732 |
this.logger.error(`Failed to destroy page ${sn}`, { err: marshalErrorLike(err) });
|
| 733 |
});
|
|
|
|
|
|
|
| 734 |
}
|
| 735 |
|
| 736 |
async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
|
|
@@ -743,6 +761,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 743 |
const pdfUrls: string[] = [];
|
| 744 |
let navigationResponse: HTTPResponse | undefined;
|
| 745 |
const page = await this.getNextPage();
|
|
|
|
| 746 |
page.on('response', (resp) => {
|
| 747 |
if (resp.request().isNavigationRequest()) {
|
| 748 |
navigationResponse = resp;
|
|
@@ -802,8 +821,6 @@ export class PuppeteerControl extends AsyncService {
|
|
| 802 |
}
|
| 803 |
const sn = this.snMap.get(page);
|
| 804 |
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
| 805 |
-
|
| 806 |
-
this.logger.info(`Locale setting: ${options?.locale}`);
|
| 807 |
if (options?.locale) {
|
| 808 |
// Add headers via request interception to walk around this bug
|
| 809 |
// https://github.com/puppeteer/puppeteer/issues/10235
|
|
@@ -896,6 +913,10 @@ export class PuppeteerControl extends AsyncService {
|
|
| 896 |
page.on('snapshot', hdl);
|
| 897 |
page.once('abuse', (event: any) => {
|
| 898 |
this.emit('abuse', { ...event, url: parsedUrl });
|
|
|
|
|
|
|
|
|
|
|
|
|
| 899 |
nextSnapshotDeferred.reject(
|
| 900 |
new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
|
| 901 |
);
|
|
@@ -1071,6 +1092,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 1071 |
}
|
| 1072 |
}
|
| 1073 |
} finally {
|
|
|
|
| 1074 |
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
|
| 1075 |
page.off('snapshot', hdl);
|
| 1076 |
this.ditchPage(page);
|
|
|
|
| 48 |
href: string;
|
| 49 |
rebase?: string;
|
| 50 |
html: string;
|
| 51 |
+
htmlModifiedByJs?: boolean;
|
| 52 |
shadowExpanded?: string;
|
| 53 |
text: string;
|
| 54 |
status?: number;
|
|
|
|
| 370 |
return false;
|
| 371 |
}
|
| 372 |
|
| 373 |
+
let initialHTML;
|
| 374 |
function giveSnapshot(stopActiveSnapshot) {
|
| 375 |
+
initialHTML ??= document.documentElement?.outerHTML;
|
| 376 |
if (stopActiveSnapshot) {
|
| 377 |
window.haltSnapshot = true;
|
| 378 |
}
|
|
|
|
| 388 |
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
|
| 389 |
href: document.location.href,
|
| 390 |
html: document.documentElement?.outerHTML,
|
| 391 |
+
htmlModifiedByJs: false,
|
| 392 |
text: document.body?.innerText,
|
| 393 |
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
|
| 394 |
parsed: parsed,
|
|
|
|
| 396 |
maxElemDepth: domAnalysis.maxDepth,
|
| 397 |
elemCount: domAnalysis.elementCount,
|
| 398 |
};
|
| 399 |
+
if (initialHTML) {
|
| 400 |
+
r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded;
|
| 401 |
+
}
|
| 402 |
if (document.baseURI !== r.href) {
|
| 403 |
r.rebase = document.baseURI;
|
| 404 |
}
|
|
|
|
| 455 |
finalizerMap = new WeakMap<Page, ReturnType<typeof setTimeout>>();
|
| 456 |
snMap = new WeakMap<Page, number>();
|
| 457 |
livePages = new Set<Page>();
|
| 458 |
+
pagePhase = new WeakMap<Page, 'idle' | 'active' | 'background'>();
|
| 459 |
lastPageCratedAt: number = 0;
|
| 460 |
|
| 461 |
rpsCap: number = 500;
|
|
|
|
| 499 |
}
|
| 500 |
}
|
| 501 |
this.browser = await puppeteer.launch({
|
| 502 |
+
timeout: 10_000,
|
| 503 |
+
args: ['--disable-dev-shm-usage']
|
| 504 |
}).catch((err: any) => {
|
| 505 |
this.logger.error(`Unknown firebase issue, just die fast.`, { err });
|
| 506 |
process.nextTick(() => {
|
|
|
|
| 620 |
const dt = Math.ceil((Date.now() - t0) / 1000);
|
| 621 |
const rps = reqCounter / dt;
|
| 622 |
// console.log(`rps: ${rps}`);
|
| 623 |
+
const pagePhase = this.pagePhase.get(page);
|
| 624 |
+
if (pagePhase === 'background') {
|
| 625 |
+
if (rps > 10 || reqCounter > 1000) {
|
| 626 |
+
halt = true;
|
| 627 |
|
| 628 |
+
return req.abort('blockedbyclient', 1000);
|
| 629 |
+
}
|
| 630 |
+
}
|
| 631 |
if (reqCounter > 1000) {
|
| 632 |
if (rps > 60 || reqCounter > 2000) {
|
| 633 |
page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many requests` });
|
|
|
|
| 692 |
this.logger.info(`Page ${sn} created.`);
|
| 693 |
this.lastPageCratedAt = Date.now();
|
| 694 |
this.livePages.add(page);
|
| 695 |
+
this.pagePhase.set(page, 'idle');
|
| 696 |
|
| 697 |
return page;
|
| 698 |
}
|
|
|
|
| 734 |
}
|
| 735 |
const sn = this.snMap.get(page);
|
| 736 |
this.logger.info(`Closing page ${sn}`);
|
|
|
|
| 737 |
await Promise.race([
|
| 738 |
(async () => {
|
| 739 |
const ctx = page.browserContext();
|
|
|
|
| 747 |
]).catch((err) => {
|
| 748 |
this.logger.error(`Failed to destroy page ${sn}`, { err: marshalErrorLike(err) });
|
| 749 |
});
|
| 750 |
+
this.livePages.delete(page);
|
| 751 |
+
this.pagePhase.delete(page);
|
| 752 |
}
|
| 753 |
|
| 754 |
async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
|
|
|
|
| 761 |
const pdfUrls: string[] = [];
|
| 762 |
let navigationResponse: HTTPResponse | undefined;
|
| 763 |
const page = await this.getNextPage();
|
| 764 |
+
this.pagePhase.set(page, 'active');
|
| 765 |
page.on('response', (resp) => {
|
| 766 |
if (resp.request().isNavigationRequest()) {
|
| 767 |
navigationResponse = resp;
|
|
|
|
| 821 |
}
|
| 822 |
const sn = this.snMap.get(page);
|
| 823 |
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
|
|
|
|
|
|
| 824 |
if (options?.locale) {
|
| 825 |
// Add headers via request interception to walk around this bug
|
| 826 |
// https://github.com/puppeteer/puppeteer/issues/10235
|
|
|
|
| 913 |
page.on('snapshot', hdl);
|
| 914 |
page.once('abuse', (event: any) => {
|
| 915 |
this.emit('abuse', { ...event, url: parsedUrl });
|
| 916 |
+
if (snapshot?.href && parsedUrl.href !== snapshot.href) {
|
| 917 |
+
this.emit('abuse', { ...event, url: snapshot.href });
|
| 918 |
+
}
|
| 919 |
+
|
| 920 |
nextSnapshotDeferred.reject(
|
| 921 |
new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
|
| 922 |
);
|
|
|
|
| 1092 |
}
|
| 1093 |
}
|
| 1094 |
} finally {
|
| 1095 |
+
this.pagePhase.set(page, 'background');
|
| 1096 |
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
|
| 1097 |
page.off('snapshot', hdl);
|
| 1098 |
this.ditchPage(page);
|
backend/functions/src/services/snapshot-formatter.ts
CHANGED
|
@@ -152,7 +152,8 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 152 |
// in case of Google Web Cache content
|
| 153 |
if (snapshot.pdfs?.length && (!snapshot.title || snapshot.title.startsWith('cache:'))) {
|
| 154 |
const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0],
|
| 155 |
-
this.threadLocal.get('cacheTolerance')
|
|
|
|
| 156 |
);
|
| 157 |
if (pdf) {
|
| 158 |
pdfMode = true;
|
|
|
|
| 152 |
// in case of Google Web Cache content
|
| 153 |
if (snapshot.pdfs?.length && (!snapshot.title || snapshot.title.startsWith('cache:'))) {
|
| 154 |
const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0],
|
| 155 |
+
this.threadLocal.get('cacheTolerance'),
|
| 156 |
+
snapshot.pdfs[0].startsWith('http') ? undefined : snapshot.href,
|
| 157 |
);
|
| 158 |
if (pdf) {
|
| 159 |
pdfMode = true;
|
backend/functions/src/stand-alone/crawl.ts
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import 'reflect-metadata';
|
| 2 |
+
import { container, singleton } from 'tsyringe';
|
| 3 |
+
import { initializeApp, applicationDefault } from 'firebase-admin/app';
|
| 4 |
+
|
| 5 |
+
process.env['FIREBASE_CONFIG'] ??= JSON.stringify({
|
| 6 |
+
projectId: process.env['GCLOUD_PROJECT'] || 'reader-6b7dc',
|
| 7 |
+
storageBucket: `${process.env['GCLOUD_PROJECT'] || 'reader-6b7dc'}.appspot.com`,
|
| 8 |
+
credential: applicationDefault(),
|
| 9 |
+
});
|
| 10 |
+
|
| 11 |
+
initializeApp();
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
import { Logger, CloudFunctionRegistry } from '../shared';
|
| 15 |
+
import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
|
| 16 |
+
import { ExpressServer } from 'civkit/civ-rpc/express';
|
| 17 |
+
import http2 from 'http2';
|
| 18 |
+
import { CrawlerHost } from '../cloud-functions/crawler';
|
| 19 |
+
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
|
| 20 |
+
import path from 'path';
|
| 21 |
+
import fs from 'fs';
|
| 22 |
+
import { mimeOfExt } from 'civkit/mime';
|
| 23 |
+
import { NextFunction, Request, Response } from 'express';
|
| 24 |
+
|
| 25 |
+
process.on('unhandledRejection', (err) => {
|
| 26 |
+
console.error('Unhandled rejection', err);
|
| 27 |
+
});
|
| 28 |
+
|
| 29 |
+
process.on('uncaughtException', (err) => {
|
| 30 |
+
console.log('Uncaught exception', err);
|
| 31 |
+
|
| 32 |
+
// Looks like Firebase runtime does not handle error properly.
|
| 33 |
+
// Make sure to quit the process.
|
| 34 |
+
console.error('Uncaught exception, process quit.');
|
| 35 |
+
process.nextTick(() => process.exit(1));
|
| 36 |
+
});
|
| 37 |
+
|
| 38 |
+
@singleton()
|
| 39 |
+
export class CrawlStandAloneServer extends ExpressServer {
|
| 40 |
+
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 41 |
+
|
| 42 |
+
httpAlternativeServer?: typeof this['httpServer'];
|
| 43 |
+
assets = new Map<string, WalkOutEntity>();
|
| 44 |
+
|
| 45 |
+
constructor(
|
| 46 |
+
protected globalLogger: Logger,
|
| 47 |
+
protected registry: CloudFunctionRegistry,
|
| 48 |
+
protected crawlerHost: CrawlerHost,
|
| 49 |
+
) {
|
| 50 |
+
super(...arguments);
|
| 51 |
+
|
| 52 |
+
registry.allHandsOnDeck().catch(() => void 0);
|
| 53 |
+
registry.title = 'reader';
|
| 54 |
+
registry.version = '0.1.0';
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
h2c() {
|
| 58 |
+
this.httpAlternativeServer = this.httpServer;
|
| 59 |
+
this.httpServer = http2.createServer(this.expressApp);
|
| 60 |
+
// useResourceBasedDefaultTracker();
|
| 61 |
+
|
| 62 |
+
return this;
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
override async init() {
|
| 66 |
+
await this.walkForAssets();
|
| 67 |
+
await super.init();
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
async walkForAssets() {
|
| 71 |
+
const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public'));
|
| 72 |
+
|
| 73 |
+
for (const file of files) {
|
| 74 |
+
if (file.type !== 'file') {
|
| 75 |
+
continue;
|
| 76 |
+
}
|
| 77 |
+
this.assets.set(file.relativePath.toString(), file);
|
| 78 |
+
}
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
makeAssetsServingController() {
|
| 82 |
+
return (req: Request, res: Response, next: NextFunction) => {
|
| 83 |
+
const requestPath = req.url;
|
| 84 |
+
const file = requestPath.slice(1);
|
| 85 |
+
if (!file) {
|
| 86 |
+
return next();
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
const asset = this.assets.get(file);
|
| 90 |
+
if (asset?.type !== 'file') {
|
| 91 |
+
return next();
|
| 92 |
+
}
|
| 93 |
+
res.type(mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream');
|
| 94 |
+
res.set('Content-Length', asset.stats.size.toString());
|
| 95 |
+
fs.createReadStream(asset.path).pipe(res);
|
| 96 |
+
|
| 97 |
+
return;
|
| 98 |
+
};
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
override listen(port: number) {
|
| 102 |
+
const r = super.listen(port);
|
| 103 |
+
if (this.httpAlternativeServer) {
|
| 104 |
+
const altPort = port + 1;
|
| 105 |
+
this.httpAlternativeServer.listen(altPort, () => {
|
| 106 |
+
this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
|
| 107 |
+
});
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
return r;
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
override registerRoutes(): void {
|
| 114 |
+
|
| 115 |
+
const openAPIManager = new OpenAPIManager();
|
| 116 |
+
openAPIManager.document('/{url}', ['get', 'post'], this.registry.conf.get('crawl')!);
|
| 117 |
+
const openapiJsonPath = '/openapi.json';
|
| 118 |
+
this.expressRootRouter.get(openapiJsonPath, (req, res) => {
|
| 119 |
+
const baseURL = new URL(req.url, `${req.protocol}://${req.headers.host}`);
|
| 120 |
+
baseURL.pathname = baseURL.pathname.replace(new RegExp(`${openapiJsonPath}$`, 'i'), '').replace(/\/+$/g, '');
|
| 121 |
+
baseURL.search = '';
|
| 122 |
+
const content = openAPIManager.createOpenAPIObject(baseURL.toString(), {
|
| 123 |
+
info: {
|
| 124 |
+
title: this.registry.title,
|
| 125 |
+
description: `${this.registry.title} openAPI documentations`,
|
| 126 |
+
'x-logo': {
|
| 127 |
+
url: this.registry.logoUrl || `https://www.openapis.org/wp-content/uploads/sites/3/2018/02/OpenAPI_Logo_Pantone-1.png`
|
| 128 |
+
}
|
| 129 |
+
}
|
| 130 |
+
}, (this.registry.constructor as typeof AbstractRPCRegistry).envelope, req.query as any);
|
| 131 |
+
res.statusCode = 200;
|
| 132 |
+
res.end(JSON.stringify(content));
|
| 133 |
+
});
|
| 134 |
+
|
| 135 |
+
this.expressRootRouter.use('/', ...this.registry.expressMiddlewares, this.makeAssetsServingController(), this.registry.makeShimController('crawl'));
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
protected override featureSelect(): void {
|
| 139 |
+
this.insertAsyncHookMiddleware();
|
| 140 |
+
this.insertHealthCheckMiddleware(this.healthCheckEndpoint);
|
| 141 |
+
this.insertLogRequestsMiddleware();
|
| 142 |
+
this.registerOpenAPIDocsRoutes('/docs');
|
| 143 |
+
|
| 144 |
+
this.registerRoutes();
|
| 145 |
+
}
|
| 146 |
+
}
|
| 147 |
+
const instance = container.resolve(CrawlStandAloneServer);
|
| 148 |
+
|
| 149 |
+
export default instance;
|
| 150 |
+
|
| 151 |
+
instance.serviceReady().then((s) => s.listen(parseInt(process.env.PORT || '') || 3000));
|
backend/functions/src/stand-alone/search.ts
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import 'reflect-metadata';
|
| 2 |
+
import { container, singleton } from 'tsyringe';
|
| 3 |
+
import { initializeApp, applicationDefault } from 'firebase-admin/app';
|
| 4 |
+
|
| 5 |
+
process.env['FIREBASE_CONFIG'] ??= JSON.stringify({
|
| 6 |
+
projectId: process.env['GCLOUD_PROJECT'] || 'reader-6b7dc',
|
| 7 |
+
storageBucket: `${process.env['GCLOUD_PROJECT'] || 'reader-6b7dc'}.appspot.com`,
|
| 8 |
+
credential: applicationDefault(),
|
| 9 |
+
});
|
| 10 |
+
|
| 11 |
+
initializeApp();
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
import { Logger, CloudFunctionRegistry } from '../shared';
|
| 15 |
+
import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
|
| 16 |
+
import { ExpressServer } from 'civkit/civ-rpc/express';
|
| 17 |
+
import http2 from 'http2';
|
| 18 |
+
import { SearcherHost } from '../cloud-functions/searcher';
|
| 19 |
+
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
|
| 20 |
+
import path from 'path';
|
| 21 |
+
import fs from 'fs';
|
| 22 |
+
import { mimeOfExt } from 'civkit/mime';
|
| 23 |
+
import { NextFunction, Request, Response } from 'express';
|
| 24 |
+
|
| 25 |
+
process.on('unhandledRejection', (err) => {
|
| 26 |
+
console.error('Unhandled rejection', err);
|
| 27 |
+
});
|
| 28 |
+
|
| 29 |
+
process.on('uncaughtException', (err) => {
|
| 30 |
+
console.log('Uncaught exception', err);
|
| 31 |
+
|
| 32 |
+
// Looks like Firebase runtime does not handle error properly.
|
| 33 |
+
// Make sure to quit the process.
|
| 34 |
+
console.error('Uncaught exception, process quit.');
|
| 35 |
+
process.nextTick(() => process.exit(1));
|
| 36 |
+
});
|
| 37 |
+
|
| 38 |
+
@singleton()
|
| 39 |
+
export class SearchStandAloneServer extends ExpressServer {
|
| 40 |
+
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 41 |
+
|
| 42 |
+
httpAlternativeServer?: typeof this['httpServer'];
|
| 43 |
+
assets = new Map<string, WalkOutEntity>();
|
| 44 |
+
|
| 45 |
+
constructor(
|
| 46 |
+
protected globalLogger: Logger,
|
| 47 |
+
protected registry: CloudFunctionRegistry,
|
| 48 |
+
protected searcherHost: SearcherHost,
|
| 49 |
+
) {
|
| 50 |
+
super(...arguments);
|
| 51 |
+
|
| 52 |
+
registry.allHandsOnDeck().catch(() => void 0);
|
| 53 |
+
registry.title = 'reader';
|
| 54 |
+
registry.version = '0.1.0';
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
h2c() {
|
| 58 |
+
this.httpAlternativeServer = this.httpServer;
|
| 59 |
+
this.httpServer = http2.createServer(this.expressApp);
|
| 60 |
+
// useResourceBasedDefaultTracker();
|
| 61 |
+
|
| 62 |
+
return this;
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
override async init() {
|
| 66 |
+
await this.walkForAssets();
|
| 67 |
+
await super.init();
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
async walkForAssets() {
|
| 71 |
+
const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public'));
|
| 72 |
+
|
| 73 |
+
for (const file of files) {
|
| 74 |
+
if (file.type !== 'file') {
|
| 75 |
+
continue;
|
| 76 |
+
}
|
| 77 |
+
this.assets.set(file.relativePath.toString(), file);
|
| 78 |
+
}
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
makeAssetsServingController() {
|
| 82 |
+
return (req: Request, res: Response, next: NextFunction) => {
|
| 83 |
+
const requestPath = req.url;
|
| 84 |
+
const file = requestPath.slice(1);
|
| 85 |
+
if (!file) {
|
| 86 |
+
return next();
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
const asset = this.assets.get(file);
|
| 90 |
+
if (asset?.type !== 'file') {
|
| 91 |
+
return next();
|
| 92 |
+
}
|
| 93 |
+
res.type(mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream');
|
| 94 |
+
res.set('Content-Length', asset.stats.size.toString());
|
| 95 |
+
fs.createReadStream(asset.path).pipe(res);
|
| 96 |
+
|
| 97 |
+
return;
|
| 98 |
+
};
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
override listen(port: number) {
|
| 102 |
+
const r = super.listen(port);
|
| 103 |
+
if (this.httpAlternativeServer) {
|
| 104 |
+
const altPort = port + 1;
|
| 105 |
+
this.httpAlternativeServer.listen(altPort, () => {
|
| 106 |
+
this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
|
| 107 |
+
});
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
return r;
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
override registerRoutes(): void {
|
| 114 |
+
|
| 115 |
+
const openAPIManager = new OpenAPIManager();
|
| 116 |
+
openAPIManager.document('/{q}', ['get', 'post'], this.registry.conf.get('search')!);
|
| 117 |
+
const openapiJsonPath = '/openapi.json';
|
| 118 |
+
this.expressRootRouter.get(openapiJsonPath, (req, res) => {
|
| 119 |
+
const baseURL = new URL(req.url, `${req.protocol}://${req.headers.host}`);
|
| 120 |
+
baseURL.pathname = baseURL.pathname.replace(new RegExp(`${openapiJsonPath}$`, 'i'), '').replace(/\/+$/g, '');
|
| 121 |
+
baseURL.search = '';
|
| 122 |
+
const content = openAPIManager.createOpenAPIObject(baseURL.toString(), {
|
| 123 |
+
info: {
|
| 124 |
+
title: this.registry.title,
|
| 125 |
+
description: `${this.registry.title} openAPI documentations`,
|
| 126 |
+
'x-logo': {
|
| 127 |
+
url: this.registry.logoUrl || `https://www.openapis.org/wp-content/uploads/sites/3/2018/02/OpenAPI_Logo_Pantone-1.png`
|
| 128 |
+
}
|
| 129 |
+
}
|
| 130 |
+
}, (this.registry.constructor as typeof AbstractRPCRegistry).envelope, req.query as any);
|
| 131 |
+
res.statusCode = 200;
|
| 132 |
+
res.end(JSON.stringify(content));
|
| 133 |
+
});
|
| 134 |
+
|
| 135 |
+
this.expressRootRouter.use('/', ...this.registry.expressMiddlewares, this.makeAssetsServingController(), this.registry.makeShimController('search'));
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
protected override featureSelect(): void {
|
| 139 |
+
this.insertAsyncHookMiddleware();
|
| 140 |
+
this.insertHealthCheckMiddleware(this.healthCheckEndpoint);
|
| 141 |
+
this.insertLogRequestsMiddleware();
|
| 142 |
+
this.registerOpenAPIDocsRoutes('/docs');
|
| 143 |
+
|
| 144 |
+
this.registerRoutes();
|
| 145 |
+
}
|
| 146 |
+
}
|
| 147 |
+
const instance = container.resolve(SearchStandAloneServer);
|
| 148 |
+
|
| 149 |
+
export default instance;
|
| 150 |
+
|
| 151 |
+
instance.serviceReady().then((s) => s.listen(parseInt(process.env.PORT || '') || 3000));
|