nomagick committed on
Commit
6a58de5
·
unverified ·
1 Parent(s): a453ab5

deployment: dedicated server script for cloud-run (#1139)

Browse files

* refactor: domain profile and attempt direct engine

* fix: direct engine

* fix: abuse in background phase

* fix

* wip

* use curl-impersonate in custom image

* local pdf for curl

* listen port from env

* fix

* fix

* fix

* fix: ditch http2

* cd: using gh action

* ci: token for thinapps-shared

* ci: setup node lock file path

* ci: tweak

* ci: mmdb

* ci: docker build

* fix: ci

* fix: ci

.github/workflows/cd.yml ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Continuous delivery: build the backend, bake it into a container image,
# push the image to Artifact Registry and roll it out to the `crawl` and
# `search` Cloud Run services.
run-name: Build push and deploy (CD)
on:
  push:
    branches:
      - main
      - ci-debug
    tags:
      - '*'

jobs:
  build-and-push-to-gcr:
    runs-on: ubuntu-latest
    concurrency:
      # One group per ref. The previous expression
      # (`github.ref_type == 'branch' && github.ref`) evaluated to `false` for
      # every tag push, so all tag builds shared a single group and cancelled
      # each other.
      group: cd-${{ github.ref }}
      # Newer pushes supersede in-flight builds on branches only; tag builds
      # are allowed to run to completion.
      cancel-in-progress: ${{ github.ref_type == 'branch' }}
    defaults:
      run:
        working-directory: backend/functions
    permissions:
      contents: read
    steps:
      - uses: actions/checkout@v4
        with:
          lfs: true
          submodules: true
          token: ${{ secrets.THINAPPS_SHARED_READ_TOKEN }}
      - uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: '${{ secrets.GCLOUD_SERVICE_ACCOUNT_SECRET_JSON }}'
      - name: 'Set up Cloud SDK'
        uses: 'google-github-actions/setup-gcloud@v2'
      - name: "Docker auth"
        run: |-
          gcloud auth configure-docker us-docker.pkg.dev --quiet
      - name: Set controller release version
        # Strips the leading `refs/<kind>/` so RELEASE_VERSION is the bare
        # branch or tag name.
        run: echo "RELEASE_VERSION=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: 22.12.0
          cache: npm
          cache-dependency-path: backend/functions/package-lock.json

      - name: npm install
        run: npm ci
      - name: get maxmind mmdb
        # -L follows the redirect issued for github.com `/raw/` URLs and -f
        # fails the step on an HTTP error instead of saving the error body
        # as the database file.
        run: mkdir -p licensed && curl -fsSL --retry 3 -o licensed/GeoLite2-City.mmdb https://github.com/P3TERX/GeoLite.mmdb/raw/download/GeoLite2-City.mmdb
      - name: build application
        run: npm run build
      - name: Set package version
        # Use the environment variable rather than `${{ }}` interpolation so
        # the ref name is never spliced into the script text.
        run: npm version --no-git-tag-version "$RELEASE_VERSION"
        if: github.ref_type == 'tag'
      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: |
            us-docker.pkg.dev/reader-6b7dc/jina-reader/reader
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build and push
        id: container
        uses: docker/build-push-action@v6
        with:
          context: backend/functions
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
      - name: Deploy CRAWL with Tag
        run: |
          gcloud run deploy crawl \
            --image "us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{ steps.container.outputs.imageid }}" \
            --tag "$RELEASE_VERSION" \
            --command '' \
            --args build/stand-alone/crawl.js \
            --region us-central1 \
            --async \
            --min-instances 0
      - name: Deploy SEARCH with Tag
        run: |
          gcloud run deploy search \
            --image "us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{ steps.container.outputs.imageid }}" \
            --tag "$RELEASE_VERSION" \
            --command '' \
            --args build/stand-alone/search.js \
            --region us-central1 \
            --async \
            --min-instances 0
backend/functions/.dockerignore ADDED
@@ -0,0 +1 @@
 
 
1
+ node_modules/
backend/functions/Dockerfile ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# syntax=docker/dockerfile:1

# Source stage: only used to copy the prebuilt libcurl-impersonate shared
# library into the runtime image.
FROM lwthiker/curl-impersonate:0.6-chrome-slim-bullseye AS curl-impersonate

# Runtime image.
# NOTE(review): the CD workflow builds with Node 22.12.0 while this image runs
# Node 20 — confirm the mismatch is intended.
FROM node:20

# Install Google Chrome and the fonts needed to render CJK/Thai/Arabic pages.
# The repository key is stored in a dedicated keyring referenced via
# `signed-by` instead of the deprecated `apt-key add`.
RUN apt-get update \
    && apt-get install -y wget gnupg \
    && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub \
        | gpg --dearmor -o /usr/share/keyrings/googlechrome-linux-keyring.gpg \
    && sh -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/googlechrome-linux-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
    && apt-get update \
    && apt-get install -y google-chrome-stable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-freefont-ttf libxss1 \
        --no-install-recommends \
    && rm -rf /var/lib/apt/lists/*

COPY --from=curl-impersonate /usr/local/lib/libcurl-impersonate.so /usr/local/lib/libcurl-impersonate.so

# Unprivileged runtime user. /app is created and handed over explicitly so
# `npm ci` can write node_modules regardless of how the builder assigns
# ownership to directories created by WORKDIR.
RUN groupadd -r jina \
    && useradd -g jina -G audio,video -m jina \
    && mkdir -p /app \
    && chown jina:jina /app
USER jina

WORKDIR /app

# Install dependencies first so this layer is reused when only sources change.
COPY package.json package-lock.json ./
RUN npm ci

COPY build ./build
COPY public ./public
COPY licensed ./licensed

# Start from an empty browser profile directory.
RUN rm -rf ~/.config/chromium && mkdir -p ~/.config/chromium

# Preload libcurl-impersonate so libcurl consumers in the process present a
# Chrome-like fingerprint.
ENV LD_PRELOAD=/usr/local/lib/libcurl-impersonate.so CURL_IMPERSONATE=chrome116 CURL_IMPERSONATE_HEADERS=no
ENV PORT=8080

EXPOSE 3000 3001 8080 8081
ENTRYPOINT ["node"]
CMD [ "build/stand-alone/crawl.js" ]
backend/functions/package-lock.json CHANGED
@@ -16,7 +16,7 @@
16
  "axios": "^1.3.3",
17
  "bcrypt": "^5.1.0",
18
  "busboy": "^1.6.0",
19
- "civkit": "^0.8.2-4c0357a",
20
  "core-js": "^3.37.1",
21
  "cors": "^2.8.5",
22
  "dayjs": "^1.11.9",
@@ -3979,9 +3979,9 @@
3979
  }
3980
  },
3981
  "node_modules/civkit": {
3982
- "version": "0.8.2-4c0357a",
3983
- "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-4c0357a.tgz",
3984
- "integrity": "sha512-8/RcapAm8YYImf+YVBRhybEFuSuV5Pg1p/s6Niql3VAY2cV1/OC1fTCDZY689yeq8zFcwxwBvaqyIEGo69F+IA==",
3985
  "license": "AGPL",
3986
  "dependencies": {
3987
  "lodash": "^4.17.21",
 
16
  "axios": "^1.3.3",
17
  "bcrypt": "^5.1.0",
18
  "busboy": "^1.6.0",
19
+ "civkit": "^0.8.2-03243fe",
20
  "core-js": "^3.37.1",
21
  "cors": "^2.8.5",
22
  "dayjs": "^1.11.9",
 
3979
  }
3980
  },
3981
  "node_modules/civkit": {
3982
+ "version": "0.8.2-03243fe",
3983
+ "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-03243fe.tgz",
3984
+ "integrity": "sha512-hoTxGeGdD27iOCDi51cVY0PHlRN3OSC640QRJ1YSmD42o+LP7mZtbdy8dN7j/FSkPP/5yLuB2ch9BMSOp54POQ==",
3985
  "license": "AGPL",
3986
  "dependencies": {
3987
  "lodash": "^4.17.21",
backend/functions/package.json CHANGED
@@ -36,7 +36,7 @@
36
  "axios": "^1.3.3",
37
  "bcrypt": "^5.1.0",
38
  "busboy": "^1.6.0",
39
- "civkit": "^0.8.2-4c0357a",
40
  "core-js": "^3.37.1",
41
  "cors": "^2.8.5",
42
  "dayjs": "^1.11.9",
 
36
  "axios": "^1.3.3",
37
  "bcrypt": "^5.1.0",
38
  "busboy": "^1.6.0",
39
+ "civkit": "^0.8.2-03243fe",
40
  "core-js": "^3.37.1",
41
  "cors": "^2.8.5",
42
  "dayjs": "^1.11.9",
backend/functions/public/favicon.ico ADDED
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -84,6 +84,8 @@ export class CrawlerHost extends RPCHost {
84
  Reflect.set(snapshot, 'locale', options.locale);
85
  }
86
  await this.setToCache(options.url, snapshot);
 
 
87
  });
88
 
89
  puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
@@ -581,9 +583,14 @@ export class CrawlerHost extends RPCHost {
581
 
582
  if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.READER_LM)) {
583
  const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
584
- ...crawlOpts, engine: ENGINE_TYPE.AUTO
 
585
  }, crawlerOpts);
586
 
 
 
 
 
587
  if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
588
  const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
589
  yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot);
@@ -628,18 +635,9 @@ export class CrawlerHost extends RPCHost {
628
  return;
629
  }
630
 
631
- if (crawlOpts?.engine?.startsWith(ENGINE_TYPE.DIRECT)) {
632
- const engine = crawlOpts?.engine;
633
- try {
634
- const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
635
- yield snapshot;
636
-
637
- return;
638
- } catch (err) {
639
- if (!engine.endsWith('?')) {
640
- throw err;
641
- }
642
- }
643
  }
644
 
645
  let cache;
@@ -658,6 +656,24 @@ export class CrawlerHost extends RPCHost {
658
  return;
659
  }
660
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
661
  try {
662
  if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe || crawlOpts?.withShadowDom) {
663
  for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
@@ -855,7 +871,7 @@ export class CrawlerHost extends RPCHost {
855
  }
856
 
857
  async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
858
- const it = this.cachedScrap(url, { ...opts, engine: ENGINE_TYPE.BROWSER }, crawlerOptions);
859
 
860
  let lastSnapshot;
861
  let lastError;
@@ -912,36 +928,54 @@ export class CrawlerHost extends RPCHost {
912
  return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
913
  }
914
 
915
- async exploreDirectEngine(targetUrl: URL, crawlerOptions: ScrappingOptions, knownSnapshot: PageSnapshot) {
916
- const snapshot = await this.curlControl.urlToSnapshot(targetUrl, crawlerOptions, true);
917
-
918
- const thisFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', snapshot);
919
- const knownFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', knownSnapshot);
920
-
921
- let engine = ENGINE_TYPE.DIRECT;
922
- if (!(thisFormatted.content && knownFormatted.content &&
923
- thisFormatted.content.trim() === knownFormatted.content.trim())) {
924
- engine = ENGINE_TYPE.BROWSER;
925
- }
926
-
927
  const realUrl = new URL(knownSnapshot.href);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
928
 
929
- const profile = (await DomainProfile.fromFirestoreQuery(
930
- DomainProfile.COLLECTION
931
- .where('domain', '==', targetUrl.origin.toLowerCase())
932
- .limit(1)
933
- ))[0] || new DomainProfile();
934
 
 
 
 
 
935
 
936
  profile.origin = realUrl.origin.toLowerCase();
937
- profile.triggerReason ??= 'Auto Explore';
938
  profile.triggerUrl = realUrl.href;
939
- profile.engine = engine;
940
- profile.createdAt ??= new Date();
941
  profile.expireAt = new Date(Date.now() + this.domainProfileRetentionMs);
942
 
943
  await DomainProfile.save(profile);
944
 
945
- return true;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
946
  }
947
  }
 
84
  Reflect.set(snapshot, 'locale', options.locale);
85
  }
86
  await this.setToCache(options.url, snapshot);
87
+
88
+ await this.exploreDirectEngine(snapshot).catch(() => undefined);
89
  });
90
 
91
  puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
 
583
 
584
  if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.READER_LM)) {
585
  const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
586
+ ...crawlOpts,
587
+ engine: crawlOpts?.engine || ENGINE_TYPE.AUTO,
588
  }, crawlerOpts);
589
 
590
+ if (!finalAutoSnapshot?.html) {
591
+ throw new AssertionFailureError(`Unexpected non HTML content for ReaderLM: ${urlToCrawl}`);
592
+ }
593
+
594
  if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
595
  const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
596
  yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot);
 
635
  return;
636
  }
637
 
638
+ if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
639
+ yield this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
640
+ return;
 
 
 
 
 
 
 
 
 
641
  }
642
 
643
  let cache;
 
656
  return;
657
  }
658
 
659
+ if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
660
+ const { digest } = this.getDomainProfileUrlDigest(urlToCrawl);
661
+ const domainProfile = await DomainProfile.fromFirestore(digest);
662
+ if (domainProfile?.engine === ENGINE_TYPE.DIRECT) {
663
+ try {
664
+ const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
665
+
666
+ // Expect downstream code to "break" here if it's satisfied with the direct engine
667
+ yield snapshot;
668
+ if (crawlOpts?.engine === ENGINE_TYPE.AUTO) {
669
+ return;
670
+ }
671
+ } catch (err: any) {
672
+ this.logger.warn(`Failed to scrap ${urlToCrawl} with direct engine`, { err: marshalErrorLike(err) });
673
+ }
674
+ }
675
+ }
676
+
677
  try {
678
  if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe || crawlOpts?.withShadowDom) {
679
  for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
 
871
  }
872
 
873
  async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
874
+ const it = this.cachedScrap(url, opts, crawlerOptions);
875
 
876
  let lastSnapshot;
877
  let lastError;
 
928
  return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
929
  }
930
 
/**
 * Record, per origin + path scope, which engine a page needs, based on a
 * snapshot that has already been captured.
 *
 * - No stored profile: create one whose engine is BROWSER when the snapshot
 *   reports `htmlModifiedByJs`, DIRECT otherwise.
 * - Stored profile already BROWSER: leave it untouched.
 * - Otherwise: refresh the stored profile from this snapshot and extend its
 *   expiry.
 *
 * @param knownSnapshot snapshot whose `href` and `htmlModifiedByJs` drive the profile
 */
async exploreDirectEngine(knownSnapshot: PageSnapshot) {
    const snapshotUrl = new URL(knownSnapshot.href);
    const profileKey = this.getDomainProfileUrlDigest(snapshotUrl);
    const existing = await DomainProfile.fromFirestore(profileKey.digest);

    // Mixed engine, always use browser
    if (existing?.engine === ENGINE_TYPE.BROWSER) {
        return;
    }

    const observedEngine = knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT;
    const normalizedOrigin = snapshotUrl.origin.toLowerCase();

    if (!existing) {
        // First observation for this scope: persist a fresh profile.
        await DomainProfile.save(
            DomainProfile.from({
                _id: profileKey.digest,
                origin: normalizedOrigin,
                path: profileKey.path,
                triggerUrl: snapshotUrl.href,
                engine: observedEngine,
                createdAt: new Date(),
                expireAt: new Date(Date.now() + this.domainProfileRetentionMs),
            })
        );

        return;
    }

    // Known scope that is not pinned to BROWSER: refresh it in place.
    Object.assign(existing, {
        origin: normalizedOrigin,
        triggerUrl: snapshotUrl.href,
        path: profileKey.path,
        engine: observedEngine,
        expireAt: new Date(Date.now() + this.domainProfileRetentionMs),
    });

    await DomainProfile.save(existing);
}
966
+
/**
 * Compute the lookup key used for domain profiles.
 *
 * The scope is the URL's pathname with its last segment removed; when that
 * leaves nothing (e.g. `/` or `/page`), the full pathname is used instead.
 *
 * @param url URL to derive the key from
 * @returns `digest` — md5 hash of lower-cased origin + scope path; `path` — the scope path
 */
getDomainProfileUrlDigest(url: URL) {
    const { pathname } = url;

    // Everything before the last '/', or '' when the only slash is the leading one.
    const lastSlashAt = pathname.lastIndexOf('/');
    const parentDir = lastSlashAt > 0 ? pathname.substring(0, lastSlashAt) : '';
    const scopePath = parentDir !== '' ? parentDir : pathname;

    const hashInput = `${url.origin.toLocaleLowerCase()}${scopePath}`;
    const digest = md5Hasher.hash(hashInput);

    return { digest, path: scopePath };
}
981
  }
backend/functions/src/db/domain-profile.ts CHANGED
@@ -13,10 +13,7 @@ export class DomainProfile extends FirestoreRecord {
13
  @Prop({
14
  required: true
15
  })
16
- origin!: string;
17
-
18
- @Prop({ required: true })
19
- triggerReason!: string;
20
 
21
  @Prop()
22
  triggerUrl?: string;
 
13
  @Prop({
14
  required: true
15
  })
16
+ path!: string;
 
 
 
17
 
18
  @Prop()
19
  triggerUrl?: string;
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -439,7 +439,7 @@ export class CrawlerOptions extends AutoCastable {
439
  instance.engine = ENGINE_TYPE.BROWSER;
440
  instance.respondWith = CONTENT_FORMAT.VLM;
441
  } else if (instance.engine === ENGINE_TYPE.READER_LM) {
442
- instance.engine = undefined;
443
  instance.respondWith = CONTENT_FORMAT.READER_LM;
444
  }
445
 
@@ -496,10 +496,6 @@ export class CrawlerOptions extends AutoCastable {
496
  instance.cacheTolerance = instance.cacheTolerance * 1000;
497
  }
498
 
499
- if (instance.noCache || !instance.isTypicalRequest()) {
500
- instance.engine ??= ENGINE_TYPE.BROWSER + '?';
501
- }
502
-
503
  return instance;
504
  }
505
 
@@ -544,13 +540,19 @@ export class CrawlerOptions extends AutoCastable {
544
  return !CONTENT_FORMAT_VALUES.has(this.respondWith);
545
  }
546
 
547
- isTypicalRequest() {
548
  if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
549
  return false;
550
  }
551
  if (this.injectFrameScript?.length || this.injectPageScript?.length) {
552
  return false;
553
  }
 
 
 
 
 
 
554
  if (this.viewport) {
555
  return false;
556
  }
 
439
  instance.engine = ENGINE_TYPE.BROWSER;
440
  instance.respondWith = CONTENT_FORMAT.VLM;
441
  } else if (instance.engine === ENGINE_TYPE.READER_LM) {
442
+ instance.engine = ENGINE_TYPE.AUTO;
443
  instance.respondWith = CONTENT_FORMAT.READER_LM;
444
  }
445
 
 
496
  instance.cacheTolerance = instance.cacheTolerance * 1000;
497
  }
498
 
 
 
 
 
499
  return instance;
500
  }
501
 
 
540
  return !CONTENT_FORMAT_VALUES.has(this.respondWith);
541
  }
542
 
543
+ browserIsNotRequired() {
544
  if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
545
  return false;
546
  }
547
  if (this.injectFrameScript?.length || this.injectPageScript?.length) {
548
  return false;
549
  }
550
+ if (this.waitForSelector?.length) {
551
+ return false;
552
+ }
553
+ if (this.withIframe || this.withShadowDom) {
554
+ return false;
555
+ }
556
  if (this.viewport) {
557
  return false;
558
  }
backend/functions/src/services/curl.ts CHANGED
@@ -2,11 +2,14 @@ import { marshalErrorLike } from 'civkit/lang';
2
  import { AsyncService } from 'civkit/async-service';
3
  import { singleton } from 'tsyringe';
4
 
5
- import { Curl, HeaderInfo } from 'node-libcurl';
6
  import { PageSnapshot, ScrappingOptions } from './puppeteer';
7
  import { Logger } from '../shared/services/logger';
8
  import { JSDomControl } from './jsdom';
9
- import { AssertionFailureError } from 'civkit';
 
 
 
10
 
11
  @singleton()
12
  export class CurlControl extends AsyncService {
@@ -16,6 +19,7 @@ export class CurlControl extends AsyncService {
16
  constructor(
17
  protected globalLogger: Logger,
18
  protected jsdomControl: JSDomControl,
 
19
  ) {
20
  super(...arguments);
21
  }
@@ -26,25 +30,55 @@ export class CurlControl extends AsyncService {
26
  this.emit('ready');
27
  }
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions, throwOnNon200 = false): Promise<PageSnapshot> {
 
 
 
 
 
 
 
30
  const result = await new Promise<{
31
  statusCode: number,
32
- data: string,
33
  headers: Buffer | HeaderInfo[],
34
  }>((resolve, reject) => {
35
  const curl = new Curl();
 
36
  curl.setOpt('URL', urlToCrawl.toString());
37
  curl.setOpt(Curl.option.FOLLOWLOCATION, true);
38
 
39
- if (crawlOpts?.timeoutMs) {
40
- curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts.timeoutMs);
41
- }
42
  if (crawlOpts?.overrideUserAgent) {
43
  curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent);
44
  }
45
- if (crawlOpts?.extraHeaders) {
46
- curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`));
47
- }
 
 
48
  if (crawlOpts?.proxyUrl) {
49
  curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl);
50
  }
@@ -56,35 +90,82 @@ export class CurlControl extends AsyncService {
56
  curl.setOpt(Curl.option.REFERER, crawlOpts.referer);
57
  }
58
 
59
- curl.on('end', (statusCode, data, headers) => {
60
  this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl}`, { statusCode, headers });
61
- resolve({
62
- statusCode,
63
- data: data.toString(),
64
- headers,
65
- });
66
  curl.close();
67
  });
68
 
69
  curl.on('error', (err) => {
70
- this.logger.warn(`Failed to curl ${urlToCrawl}`, { err: marshalErrorLike(err) });
71
  curl.close();
 
72
  reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: ${err.message}`));
73
  });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  curl.perform();
76
  });
77
 
78
  if (throwOnNon200 && result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
79
- throw new AssertionFailureError(`Failed to directly access ${urlToCrawl}: HTTP ${result.statusCode}`);
80
  }
81
 
82
- const snapshot = {
83
- href: urlToCrawl.toString(),
84
- html: result.data,
85
- title: '',
86
- text: '',
87
- } as PageSnapshot;
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  const curlSnapshot = await this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
90
 
 
2
  import { AsyncService } from 'civkit/async-service';
3
  import { singleton } from 'tsyringe';
4
 
5
+ import { Curl, CurlFeature, HeaderInfo } from 'node-libcurl';
6
  import { PageSnapshot, ScrappingOptions } from './puppeteer';
7
  import { Logger } from '../shared/services/logger';
8
  import { JSDomControl } from './jsdom';
9
+ import { AssertionFailureError, FancyFile } from 'civkit';
10
+ import { TempFileManager } from '../shared';
11
+ import { readFile } from 'fs/promises';
12
+ import { pathToFileURL } from 'url';
13
 
14
  @singleton()
15
  export class CurlControl extends AsyncService {
 
19
  constructor(
20
  protected globalLogger: Logger,
21
  protected jsdomControl: JSDomControl,
22
+ protected tempFileManager: TempFileManager,
23
  ) {
24
  super(...arguments);
25
  }
 
30
  this.emit('ready');
31
  }
32
 
/**
 * Set a Chrome-like set of request headers on a curl handle.
 *
 * Caller-supplied headers take precedence over the built-in defaults. Names
 * are matched case-insensitively, so passing `user-agent` replaces the
 * default `User-Agent` instead of sending both. Entries whose value is
 * `undefined` or `null` are ignored.
 *
 * @param curl handle to configure
 * @param headers optional header name → value overrides
 * @param chromeVersion Chrome major version advertised in `sec-ch-ua` and `User-Agent`
 * @returns the same curl handle, for chaining
 */
curlImpersonateHeader(curl: Curl, headers?: object, chromeVersion: number = 132) {
    const mixinHeaders: Record<string, string> = {
        // Client hints: the name is `sec-ch-ua`, and each brand / the platform is a quoted string.
        'sec-ch-ua': `"Not A(Brand";v="8", "Chromium";v="${chromeVersion}", "Google Chrome";v="${chromeVersion}"`,
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${chromeVersion}.0.0.0 Safari/537.36`,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
    };

    // Keyed by lower-cased name; the stored tuple keeps the spelling that is sent.
    const merged = new Map<string, [string, unknown]>();
    for (const [name, value] of Object.entries(mixinHeaders)) {
        merged.set(name.toLowerCase(), [name, value]);
    }
    for (const [name, value] of Object.entries(headers ?? {})) {
        if (value === undefined || value === null) {
            continue;
        }
        merged.set(name.toLowerCase(), [name, value]);
    }

    curl.setOpt(
        Curl.option.HTTPHEADER,
        Array.from(merged.values(), ([name, value]) => `${name}: ${value}`)
    );

    return curl;
}
53
+
54
  async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions, throwOnNon200 = false): Promise<PageSnapshot> {
55
+ const snapshot = {
56
+ href: urlToCrawl.toString(),
57
+ html: '',
58
+ title: '',
59
+ text: '',
60
+ } as PageSnapshot;
61
+
62
  const result = await new Promise<{
63
  statusCode: number,
64
+ data?: FancyFile,
65
  headers: Buffer | HeaderInfo[],
66
  }>((resolve, reject) => {
67
  const curl = new Curl();
68
+ curl.enable(CurlFeature.StreamResponse);
69
  curl.setOpt('URL', urlToCrawl.toString());
70
  curl.setOpt(Curl.option.FOLLOWLOCATION, true);
71
 
72
+ curl.setOpt(Curl.option.TIMEOUT_MS, Math.min(10_000, crawlOpts?.timeoutMs || 10_000));
73
+
 
74
  if (crawlOpts?.overrideUserAgent) {
75
  curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent);
76
  }
77
+
78
+ this.curlImpersonateHeader(curl, crawlOpts?.extraHeaders);
79
+ // if (crawlOpts?.extraHeaders) {
80
+ // curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`));
81
+ // }
82
  if (crawlOpts?.proxyUrl) {
83
  curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl);
84
  }
 
90
  curl.setOpt(Curl.option.REFERER, crawlOpts.referer);
91
  }
92
 
93
+ curl.on('end', (statusCode, _data, headers) => {
94
  this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl}`, { statusCode, headers });
 
 
 
 
 
95
  curl.close();
96
  });
97
 
98
  curl.on('error', (err) => {
 
99
  curl.close();
100
+ this.logger.warn(`Curl ${urlToCrawl}: ${err} (Not necessarily an error)`, { err: marshalErrorLike(err) });
101
  reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: ${err.message}`));
102
  });
103
+ curl.setOpt(Curl.option.MAXFILESIZE, 1024 * 1024 * 1024); // 1GB
104
+ let status = -1;
105
+ let contentType = '';
106
+ curl.on('stream', (stream, statusCode, headers) => {
107
+ status = statusCode;
108
+ outerLoop:
109
+ for (const headerVec of headers) {
110
+ for (const [k, v] of Object.entries(headerVec)) {
111
+ if (k.toLowerCase() === 'content-type') {
112
+ contentType = v.toLowerCase();
113
+ break outerLoop;
114
+ }
115
+ }
116
+ }
117
+
118
+ if (!contentType) {
119
+ reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: no content-type`));
120
+ stream.destroy();
121
+ return;
122
+ }
123
+ if (contentType.startsWith('image/')) {
124
+ snapshot.html = `<html style="height: 100%;"><head><meta name="viewport" content="width=device-width, minimum-scale=0.1"><title>${urlToCrawl.origin}${urlToCrawl.pathname}</title></head><body style="margin: 0px; height: 100%; background-color: rgb(14, 14, 14);"><img style="display: block;-webkit-user-select: none;margin: auto;background-color: hsl(0, 0%, 90%);transition: background-color 300ms;" src="${urlToCrawl.href}"></body></html>`;
125
+ stream.destroy();
126
+ resolve({
127
+ statusCode: status,
128
+ headers,
129
+ });
130
+ return;
131
+ }
132
+
133
+ const fpath = this.tempFileManager.alloc();
134
+ const fancyFile = FancyFile.auto(stream, fpath);
135
+ this.tempFileManager.bindPathTo(fancyFile, fpath);
136
+ resolve({
137
+ statusCode: status,
138
+ data: fancyFile,
139
+ headers,
140
+ });
141
+ });
142
 
143
  curl.perform();
144
  });
145
 
146
  if (throwOnNon200 && result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
147
+ throw new AssertionFailureError(`Failed to access ${urlToCrawl}: HTTP ${result.statusCode}`);
148
  }
149
 
150
+ if (result.data) {
151
+ const mimeType: string = await result.data.mimeType;
152
+ if (mimeType.startsWith('text/html')) {
153
+ if ((await result.data.size) > 1024 * 1024 * 32) {
154
+ throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
155
+ }
156
+ snapshot.html = await readFile(await result.data.filePath, { encoding: 'utf-8' });
157
+ } else if (mimeType.startsWith('text/') || mimeType.startsWith('application/json')) {
158
+ if ((await result.data.size) > 1024 * 1024 * 32) {
159
+ throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
160
+ }
161
+ snapshot.text = await readFile(await result.data.filePath, { encoding: 'utf-8' });
162
+ snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
163
+ } else if (mimeType.startsWith('application/pdf')) {
164
+ snapshot.pdfs = [pathToFileURL(await result.data.filePath).href];
165
+ } else {
166
+ throw new AssertionFailureError(`Failed to access ${urlToCrawl}: unexpected type ${mimeType}`);
167
+ }
168
+ }
169
 
170
  const curlSnapshot = await this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
171
 
backend/functions/src/services/pdf-extract.ts CHANGED
@@ -266,12 +266,12 @@ export class PDFExtractor extends AsyncService {
266
  return { meta: meta.info as Record<string, any>, content: mdChunks.join(''), text: rawChunks.join('') };
267
  }
268
 
269
- async cachedExtract(url: string | URL, cacheTolerance: number = 1000 * 3600 * 24) {
270
  if (!url) {
271
  return undefined;
272
  }
273
-
274
- const digest = md5Hasher.hash(url.toString());
275
 
276
  const data = url;
277
  if (typeof url === 'string' && this.isDataUrl(url)) {
@@ -283,8 +283,8 @@ export class PDFExtractor extends AsyncService {
283
  if (cache) {
284
  const age = Date.now() - cache?.createdAt.valueOf();
285
  const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
286
- this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for PDF ${url}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
287
- url, digest, age, stale, cacheTolerance
288
  });
289
 
290
  if (!stale) {
@@ -306,7 +306,7 @@ export class PDFExtractor extends AsyncService {
306
  text: cached.text
307
  };
308
  } catch (err) {
309
- this.logger.warn(`Unable to load cached content for ${url}`, { err });
310
 
311
  return undefined;
312
  }
@@ -324,17 +324,17 @@ export class PDFExtractor extends AsyncService {
324
  PDFContent.save(
325
  PDFContent.from({
326
  _id: theID,
327
- src: url.toString(),
328
  meta: extracted?.meta || {},
329
  urlDigest: digest,
330
  createdAt: new Date(),
331
  expireAt: new Date(Date.now() + this.cacheRetentionMs)
332
  }).degradeForFireStore()
333
  ).catch((r) => {
334
- this.logger.warn(`Unable to cache PDF content for ${url}`, { err: r });
335
  });
336
  } catch (err) {
337
- this.logger.warn(`Unable to extract from pdf ${url}`, { err });
338
  }
339
 
340
  return extracted;
 
266
  return { meta: meta.info as Record<string, any>, content: mdChunks.join(''), text: rawChunks.join('') };
267
  }
268
 
269
+ async cachedExtract(url: string | URL, cacheTolerance: number = 1000 * 3600 * 24, alternativeUrl?: string) {
270
  if (!url) {
271
  return undefined;
272
  }
273
+ const nameUrl = alternativeUrl || url.toString();
274
+ const digest = md5Hasher.hash(nameUrl);
275
 
276
  const data = url;
277
  if (typeof url === 'string' && this.isDataUrl(url)) {
 
283
  if (cache) {
284
  const age = Date.now() - cache?.createdAt.valueOf();
285
  const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
286
+ this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for PDF ${nameUrl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
287
+ data: url, url: nameUrl, digest, age, stale, cacheTolerance
288
  });
289
 
290
  if (!stale) {
 
306
  text: cached.text
307
  };
308
  } catch (err) {
309
+ this.logger.warn(`Unable to load cached content for ${nameUrl}`, { err });
310
 
311
  return undefined;
312
  }
 
324
  PDFContent.save(
325
  PDFContent.from({
326
  _id: theID,
327
+ src: nameUrl,
328
  meta: extracted?.meta || {},
329
  urlDigest: digest,
330
  createdAt: new Date(),
331
  expireAt: new Date(Date.now() + this.cacheRetentionMs)
332
  }).degradeForFireStore()
333
  ).catch((r) => {
334
+ this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
335
  });
336
  } catch (err) {
337
+ this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err });
338
  }
339
 
340
  return extracted;
backend/functions/src/services/puppeteer.ts CHANGED
@@ -48,6 +48,7 @@ export interface PageSnapshot {
48
  href: string;
49
  rebase?: string;
50
  html: string;
 
51
  shadowExpanded?: string;
52
  text: string;
53
  status?: number;
@@ -369,7 +370,9 @@ function shadowDomPresent(rootElement = document.documentElement) {
369
  return false;
370
  }
371
 
 
372
  function giveSnapshot(stopActiveSnapshot) {
 
373
  if (stopActiveSnapshot) {
374
  window.haltSnapshot = true;
375
  }
@@ -385,6 +388,7 @@ function giveSnapshot(stopActiveSnapshot) {
385
  description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
386
  href: document.location.href,
387
  html: document.documentElement?.outerHTML,
 
388
  text: document.body?.innerText,
389
  shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
390
  parsed: parsed,
@@ -392,6 +396,9 @@ function giveSnapshot(stopActiveSnapshot) {
392
  maxElemDepth: domAnalysis.maxDepth,
393
  elemCount: domAnalysis.elementCount,
394
  };
 
 
 
395
  if (document.baseURI !== r.href) {
396
  r.rebase = document.baseURI;
397
  }
@@ -448,6 +455,7 @@ export class PuppeteerControl extends AsyncService {
448
  finalizerMap = new WeakMap<Page, ReturnType<typeof setTimeout>>();
449
  snMap = new WeakMap<Page, number>();
450
  livePages = new Set<Page>();
 
451
  lastPageCratedAt: number = 0;
452
 
453
  rpsCap: number = 500;
@@ -491,7 +499,8 @@ export class PuppeteerControl extends AsyncService {
491
  }
492
  }
493
  this.browser = await puppeteer.launch({
494
- timeout: 10_000
 
495
  }).catch((err: any) => {
496
  this.logger.error(`Unknown firebase issue, just die fast.`, { err });
497
  process.nextTick(() => {
@@ -611,7 +620,14 @@ export class PuppeteerControl extends AsyncService {
611
  const dt = Math.ceil((Date.now() - t0) / 1000);
612
  const rps = reqCounter / dt;
613
  // console.log(`rps: ${rps}`);
 
 
 
 
614
 
 
 
 
615
  if (reqCounter > 1000) {
616
  if (rps > 60 || reqCounter > 2000) {
617
  page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many requests` });
@@ -676,6 +692,7 @@ export class PuppeteerControl extends AsyncService {
676
  this.logger.info(`Page ${sn} created.`);
677
  this.lastPageCratedAt = Date.now();
678
  this.livePages.add(page);
 
679
 
680
  return page;
681
  }
@@ -717,7 +734,6 @@ export class PuppeteerControl extends AsyncService {
717
  }
718
  const sn = this.snMap.get(page);
719
  this.logger.info(`Closing page ${sn}`);
720
- this.livePages.delete(page);
721
  await Promise.race([
722
  (async () => {
723
  const ctx = page.browserContext();
@@ -731,6 +747,8 @@ export class PuppeteerControl extends AsyncService {
731
  ]).catch((err) => {
732
  this.logger.error(`Failed to destroy page ${sn}`, { err: marshalErrorLike(err) });
733
  });
 
 
734
  }
735
 
736
  async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
@@ -743,6 +761,7 @@ export class PuppeteerControl extends AsyncService {
743
  const pdfUrls: string[] = [];
744
  let navigationResponse: HTTPResponse | undefined;
745
  const page = await this.getNextPage();
 
746
  page.on('response', (resp) => {
747
  if (resp.request().isNavigationRequest()) {
748
  navigationResponse = resp;
@@ -802,8 +821,6 @@ export class PuppeteerControl extends AsyncService {
802
  }
803
  const sn = this.snMap.get(page);
804
  this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
805
-
806
- this.logger.info(`Locale setting: ${options?.locale}`);
807
  if (options?.locale) {
808
  // Add headers via request interception to walk around this bug
809
  // https://github.com/puppeteer/puppeteer/issues/10235
@@ -896,6 +913,10 @@ export class PuppeteerControl extends AsyncService {
896
  page.on('snapshot', hdl);
897
  page.once('abuse', (event: any) => {
898
  this.emit('abuse', { ...event, url: parsedUrl });
 
 
 
 
899
  nextSnapshotDeferred.reject(
900
  new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
901
  );
@@ -1071,6 +1092,7 @@ export class PuppeteerControl extends AsyncService {
1071
  }
1072
  }
1073
  } finally {
 
1074
  (waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
1075
  page.off('snapshot', hdl);
1076
  this.ditchPage(page);
 
48
  href: string;
49
  rebase?: string;
50
  html: string;
51
+ htmlModifiedByJs?: boolean;
52
  shadowExpanded?: string;
53
  text: string;
54
  status?: number;
 
370
  return false;
371
  }
372
 
373
+ let initialHTML;
374
  function giveSnapshot(stopActiveSnapshot) {
375
+ initialHTML ??= document.documentElement?.outerHTML;
376
  if (stopActiveSnapshot) {
377
  window.haltSnapshot = true;
378
  }
 
388
  description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
389
  href: document.location.href,
390
  html: document.documentElement?.outerHTML,
391
+ htmlModifiedByJs: false,
392
  text: document.body?.innerText,
393
  shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
394
  parsed: parsed,
 
396
  maxElemDepth: domAnalysis.maxDepth,
397
  elemCount: domAnalysis.elementCount,
398
  };
399
+ if (initialHTML) {
400
+ r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded;
401
+ }
402
  if (document.baseURI !== r.href) {
403
  r.rebase = document.baseURI;
404
  }
 
455
  finalizerMap = new WeakMap<Page, ReturnType<typeof setTimeout>>();
456
  snMap = new WeakMap<Page, number>();
457
  livePages = new Set<Page>();
458
+ pagePhase = new WeakMap<Page, 'idle' | 'active' | 'background'>();
459
  lastPageCratedAt: number = 0;
460
 
461
  rpsCap: number = 500;
 
499
  }
500
  }
501
  this.browser = await puppeteer.launch({
502
+ timeout: 10_000,
503
+ args: ['--disable-dev-shm-usage']
504
  }).catch((err: any) => {
505
  this.logger.error(`Unknown firebase issue, just die fast.`, { err });
506
  process.nextTick(() => {
 
620
  const dt = Math.ceil((Date.now() - t0) / 1000);
621
  const rps = reqCounter / dt;
622
  // console.log(`rps: ${rps}`);
623
+ const pagePhase = this.pagePhase.get(page);
624
+ if (pagePhase === 'background') {
625
+ if (rps > 10 || reqCounter > 1000) {
626
+ halt = true;
627
 
628
+ return req.abort('blockedbyclient', 1000);
629
+ }
630
+ }
631
  if (reqCounter > 1000) {
632
  if (rps > 60 || reqCounter > 2000) {
633
  page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many requests` });
 
692
  this.logger.info(`Page ${sn} created.`);
693
  this.lastPageCratedAt = Date.now();
694
  this.livePages.add(page);
695
+ this.pagePhase.set(page, 'idle');
696
 
697
  return page;
698
  }
 
734
  }
735
  const sn = this.snMap.get(page);
736
  this.logger.info(`Closing page ${sn}`);
 
737
  await Promise.race([
738
  (async () => {
739
  const ctx = page.browserContext();
 
747
  ]).catch((err) => {
748
  this.logger.error(`Failed to destroy page ${sn}`, { err: marshalErrorLike(err) });
749
  });
750
+ this.livePages.delete(page);
751
+ this.pagePhase.delete(page);
752
  }
753
 
754
  async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
 
761
  const pdfUrls: string[] = [];
762
  let navigationResponse: HTTPResponse | undefined;
763
  const page = await this.getNextPage();
764
+ this.pagePhase.set(page, 'active');
765
  page.on('response', (resp) => {
766
  if (resp.request().isNavigationRequest()) {
767
  navigationResponse = resp;
 
821
  }
822
  const sn = this.snMap.get(page);
823
  this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
 
 
824
  if (options?.locale) {
825
  // Add headers via request interception to walk around this bug
826
  // https://github.com/puppeteer/puppeteer/issues/10235
 
913
  page.on('snapshot', hdl);
914
  page.once('abuse', (event: any) => {
915
  this.emit('abuse', { ...event, url: parsedUrl });
916
+ if (snapshot?.href && parsedUrl.href !== snapshot.href) {
917
+ this.emit('abuse', { ...event, url: snapshot.href });
918
+ }
919
+
920
  nextSnapshotDeferred.reject(
921
  new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
922
  );
 
1092
  }
1093
  }
1094
  } finally {
1095
+ this.pagePhase.set(page, 'background');
1096
  (waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
1097
  page.off('snapshot', hdl);
1098
  this.ditchPage(page);
backend/functions/src/services/snapshot-formatter.ts CHANGED
@@ -152,7 +152,8 @@ export class SnapshotFormatter extends AsyncService {
152
  // in case of Google Web Cache content
153
  if (snapshot.pdfs?.length && (!snapshot.title || snapshot.title.startsWith('cache:'))) {
154
  const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0],
155
- this.threadLocal.get('cacheTolerance')
 
156
  );
157
  if (pdf) {
158
  pdfMode = true;
 
152
  // in case of Google Web Cache content
153
  if (snapshot.pdfs?.length && (!snapshot.title || snapshot.title.startsWith('cache:'))) {
154
  const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0],
155
+ this.threadLocal.get('cacheTolerance'),
156
+ snapshot.pdfs[0].startsWith('http') ? undefined : snapshot.href,
157
  );
158
  if (pdf) {
159
  pdfMode = true;
backend/functions/src/stand-alone/crawl.ts ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import 'reflect-metadata';
2
+ import { container, singleton } from 'tsyringe';
3
+ import { initializeApp, applicationDefault } from 'firebase-admin/app';
4
+
5
// Provide a default FIREBASE_CONFIG only when the environment has not supplied one,
// then bring up the default Firebase app before the rest of the modules are loaded.
if (process.env['FIREBASE_CONFIG'] == null) {
    // Resolved lazily so that an externally supplied config never triggers
    // the application-default credential lookup.
    const fallbackProject = process.env['GCLOUD_PROJECT'] || 'reader-6b7dc';
    const fallbackConfig = {
        projectId: fallbackProject,
        storageBucket: `${fallbackProject}.appspot.com`,
        // NOTE(review): this value goes through JSON.stringify; confirm that firebase-admin
        // actually consumes a `credential` key from FIREBASE_CONFIG.
        credential: applicationDefault(),
    };
    process.env['FIREBASE_CONFIG'] = JSON.stringify(fallbackConfig);
}

initializeApp();
12
+
13
+
14
+ import { Logger, CloudFunctionRegistry } from '../shared';
15
+ import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
16
+ import { ExpressServer } from 'civkit/civ-rpc/express';
17
+ import http2 from 'http2';
18
+ import { CrawlerHost } from '../cloud-functions/crawler';
19
+ import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
20
+ import path from 'path';
21
+ import fs from 'fs';
22
+ import { mimeOfExt } from 'civkit/mime';
23
+ import { NextFunction, Request, Response } from 'express';
24
+
25
// Rejections nobody awaited are reported but do not terminate the process.
process.on('unhandledRejection', (err) => {
    console.error('Unhandled rejection', err);
});

// An uncaught exception leaves the process in an unknown state: report it and quit.
process.on('uncaughtException', (err) => {
    // Both lines go to stderr so the stack trace and the shutdown notice stay together
    // in the same stream.
    console.error('Uncaught exception', err);

    // Make sure to quit the process instead of lingering in an undefined state.
    console.error('Uncaught exception, process quit.');
    // Exit on the next tick so the handler itself returns before termination.
    process.nextTick(() => process.exit(1));
});
37
+
38
/**
 * Stand-alone Express server that exposes the `crawl` RPC outside of the
 * cloud-functions runtime.
 *
 * Besides the crawl endpoint it serves:
 *  - static files found under the `public` directory (indexed once at init),
 *  - an OpenAPI description at `/openapi.json`,
 *  - the docs / health-check / request-log middlewares wired up in `featureSelect`.
 */
@singleton()
export class CrawlStandAloneServer extends ExpressServer {
    logger = this.globalLogger.child({ service: this.constructor.name });

    /** Previous `httpServer`, kept when `h2c()` swaps in an HTTP/2 server. */
    httpAlternativeServer?: typeof this['httpServer'];
    /** Static assets keyed by their path relative to the `public` directory. */
    assets = new Map<string, WalkOutEntity>();

    constructor(
        protected globalLogger: Logger,
        protected registry: CloudFunctionRegistry,
        protected crawlerHost: CrawlerHost,
    ) {
        super(...arguments);

        // Best-effort warm-up: a failure here must not prevent construction,
        // but it should leave a trace instead of vanishing silently.
        registry.allHandsOnDeck().catch((err: unknown) => {
            this.logger.warn(`Registry allHandsOnDeck failed`, { err });
        });
        registry.title = 'reader';
        registry.version = '0.1.0';
    }

    /**
     * Replace the primary server with a cleartext HTTP/2 server.
     * The previous server is kept as `httpAlternativeServer` and is bound to
     * `port + 1` by `listen()`.
     *
     * @returns this, for chaining.
     */
    h2c() {
        this.httpAlternativeServer = this.httpServer;
        this.httpServer = http2.createServer(this.expressApp);
        // useResourceBasedDefaultTracker();

        return this;
    }

    /** Index the static assets first, then run the inherited initialization. */
    override async init() {
        await this.walkForAssets();
        await super.init();
    }

    /** Walk the `public` directory and register every regular file in `assets`. */
    async walkForAssets() {
        const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public'));

        for (const file of files) {
            if (file.type !== 'file') {
                continue;
            }
            this.assets.set(file.relativePath.toString(), file);
        }
    }

    /**
     * Build the middleware that serves indexed static assets.
     * Requests that do not match a known asset are passed on via `next()`.
     */
    makeAssetsServingController() {
        return (req: Request, res: Response, next: NextFunction) => {
            // Use the path component only: a query string (e.g. cache busters)
            // must not prevent an asset from being matched.
            const file = req.path.slice(1);
            if (!file) {
                return next();
            }

            const asset = this.assets.get(file);
            if (asset?.type !== 'file') {
                return next();
            }
            res.type(mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream');
            res.set('Content-Length', asset.stats.size.toString());

            const stream = fs.createReadStream(asset.path);
            // pipe() does not forward source errors. Without this handler a file that
            // vanished or became unreadable after indexing would raise an uncaught
            // exception and take the whole process down.
            stream.once('error', (err) => {
                this.logger.warn(`Failed to read asset ${file}`, { err });
                if (res.headersSent) {
                    // Part of the body is already out; all we can do is abort the response.
                    res.destroy(err);

                    return;
                }
                // Nothing sent yet: drop the asset-specific headers and let
                // the error handling chain produce the response.
                res.removeHeader('Content-Length');
                res.removeHeader('Content-Type');
                next(err);
            });
            stream.pipe(res);

            return;
        };
    }

    /**
     * Start listening on `port`; when an alternative server exists (see `h2c()`),
     * it is additionally bound to `port + 1`.
     *
     * @param port Port for the primary server.
     * @returns Whatever the inherited `listen` returns.
     */
    override listen(port: number) {
        const r = super.listen(port);
        if (this.httpAlternativeServer) {
            const altPort = port + 1;
            this.httpAlternativeServer.listen(altPort, () => {
                this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
            });
        }

        return r;
    }

    /** Register the OpenAPI document route and the catch-all crawl route. */
    override registerRoutes(): void {

        const openAPIManager = new OpenAPIManager();
        openAPIManager.document('/{url}', ['get', 'post'], this.registry.conf.get('crawl')!);
        const openapiJsonPath = '/openapi.json';
        // Escape regex metacharacters so the `.` in the path only matches a literal dot.
        const openapiJsonSuffix = new RegExp(`${openapiJsonPath.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}$`, 'i');
        this.expressRootRouter.get(openapiJsonPath, (req, res) => {
            // The base URL advertised in the document is the request URL
            // without the `/openapi.json` suffix, trailing slashes and query.
            const baseURL = new URL(req.url, `${req.protocol}://${req.headers.host}`);
            baseURL.pathname = baseURL.pathname.replace(openapiJsonSuffix, '').replace(/\/+$/g, '');
            baseURL.search = '';
            const content = openAPIManager.createOpenAPIObject(baseURL.toString(), {
                info: {
                    title: this.registry.title,
                    description: `${this.registry.title} openAPI documentations`,
                    'x-logo': {
                        url: this.registry.logoUrl || `https://www.openapis.org/wp-content/uploads/sites/3/2018/02/OpenAPI_Logo_Pantone-1.png`
                    }
                }
            }, (this.registry.constructor as typeof AbstractRPCRegistry).envelope, req.query as any);
            res.statusCode = 200;
            // Declare the payload type explicitly; clients should not have to sniff JSON.
            res.type('application/json');
            res.end(JSON.stringify(content));
        });

        // Order matters: registry middlewares, then static assets, then the crawl handler.
        this.expressRootRouter.use('/', ...this.registry.expressMiddlewares, this.makeAssetsServingController(), this.registry.makeShimController('crawl'));
    }

    /** Select the inherited middlewares to install, then register this server's routes. */
    protected override featureSelect(): void {
        this.insertAsyncHookMiddleware();
        this.insertHealthCheckMiddleware(this.healthCheckEndpoint);
        this.insertLogRequestsMiddleware();
        this.registerOpenAPIDocsRoutes('/docs');

        this.registerRoutes();
    }
}
147
// Resolve the singleton server through the DI container.
const instance = container.resolve(CrawlStandAloneServer);

export default instance;

// Start listening once the service is ready. The port comes from the PORT
// environment variable (parsed as base 10), defaulting to 3000.
instance.serviceReady()
    .then((s) => s.listen(Number.parseInt(process.env.PORT || '', 10) || 3000))
    .catch((err) => {
        // A server that failed to start is useless: fail fast so the platform
        // can restart the container instead of waiting on a port that never opens.
        console.error('Failed to start stand-alone crawl server, process quit.', err);
        process.nextTick(() => process.exit(1));
    });
backend/functions/src/stand-alone/search.ts ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import 'reflect-metadata';
2
+ import { container, singleton } from 'tsyringe';
3
+ import { initializeApp, applicationDefault } from 'firebase-admin/app';
4
+
5
// Provide a default FIREBASE_CONFIG only when the environment has not supplied one,
// then bring up the default Firebase app before the rest of the modules are loaded.
if (process.env['FIREBASE_CONFIG'] == null) {
    // Resolved lazily so that an externally supplied config never triggers
    // the application-default credential lookup.
    const fallbackProject = process.env['GCLOUD_PROJECT'] || 'reader-6b7dc';
    const fallbackConfig = {
        projectId: fallbackProject,
        storageBucket: `${fallbackProject}.appspot.com`,
        // NOTE(review): this value goes through JSON.stringify; confirm that firebase-admin
        // actually consumes a `credential` key from FIREBASE_CONFIG.
        credential: applicationDefault(),
    };
    process.env['FIREBASE_CONFIG'] = JSON.stringify(fallbackConfig);
}

initializeApp();
12
+
13
+
14
+ import { Logger, CloudFunctionRegistry } from '../shared';
15
+ import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
16
+ import { ExpressServer } from 'civkit/civ-rpc/express';
17
+ import http2 from 'http2';
18
+ import { SearcherHost } from '../cloud-functions/searcher';
19
+ import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
20
+ import path from 'path';
21
+ import fs from 'fs';
22
+ import { mimeOfExt } from 'civkit/mime';
23
+ import { NextFunction, Request, Response } from 'express';
24
+
25
// Rejections nobody awaited are reported but do not terminate the process.
process.on('unhandledRejection', (err) => {
    console.error('Unhandled rejection', err);
});

// An uncaught exception leaves the process in an unknown state: report it and quit.
process.on('uncaughtException', (err) => {
    // Both lines go to stderr so the stack trace and the shutdown notice stay together
    // in the same stream.
    console.error('Uncaught exception', err);

    // Make sure to quit the process instead of lingering in an undefined state.
    console.error('Uncaught exception, process quit.');
    // Exit on the next tick so the handler itself returns before termination.
    process.nextTick(() => process.exit(1));
});
37
+
38
/**
 * Stand-alone Express server that exposes the `search` RPC outside of the
 * cloud-functions runtime.
 *
 * Besides the search endpoint it serves:
 *  - static files found under the `public` directory (indexed once at init),
 *  - an OpenAPI description at `/openapi.json`,
 *  - the docs / health-check / request-log middlewares wired up in `featureSelect`.
 */
@singleton()
export class SearchStandAloneServer extends ExpressServer {
    logger = this.globalLogger.child({ service: this.constructor.name });

    /** Previous `httpServer`, kept when `h2c()` swaps in an HTTP/2 server. */
    httpAlternativeServer?: typeof this['httpServer'];
    /** Static assets keyed by their path relative to the `public` directory. */
    assets = new Map<string, WalkOutEntity>();

    constructor(
        protected globalLogger: Logger,
        protected registry: CloudFunctionRegistry,
        protected searcherHost: SearcherHost,
    ) {
        super(...arguments);

        // Best-effort warm-up: a failure here must not prevent construction,
        // but it should leave a trace instead of vanishing silently.
        registry.allHandsOnDeck().catch((err: unknown) => {
            this.logger.warn(`Registry allHandsOnDeck failed`, { err });
        });
        registry.title = 'reader';
        registry.version = '0.1.0';
    }

    /**
     * Replace the primary server with a cleartext HTTP/2 server.
     * The previous server is kept as `httpAlternativeServer` and is bound to
     * `port + 1` by `listen()`.
     *
     * @returns this, for chaining.
     */
    h2c() {
        this.httpAlternativeServer = this.httpServer;
        this.httpServer = http2.createServer(this.expressApp);
        // useResourceBasedDefaultTracker();

        return this;
    }

    /** Index the static assets first, then run the inherited initialization. */
    override async init() {
        await this.walkForAssets();
        await super.init();
    }

    /** Walk the `public` directory and register every regular file in `assets`. */
    async walkForAssets() {
        const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public'));

        for (const file of files) {
            if (file.type !== 'file') {
                continue;
            }
            this.assets.set(file.relativePath.toString(), file);
        }
    }

    /**
     * Build the middleware that serves indexed static assets.
     * Requests that do not match a known asset are passed on via `next()`.
     */
    makeAssetsServingController() {
        return (req: Request, res: Response, next: NextFunction) => {
            // Use the path component only: a query string (e.g. cache busters)
            // must not prevent an asset from being matched.
            const file = req.path.slice(1);
            if (!file) {
                return next();
            }

            const asset = this.assets.get(file);
            if (asset?.type !== 'file') {
                return next();
            }
            res.type(mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream');
            res.set('Content-Length', asset.stats.size.toString());

            const stream = fs.createReadStream(asset.path);
            // pipe() does not forward source errors. Without this handler a file that
            // vanished or became unreadable after indexing would raise an uncaught
            // exception and take the whole process down.
            stream.once('error', (err) => {
                this.logger.warn(`Failed to read asset ${file}`, { err });
                if (res.headersSent) {
                    // Part of the body is already out; all we can do is abort the response.
                    res.destroy(err);

                    return;
                }
                // Nothing sent yet: drop the asset-specific headers and let
                // the error handling chain produce the response.
                res.removeHeader('Content-Length');
                res.removeHeader('Content-Type');
                next(err);
            });
            stream.pipe(res);

            return;
        };
    }

    /**
     * Start listening on `port`; when an alternative server exists (see `h2c()`),
     * it is additionally bound to `port + 1`.
     *
     * @param port Port for the primary server.
     * @returns Whatever the inherited `listen` returns.
     */
    override listen(port: number) {
        const r = super.listen(port);
        if (this.httpAlternativeServer) {
            const altPort = port + 1;
            this.httpAlternativeServer.listen(altPort, () => {
                this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
            });
        }

        return r;
    }

    /** Register the OpenAPI document route and the catch-all search route. */
    override registerRoutes(): void {

        const openAPIManager = new OpenAPIManager();
        openAPIManager.document('/{q}', ['get', 'post'], this.registry.conf.get('search')!);
        const openapiJsonPath = '/openapi.json';
        // Escape regex metacharacters so the `.` in the path only matches a literal dot.
        const openapiJsonSuffix = new RegExp(`${openapiJsonPath.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}$`, 'i');
        this.expressRootRouter.get(openapiJsonPath, (req, res) => {
            // The base URL advertised in the document is the request URL
            // without the `/openapi.json` suffix, trailing slashes and query.
            const baseURL = new URL(req.url, `${req.protocol}://${req.headers.host}`);
            baseURL.pathname = baseURL.pathname.replace(openapiJsonSuffix, '').replace(/\/+$/g, '');
            baseURL.search = '';
            const content = openAPIManager.createOpenAPIObject(baseURL.toString(), {
                info: {
                    title: this.registry.title,
                    description: `${this.registry.title} openAPI documentations`,
                    'x-logo': {
                        url: this.registry.logoUrl || `https://www.openapis.org/wp-content/uploads/sites/3/2018/02/OpenAPI_Logo_Pantone-1.png`
                    }
                }
            }, (this.registry.constructor as typeof AbstractRPCRegistry).envelope, req.query as any);
            res.statusCode = 200;
            // Declare the payload type explicitly; clients should not have to sniff JSON.
            res.type('application/json');
            res.end(JSON.stringify(content));
        });

        // Order matters: registry middlewares, then static assets, then the search handler.
        this.expressRootRouter.use('/', ...this.registry.expressMiddlewares, this.makeAssetsServingController(), this.registry.makeShimController('search'));
    }

    /** Select the inherited middlewares to install, then register this server's routes. */
    protected override featureSelect(): void {
        this.insertAsyncHookMiddleware();
        this.insertHealthCheckMiddleware(this.healthCheckEndpoint);
        this.insertLogRequestsMiddleware();
        this.registerOpenAPIDocsRoutes('/docs');

        this.registerRoutes();
    }
}
147
// Resolve the singleton server through the DI container.
const instance = container.resolve(SearchStandAloneServer);

export default instance;

// Start listening once the service is ready. The port comes from the PORT
// environment variable (parsed as base 10), defaulting to 3000.
instance.serviceReady()
    .then((s) => s.listen(Number.parseInt(process.env.PORT || '', 10) || 3000))
    .catch((err) => {
        // A server that failed to start is useless: fail fast so the platform
        // can restart the container instead of waiting on a port that never opens.
        console.error('Failed to start stand-alone search server, process quit.', err);
        process.nextTick(() => process.exit(1));
    });