nomagick committed on
Commit
ebc0900
·
unverified ·
1 Parent(s): 9242bb3

fix: work around locale setting bug

Browse files
backend/functions/package-lock.json CHANGED
@@ -15,7 +15,7 @@
15
  "archiver": "^6.0.1",
16
  "axios": "^1.3.3",
17
  "bcrypt": "^5.1.0",
18
- "civkit": "^0.8.1-bb8d850",
19
  "core-js": "^3.37.1",
20
  "cors": "^2.8.5",
21
  "dayjs": "^1.11.9",
@@ -33,7 +33,7 @@
33
  "puppeteer": "^23.3.0",
34
  "puppeteer-extra": "^3.3.6",
35
  "puppeteer-extra-plugin-block-resources": "^2.4.3",
36
- "puppeteer-extra-plugin-page-proxy": "^2.0.0",
37
  "puppeteer-extra-plugin-stealth": "^2.11.2",
38
  "puppeteer-page-proxy": "^1.3.0",
39
  "robots-parser": "^3.0.1",
@@ -3729,9 +3729,9 @@
3729
  }
3730
  },
3731
  "node_modules/civkit": {
3732
- "version": "0.8.1-bb8d850",
3733
- "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.1-bb8d850.tgz",
3734
- "integrity": "sha512-b80LGS/jnpPgEHlbPk9QyDqHAxmA/VH7pyc428HYXtcXQOiqhkMaFanOU78sCctxjpzYqnzOkNDwcPl1PdzgFw==",
3735
  "license": "AGPL",
3736
  "dependencies": {
3737
  "lodash": "^4.17.21",
@@ -3754,6 +3754,7 @@
3754
  "koa-compose": "^4.1.0",
3755
  "libmagic-ffi": "^0.1.4",
3756
  "mime": "^3.0.0",
 
3757
  "minio": "^7.0.33",
3758
  "node-object-hash": "^3.0.0",
3759
  "node-schedule": "^2.1.1",
@@ -3770,6 +3771,32 @@
3770
  "tsyringe": "^4"
3771
  }
3772
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3773
  "node_modules/cjs-module-lexer": {
3774
  "version": "1.2.3",
3775
  "resolved": "https://registry.npmjs.org/cjs-module-lexer/-/cjs-module-lexer-1.2.3.tgz",
@@ -9795,9 +9822,10 @@
9795
  }
9796
  },
9797
  "node_modules/puppeteer-extra-plugin-page-proxy": {
9798
- "version": "2.0.0",
9799
- "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-page-proxy/-/puppeteer-extra-plugin-page-proxy-2.0.0.tgz",
9800
- "integrity": "sha512-G8pvHdHK1dO1dgFvYL+dJIlykUKjLkGUvPjzHE3R/eurqAkD4VZ9lWOU/CxYiKPhK2JxlG9QmjGjhxR6IOuP7w==",
 
9801
  "dependencies": {
9802
  "debug": "^4.1.1",
9803
  "got": "^11.8.5",
 
15
  "archiver": "^6.0.1",
16
  "axios": "^1.3.3",
17
  "bcrypt": "^5.1.0",
18
+ "civkit": "^0.8.2-c9ca977",
19
  "core-js": "^3.37.1",
20
  "cors": "^2.8.5",
21
  "dayjs": "^1.11.9",
 
33
  "puppeteer": "^23.3.0",
34
  "puppeteer-extra": "^3.3.6",
35
  "puppeteer-extra-plugin-block-resources": "^2.4.3",
36
+ "puppeteer-extra-plugin-page-proxy": "^1.3.1",
37
  "puppeteer-extra-plugin-stealth": "^2.11.2",
38
  "puppeteer-page-proxy": "^1.3.0",
39
  "robots-parser": "^3.0.1",
 
3729
  }
3730
  },
3731
  "node_modules/civkit": {
3732
+ "version": "0.8.2-c9ca977",
3733
+ "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-c9ca977.tgz",
3734
+ "integrity": "sha512-wuJ6zs88qx5WiIxSBErH3f8wBArSkT2goHHvfm5ZLqL17v6rkS4iQWx2+YhJfhmfBzqU8oAZI1QD9v2LY1awBg==",
3735
  "license": "AGPL",
3736
  "dependencies": {
3737
  "lodash": "^4.17.21",
 
3754
  "koa-compose": "^4.1.0",
3755
  "libmagic-ffi": "^0.1.4",
3756
  "mime": "^3.0.0",
3757
+ "minimatch": "^10.0.1",
3758
  "minio": "^7.0.33",
3759
  "node-object-hash": "^3.0.0",
3760
  "node-schedule": "^2.1.1",
 
3771
  "tsyringe": "^4"
3772
  }
3773
  },
3774
+ "node_modules/civkit/node_modules/brace-expansion": {
3775
+ "version": "2.0.1",
3776
+ "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
3777
+ "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
3778
+ "license": "MIT",
3779
+ "optional": true,
3780
+ "dependencies": {
3781
+ "balanced-match": "^1.0.0"
3782
+ }
3783
+ },
3784
+ "node_modules/civkit/node_modules/minimatch": {
3785
+ "version": "10.0.1",
3786
+ "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.0.1.tgz",
3787
+ "integrity": "sha512-ethXTt3SGGR+95gudmqJ1eNhRO7eGEGIgYA9vnPatK4/etz2MEVDno5GMCibdMTuBMyElzIlgxMna3K94XDIDQ==",
3788
+ "license": "ISC",
3789
+ "optional": true,
3790
+ "dependencies": {
3791
+ "brace-expansion": "^2.0.1"
3792
+ },
3793
+ "engines": {
3794
+ "node": "20 || >=22"
3795
+ },
3796
+ "funding": {
3797
+ "url": "https://github.com/sponsors/isaacs"
3798
+ }
3799
+ },
3800
  "node_modules/cjs-module-lexer": {
3801
  "version": "1.2.3",
3802
  "resolved": "https://registry.npmjs.org/cjs-module-lexer/-/cjs-module-lexer-1.2.3.tgz",
 
9822
  }
9823
  },
9824
  "node_modules/puppeteer-extra-plugin-page-proxy": {
9825
+ "version": "1.3.1",
9826
+ "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-page-proxy/-/puppeteer-extra-plugin-page-proxy-1.3.1.tgz",
9827
+ "integrity": "sha512-5+mCJkJIsNHqryP8YhZMO+yYRSIfNMfyOuLFPlj4EtRGTNdly+jsOCwaizaMBuA/ItvXqWJ054KfqqOJKvuRMQ==",
9828
+ "license": "MIT",
9829
  "dependencies": {
9830
  "debug": "^4.1.1",
9831
  "got": "^11.8.5",
backend/functions/package.json CHANGED
@@ -35,7 +35,7 @@
35
  "archiver": "^6.0.1",
36
  "axios": "^1.3.3",
37
  "bcrypt": "^5.1.0",
38
- "civkit": "^0.8.1-bb8d850",
39
  "core-js": "^3.37.1",
40
  "cors": "^2.8.5",
41
  "dayjs": "^1.11.9",
@@ -53,7 +53,7 @@
53
  "puppeteer": "^23.3.0",
54
  "puppeteer-extra": "^3.3.6",
55
  "puppeteer-extra-plugin-block-resources": "^2.4.3",
56
- "puppeteer-extra-plugin-page-proxy": "^2.0.0",
57
  "puppeteer-extra-plugin-stealth": "^2.11.2",
58
  "puppeteer-page-proxy": "^1.3.0",
59
  "robots-parser": "^3.0.1",
 
35
  "archiver": "^6.0.1",
36
  "axios": "^1.3.3",
37
  "bcrypt": "^5.1.0",
38
+ "civkit": "^0.8.2-c9ca977",
39
  "core-js": "^3.37.1",
40
  "cors": "^2.8.5",
41
  "dayjs": "^1.11.9",
 
53
  "puppeteer": "^23.3.0",
54
  "puppeteer-extra": "^3.3.6",
55
  "puppeteer-extra-plugin-block-resources": "^2.4.3",
56
+ "puppeteer-extra-plugin-page-proxy": "^1.3.1",
57
  "puppeteer-extra-plugin-stealth": "^2.11.2",
58
  "puppeteer-page-proxy": "^1.3.0",
59
  "robots-parser": "^3.0.1",
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -692,6 +692,11 @@ export class CrawlerHost extends RPCHost {
692
  referer: opts.referer,
693
  };
694
 
 
 
 
 
 
695
  return crawlOpts;
696
  }
697
 
 
692
  referer: opts.referer,
693
  };
694
 
695
+ if (opts.locale) {
696
+ crawlOpts.extraHeaders ??= {};
697
+ crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
698
+ }
699
+
700
  return crawlOpts;
701
  }
702
 
backend/functions/src/services/puppeteer.ts CHANGED
@@ -46,7 +46,7 @@ export interface PageSnapshot {
46
  href: string;
47
  rebase?: string;
48
  html: string;
49
- shadowExpanded?: string
50
  text: string;
51
  status?: number;
52
  statusText?: string;
@@ -75,6 +75,7 @@ export interface ScrappingOptions {
75
  timeoutMs?: number;
76
  locale?: string;
77
  referer?: string;
 
78
  }
79
 
80
 
@@ -581,14 +582,34 @@ if (window.self === window.top) {
581
  pdfUrls.push(url);
582
  }
583
  });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
584
  const sn = this.snMap.get(page);
585
  this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
586
 
587
  this.logger.info(`Locale setting: ${options?.locale}`);
588
  if (options?.locale) {
589
- await page.setExtraHTTPHeaders({
590
- 'Accept-Language': options?.locale
591
- });
 
 
592
 
593
  await page.evaluateOnNewDocument(() => {
594
  Object.defineProperty(navigator, "language", {
@@ -605,7 +626,10 @@ if (window.self === window.top) {
605
  }
606
 
607
  if (options?.proxyUrl) {
608
- await page.useProxy(options.proxyUrl);
 
 
 
609
  }
610
  if (options?.cookies) {
611
  const mapped = options.cookies.map((x) => {
 
46
  href: string;
47
  rebase?: string;
48
  html: string;
49
+ shadowExpanded?: string;
50
  text: string;
51
  status?: number;
52
  statusText?: string;
 
75
  timeoutMs?: number;
76
  locale?: string;
77
  referer?: string;
78
+ extraHeaders?: Record<string, string>;
79
  }
80
 
81
 
 
582
  pdfUrls.push(url);
583
  }
584
  });
585
+ if (options?.extraHeaders) {
586
+ page.on('request', async (req) => {
587
+ if (req.isInterceptResolutionHandled()) {
588
+ return;
589
+ };
590
+
591
+ const overrides = req.continueRequestOverrides();
592
+ const continueArgs = [{
593
+ ...overrides,
594
+ headers: {
595
+ ...overrides?.headers,
596
+ ...options.extraHeaders,
597
+ }
598
+ }, 1] as const;
599
+
600
+ return req.continue(continueArgs[0], continueArgs[1]);
601
+ });
602
+ }
603
  const sn = this.snMap.get(page);
604
  this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
605
 
606
  this.logger.info(`Locale setting: ${options?.locale}`);
607
  if (options?.locale) {
608
+ // Add headers via request interception to work around this bug
609
+ // https://github.com/puppeteer/puppeteer/issues/10235
610
+ // await page.setExtraHTTPHeaders({
611
+ // 'Accept-Language': options?.locale
612
+ // });
613
 
614
  await page.evaluateOnNewDocument(() => {
615
  Object.defineProperty(navigator, "language", {
 
626
  }
627
 
628
  if (options?.proxyUrl) {
629
+ await page.useProxy(options.proxyUrl, {
630
+ headers: options.extraHeaders,
631
+ interceptResolutionPriority: 2,
632
+ });
633
  }
634
  if (options?.cookies) {
635
  const mapped = options.cookies.map((x) => {