Spaces:
Build error
Build error
fix: work around locale setting bug
Browse files
backend/functions/package-lock.json
CHANGED
|
@@ -15,7 +15,7 @@
|
|
| 15 |
"archiver": "^6.0.1",
|
| 16 |
"axios": "^1.3.3",
|
| 17 |
"bcrypt": "^5.1.0",
|
| 18 |
-
"civkit": "^0.8.
|
| 19 |
"core-js": "^3.37.1",
|
| 20 |
"cors": "^2.8.5",
|
| 21 |
"dayjs": "^1.11.9",
|
|
@@ -33,7 +33,7 @@
|
|
| 33 |
"puppeteer": "^23.3.0",
|
| 34 |
"puppeteer-extra": "^3.3.6",
|
| 35 |
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
| 36 |
-
"puppeteer-extra-plugin-page-proxy": "^
|
| 37 |
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
| 38 |
"puppeteer-page-proxy": "^1.3.0",
|
| 39 |
"robots-parser": "^3.0.1",
|
|
@@ -3729,9 +3729,9 @@
|
|
| 3729 |
}
|
| 3730 |
},
|
| 3731 |
"node_modules/civkit": {
|
| 3732 |
-
"version": "0.8.
|
| 3733 |
-
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.
|
| 3734 |
-
"integrity": "sha512-
|
| 3735 |
"license": "AGPL",
|
| 3736 |
"dependencies": {
|
| 3737 |
"lodash": "^4.17.21",
|
|
@@ -3754,6 +3754,7 @@
|
|
| 3754 |
"koa-compose": "^4.1.0",
|
| 3755 |
"libmagic-ffi": "^0.1.4",
|
| 3756 |
"mime": "^3.0.0",
|
|
|
|
| 3757 |
"minio": "^7.0.33",
|
| 3758 |
"node-object-hash": "^3.0.0",
|
| 3759 |
"node-schedule": "^2.1.1",
|
|
@@ -3770,6 +3771,32 @@
|
|
| 3770 |
"tsyringe": "^4"
|
| 3771 |
}
|
| 3772 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3773 |
"node_modules/cjs-module-lexer": {
|
| 3774 |
"version": "1.2.3",
|
| 3775 |
"resolved": "https://registry.npmjs.org/cjs-module-lexer/-/cjs-module-lexer-1.2.3.tgz",
|
|
@@ -9795,9 +9822,10 @@
|
|
| 9795 |
}
|
| 9796 |
},
|
| 9797 |
"node_modules/puppeteer-extra-plugin-page-proxy": {
|
| 9798 |
-
"version": "
|
| 9799 |
-
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-page-proxy/-/puppeteer-extra-plugin-page-proxy-
|
| 9800 |
-
"integrity": "sha512-
|
|
|
|
| 9801 |
"dependencies": {
|
| 9802 |
"debug": "^4.1.1",
|
| 9803 |
"got": "^11.8.5",
|
|
|
|
| 15 |
"archiver": "^6.0.1",
|
| 16 |
"axios": "^1.3.3",
|
| 17 |
"bcrypt": "^5.1.0",
|
| 18 |
+
"civkit": "^0.8.2-c9ca977",
|
| 19 |
"core-js": "^3.37.1",
|
| 20 |
"cors": "^2.8.5",
|
| 21 |
"dayjs": "^1.11.9",
|
|
|
|
| 33 |
"puppeteer": "^23.3.0",
|
| 34 |
"puppeteer-extra": "^3.3.6",
|
| 35 |
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
| 36 |
+
"puppeteer-extra-plugin-page-proxy": "^1.3.1",
|
| 37 |
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
| 38 |
"puppeteer-page-proxy": "^1.3.0",
|
| 39 |
"robots-parser": "^3.0.1",
|
|
|
|
| 3729 |
}
|
| 3730 |
},
|
| 3731 |
"node_modules/civkit": {
|
| 3732 |
+
"version": "0.8.2-c9ca977",
|
| 3733 |
+
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-c9ca977.tgz",
|
| 3734 |
+
"integrity": "sha512-wuJ6zs88qx5WiIxSBErH3f8wBArSkT2goHHvfm5ZLqL17v6rkS4iQWx2+YhJfhmfBzqU8oAZI1QD9v2LY1awBg==",
|
| 3735 |
"license": "AGPL",
|
| 3736 |
"dependencies": {
|
| 3737 |
"lodash": "^4.17.21",
|
|
|
|
| 3754 |
"koa-compose": "^4.1.0",
|
| 3755 |
"libmagic-ffi": "^0.1.4",
|
| 3756 |
"mime": "^3.0.0",
|
| 3757 |
+
"minimatch": "^10.0.1",
|
| 3758 |
"minio": "^7.0.33",
|
| 3759 |
"node-object-hash": "^3.0.0",
|
| 3760 |
"node-schedule": "^2.1.1",
|
|
|
|
| 3771 |
"tsyringe": "^4"
|
| 3772 |
}
|
| 3773 |
},
|
| 3774 |
+
"node_modules/civkit/node_modules/brace-expansion": {
|
| 3775 |
+
"version": "2.0.1",
|
| 3776 |
+
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
|
| 3777 |
+
"integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
|
| 3778 |
+
"license": "MIT",
|
| 3779 |
+
"optional": true,
|
| 3780 |
+
"dependencies": {
|
| 3781 |
+
"balanced-match": "^1.0.0"
|
| 3782 |
+
}
|
| 3783 |
+
},
|
| 3784 |
+
"node_modules/civkit/node_modules/minimatch": {
|
| 3785 |
+
"version": "10.0.1",
|
| 3786 |
+
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.0.1.tgz",
|
| 3787 |
+
"integrity": "sha512-ethXTt3SGGR+95gudmqJ1eNhRO7eGEGIgYA9vnPatK4/etz2MEVDno5GMCibdMTuBMyElzIlgxMna3K94XDIDQ==",
|
| 3788 |
+
"license": "ISC",
|
| 3789 |
+
"optional": true,
|
| 3790 |
+
"dependencies": {
|
| 3791 |
+
"brace-expansion": "^2.0.1"
|
| 3792 |
+
},
|
| 3793 |
+
"engines": {
|
| 3794 |
+
"node": "20 || >=22"
|
| 3795 |
+
},
|
| 3796 |
+
"funding": {
|
| 3797 |
+
"url": "https://github.com/sponsors/isaacs"
|
| 3798 |
+
}
|
| 3799 |
+
},
|
| 3800 |
"node_modules/cjs-module-lexer": {
|
| 3801 |
"version": "1.2.3",
|
| 3802 |
"resolved": "https://registry.npmjs.org/cjs-module-lexer/-/cjs-module-lexer-1.2.3.tgz",
|
|
|
|
| 9822 |
}
|
| 9823 |
},
|
| 9824 |
"node_modules/puppeteer-extra-plugin-page-proxy": {
|
| 9825 |
+
"version": "1.3.1",
|
| 9826 |
+
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-page-proxy/-/puppeteer-extra-plugin-page-proxy-1.3.1.tgz",
|
| 9827 |
+
"integrity": "sha512-5+mCJkJIsNHqryP8YhZMO+yYRSIfNMfyOuLFPlj4EtRGTNdly+jsOCwaizaMBuA/ItvXqWJ054KfqqOJKvuRMQ==",
|
| 9828 |
+
"license": "MIT",
|
| 9829 |
"dependencies": {
|
| 9830 |
"debug": "^4.1.1",
|
| 9831 |
"got": "^11.8.5",
|
backend/functions/package.json
CHANGED
|
@@ -35,7 +35,7 @@
|
|
| 35 |
"archiver": "^6.0.1",
|
| 36 |
"axios": "^1.3.3",
|
| 37 |
"bcrypt": "^5.1.0",
|
| 38 |
-
"civkit": "^0.8.
|
| 39 |
"core-js": "^3.37.1",
|
| 40 |
"cors": "^2.8.5",
|
| 41 |
"dayjs": "^1.11.9",
|
|
@@ -53,7 +53,7 @@
|
|
| 53 |
"puppeteer": "^23.3.0",
|
| 54 |
"puppeteer-extra": "^3.3.6",
|
| 55 |
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
| 56 |
-
"puppeteer-extra-plugin-page-proxy": "^
|
| 57 |
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
| 58 |
"puppeteer-page-proxy": "^1.3.0",
|
| 59 |
"robots-parser": "^3.0.1",
|
|
|
|
| 35 |
"archiver": "^6.0.1",
|
| 36 |
"axios": "^1.3.3",
|
| 37 |
"bcrypt": "^5.1.0",
|
| 38 |
+
"civkit": "^0.8.2-c9ca977",
|
| 39 |
"core-js": "^3.37.1",
|
| 40 |
"cors": "^2.8.5",
|
| 41 |
"dayjs": "^1.11.9",
|
|
|
|
| 53 |
"puppeteer": "^23.3.0",
|
| 54 |
"puppeteer-extra": "^3.3.6",
|
| 55 |
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
| 56 |
+
"puppeteer-extra-plugin-page-proxy": "^1.3.1",
|
| 57 |
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
| 58 |
"puppeteer-page-proxy": "^1.3.0",
|
| 59 |
"robots-parser": "^3.0.1",
|
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -692,6 +692,11 @@ export class CrawlerHost extends RPCHost {
|
|
| 692 |
referer: opts.referer,
|
| 693 |
};
|
| 694 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 695 |
return crawlOpts;
|
| 696 |
}
|
| 697 |
|
|
|
|
| 692 |
referer: opts.referer,
|
| 693 |
};
|
| 694 |
|
| 695 |
+
if (opts.locale) {
|
| 696 |
+
crawlOpts.extraHeaders ??= {};
|
| 697 |
+
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
|
| 698 |
+
}
|
| 699 |
+
|
| 700 |
return crawlOpts;
|
| 701 |
}
|
| 702 |
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -46,7 +46,7 @@ export interface PageSnapshot {
|
|
| 46 |
href: string;
|
| 47 |
rebase?: string;
|
| 48 |
html: string;
|
| 49 |
-
shadowExpanded?: string
|
| 50 |
text: string;
|
| 51 |
status?: number;
|
| 52 |
statusText?: string;
|
|
@@ -75,6 +75,7 @@ export interface ScrappingOptions {
|
|
| 75 |
timeoutMs?: number;
|
| 76 |
locale?: string;
|
| 77 |
referer?: string;
|
|
|
|
| 78 |
}
|
| 79 |
|
| 80 |
|
|
@@ -581,14 +582,34 @@ if (window.self === window.top) {
|
|
| 581 |
pdfUrls.push(url);
|
| 582 |
}
|
| 583 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 584 |
const sn = this.snMap.get(page);
|
| 585 |
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
| 586 |
|
| 587 |
this.logger.info(`Locale setting: ${options?.locale}`);
|
| 588 |
if (options?.locale) {
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
|
|
|
|
|
|
| 592 |
|
| 593 |
await page.evaluateOnNewDocument(() => {
|
| 594 |
Object.defineProperty(navigator, "language", {
|
|
@@ -605,7 +626,10 @@ if (window.self === window.top) {
|
|
| 605 |
}
|
| 606 |
|
| 607 |
if (options?.proxyUrl) {
|
| 608 |
-
await page.useProxy(options.proxyUrl
|
|
|
|
|
|
|
|
|
|
| 609 |
}
|
| 610 |
if (options?.cookies) {
|
| 611 |
const mapped = options.cookies.map((x) => {
|
|
|
|
| 46 |
href: string;
|
| 47 |
rebase?: string;
|
| 48 |
html: string;
|
| 49 |
+
shadowExpanded?: string;
|
| 50 |
text: string;
|
| 51 |
status?: number;
|
| 52 |
statusText?: string;
|
|
|
|
| 75 |
timeoutMs?: number;
|
| 76 |
locale?: string;
|
| 77 |
referer?: string;
|
| 78 |
+
extraHeaders?: Record<string, string>;
|
| 79 |
}
|
| 80 |
|
| 81 |
|
|
|
|
| 582 |
pdfUrls.push(url);
|
| 583 |
}
|
| 584 |
});
|
| 585 |
+
if (options?.extraHeaders) {
|
| 586 |
+
page.on('request', async (req) => {
|
| 587 |
+
if (req.isInterceptResolutionHandled()) {
|
| 588 |
+
return;
|
| 589 |
+
};
|
| 590 |
+
|
| 591 |
+
const overrides = req.continueRequestOverrides();
|
| 592 |
+
const continueArgs = [{
|
| 593 |
+
...overrides,
|
| 594 |
+
headers: {
|
| 595 |
+
...overrides?.headers,
|
| 596 |
+
...options.extraHeaders,
|
| 597 |
+
}
|
| 598 |
+
}, 1] as const;
|
| 599 |
+
|
| 600 |
+
return req.continue(continueArgs[0], continueArgs[1]);
|
| 601 |
+
});
|
| 602 |
+
}
|
| 603 |
const sn = this.snMap.get(page);
|
| 604 |
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
| 605 |
|
| 606 |
this.logger.info(`Locale setting: ${options?.locale}`);
|
| 607 |
if (options?.locale) {
|
| 608 |
+
// Add headers via request interception to work around this bug
|
| 609 |
+
// https://github.com/puppeteer/puppeteer/issues/10235
|
| 610 |
+
// await page.setExtraHTTPHeaders({
|
| 611 |
+
// 'Accept-Language': options?.locale
|
| 612 |
+
// });
|
| 613 |
|
| 614 |
await page.evaluateOnNewDocument(() => {
|
| 615 |
Object.defineProperty(navigator, "language", {
|
|
|
|
| 626 |
}
|
| 627 |
|
| 628 |
if (options?.proxyUrl) {
|
| 629 |
+
await page.useProxy(options.proxyUrl, {
|
| 630 |
+
headers: options.extraHeaders,
|
| 631 |
+
interceptResolutionPriority: 2,
|
| 632 |
+
});
|
| 633 |
}
|
| 634 |
if (options?.cookies) {
|
| 635 |
const mapped = options.cookies.map((x) => {
|