Spaces:
Build error
Build error
feat: expand shadow dom
Browse files
- backend/functions/package-lock.json +17 -25
- backend/functions/package.json +2 -2
- backend/functions/src/cloud-functions/crawler.ts +3 -1
- backend/functions/src/dto/scrapping-options.ts +22 -0
- backend/functions/src/index.ts +1 -1
- backend/functions/src/services/jsdom.ts +26 -16
- backend/functions/src/services/puppeteer.ts +75 -0
- backend/functions/src/services/snapshot-formatter.ts +2 -2
- thinapps-shared +1 -1
backend/functions/package-lock.json
CHANGED
|
@@ -15,13 +15,13 @@
|
|
| 15 |
"archiver": "^6.0.1",
|
| 16 |
"axios": "^1.3.3",
|
| 17 |
"bcrypt": "^5.1.0",
|
| 18 |
-
"civkit": "^0.8.
|
| 19 |
"core-js": "^3.37.1",
|
| 20 |
"cors": "^2.8.5",
|
| 21 |
"dayjs": "^1.11.9",
|
| 22 |
"express": "^4.19.2",
|
| 23 |
"firebase-admin": "^12.1.0",
|
| 24 |
-
"firebase-functions": "^6.
|
| 25 |
"htmlparser2": "^9.0.0",
|
| 26 |
"jose": "^5.1.0",
|
| 27 |
"langdetect": "^0.2.1",
|
|
@@ -2176,12 +2176,14 @@
|
|
| 2176 |
}
|
| 2177 |
},
|
| 2178 |
"node_modules/@types/express": {
|
| 2179 |
-
"version": "4.17.
|
| 2180 |
-
"resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.
|
| 2181 |
-
"integrity": "sha512-
|
|
|
|
| 2182 |
"dependencies": {
|
| 2183 |
"@types/body-parser": "*",
|
| 2184 |
-
"@types/express-serve-static-core": "
|
|
|
|
| 2185 |
"@types/serve-static": "*"
|
| 2186 |
}
|
| 2187 |
},
|
|
@@ -3727,9 +3729,10 @@
|
|
| 3727 |
}
|
| 3728 |
},
|
| 3729 |
"node_modules/civkit": {
|
| 3730 |
-
"version": "0.8.
|
| 3731 |
-
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.
|
| 3732 |
-
"integrity": "sha512-
|
|
|
|
| 3733 |
"dependencies": {
|
| 3734 |
"lodash": "^4.17.21",
|
| 3735 |
"tslib": "^2.5.0"
|
|
@@ -5510,15 +5513,15 @@
|
|
| 5510 |
}
|
| 5511 |
},
|
| 5512 |
"node_modules/firebase-functions": {
|
| 5513 |
-
"version": "6.
|
| 5514 |
-
"resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.
|
| 5515 |
-
"integrity": "sha512-
|
| 5516 |
"license": "MIT",
|
| 5517 |
"dependencies": {
|
| 5518 |
"@types/cors": "^2.8.5",
|
| 5519 |
-
"@types/express": "4.17.
|
| 5520 |
"cors": "^2.8.5",
|
| 5521 |
-
"express": "^4.
|
| 5522 |
"protobufjs": "^7.2.2"
|
| 5523 |
},
|
| 5524 |
"bin": {
|
|
@@ -7848,17 +7851,6 @@
|
|
| 7848 |
"node": ">=14"
|
| 7849 |
}
|
| 7850 |
},
|
| 7851 |
-
"node_modules/jwks-rsa/node_modules/@types/express": {
|
| 7852 |
-
"version": "4.17.21",
|
| 7853 |
-
"resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.21.tgz",
|
| 7854 |
-
"integrity": "sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==",
|
| 7855 |
-
"dependencies": {
|
| 7856 |
-
"@types/body-parser": "*",
|
| 7857 |
-
"@types/express-serve-static-core": "^4.17.33",
|
| 7858 |
-
"@types/qs": "*",
|
| 7859 |
-
"@types/serve-static": "*"
|
| 7860 |
-
}
|
| 7861 |
-
},
|
| 7862 |
"node_modules/jwks-rsa/node_modules/jose": {
|
| 7863 |
"version": "4.15.5",
|
| 7864 |
"resolved": "https://registry.npmjs.org/jose/-/jose-4.15.5.tgz",
|
|
|
|
| 15 |
"archiver": "^6.0.1",
|
| 16 |
"axios": "^1.3.3",
|
| 17 |
"bcrypt": "^5.1.0",
|
| 18 |
+
"civkit": "^0.8.1-1f42c5a",
|
| 19 |
"core-js": "^3.37.1",
|
| 20 |
"cors": "^2.8.5",
|
| 21 |
"dayjs": "^1.11.9",
|
| 22 |
"express": "^4.19.2",
|
| 23 |
"firebase-admin": "^12.1.0",
|
| 24 |
+
"firebase-functions": "^6.1.0",
|
| 25 |
"htmlparser2": "^9.0.0",
|
| 26 |
"jose": "^5.1.0",
|
| 27 |
"langdetect": "^0.2.1",
|
|
|
|
| 2176 |
}
|
| 2177 |
},
|
| 2178 |
"node_modules/@types/express": {
|
| 2179 |
+
"version": "4.17.21",
|
| 2180 |
+
"resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.21.tgz",
|
| 2181 |
+
"integrity": "sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==",
|
| 2182 |
+
"license": "MIT",
|
| 2183 |
"dependencies": {
|
| 2184 |
"@types/body-parser": "*",
|
| 2185 |
+
"@types/express-serve-static-core": "^4.17.33",
|
| 2186 |
+
"@types/qs": "*",
|
| 2187 |
"@types/serve-static": "*"
|
| 2188 |
}
|
| 2189 |
},
|
|
|
|
| 3729 |
}
|
| 3730 |
},
|
| 3731 |
"node_modules/civkit": {
|
| 3732 |
+
"version": "0.8.1-1f42c5a",
|
| 3733 |
+
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.1-1f42c5a.tgz",
|
| 3734 |
+
"integrity": "sha512-+cXywfdiu9+QbnNmJXKCjiAdEUdGRiiZ8zg/YKRqsr4vaX6lFNEI3P0J1FOj1x3vRL9cESGucXN6rh0AfmHHTQ==",
|
| 3735 |
+
"license": "AGPL",
|
| 3736 |
"dependencies": {
|
| 3737 |
"lodash": "^4.17.21",
|
| 3738 |
"tslib": "^2.5.0"
|
|
|
|
| 5513 |
}
|
| 5514 |
},
|
| 5515 |
"node_modules/firebase-functions": {
|
| 5516 |
+
"version": "6.1.0",
|
| 5517 |
+
"resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.1.0.tgz",
|
| 5518 |
+
"integrity": "sha512-7Gq7XpIA2qo9wKhYA9Ksb0v2bHfXD70zQwBJO6//Q624A7D9KAb449K6DM0swrCoPO7NGExbPf2eC7j7e+4+xA==",
|
| 5519 |
"license": "MIT",
|
| 5520 |
"dependencies": {
|
| 5521 |
"@types/cors": "^2.8.5",
|
| 5522 |
+
"@types/express": "^4.17.21",
|
| 5523 |
"cors": "^2.8.5",
|
| 5524 |
+
"express": "^4.21.0",
|
| 5525 |
"protobufjs": "^7.2.2"
|
| 5526 |
},
|
| 5527 |
"bin": {
|
|
|
|
| 7851 |
"node": ">=14"
|
| 7852 |
}
|
| 7853 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7854 |
"node_modules/jwks-rsa/node_modules/jose": {
|
| 7855 |
"version": "4.15.5",
|
| 7856 |
"resolved": "https://registry.npmjs.org/jose/-/jose-4.15.5.tgz",
|
backend/functions/package.json
CHANGED
|
@@ -35,13 +35,13 @@
|
|
| 35 |
"archiver": "^6.0.1",
|
| 36 |
"axios": "^1.3.3",
|
| 37 |
"bcrypt": "^5.1.0",
|
| 38 |
-
"civkit": "^0.8.
|
| 39 |
"core-js": "^3.37.1",
|
| 40 |
"cors": "^2.8.5",
|
| 41 |
"dayjs": "^1.11.9",
|
| 42 |
"express": "^4.19.2",
|
| 43 |
"firebase-admin": "^12.1.0",
|
| 44 |
-
"firebase-functions": "^6.
|
| 45 |
"htmlparser2": "^9.0.0",
|
| 46 |
"jose": "^5.1.0",
|
| 47 |
"langdetect": "^0.2.1",
|
|
|
|
| 35 |
"archiver": "^6.0.1",
|
| 36 |
"axios": "^1.3.3",
|
| 37 |
"bcrypt": "^5.1.0",
|
| 38 |
+
"civkit": "^0.8.1-1f42c5a",
|
| 39 |
"core-js": "^3.37.1",
|
| 40 |
"cors": "^2.8.5",
|
| 41 |
"dayjs": "^1.11.9",
|
| 42 |
"express": "^4.19.2",
|
| 43 |
"firebase-admin": "^12.1.0",
|
| 44 |
+
"firebase-functions": "^6.1.0",
|
| 45 |
"htmlparser2": "^9.0.0",
|
| 46 |
"jose": "^5.1.0",
|
| 47 |
"langdetect": "^0.2.1",
|
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -24,6 +24,7 @@ import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapsho
|
|
| 24 |
|
| 25 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 26 |
withIframe?: boolean;
|
|
|
|
| 27 |
targetSelector?: string | string[];
|
| 28 |
removeSelector?: string | string[];
|
| 29 |
keepImgDataUrl?: boolean;
|
|
@@ -571,7 +572,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 571 |
}
|
| 572 |
|
| 573 |
try {
|
| 574 |
-
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe) {
|
| 575 |
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
|
| 576 |
yield this.jsdomControl.narrowSnapshot(x, crawlOpts);
|
| 577 |
}
|
|
@@ -686,6 +687,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 686 |
overrideUserAgent: opts.userAgent,
|
| 687 |
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
|
| 688 |
withIframe: opts.withIframe,
|
|
|
|
| 689 |
locale: opts.locale,
|
| 690 |
referer: opts.referer,
|
| 691 |
};
|
|
|
|
| 24 |
|
| 25 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 26 |
withIframe?: boolean;
|
| 27 |
+
withShadowDom?: boolean;
|
| 28 |
targetSelector?: string | string[];
|
| 29 |
removeSelector?: string | string[];
|
| 30 |
keepImgDataUrl?: boolean;
|
|
|
|
| 572 |
}
|
| 573 |
|
| 574 |
try {
|
| 575 |
+
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe || crawlOpts?.withShadowDom) {
|
| 576 |
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
|
| 577 |
yield this.jsdomControl.narrowSnapshot(x, crawlOpts);
|
| 578 |
}
|
|
|
|
| 687 |
overrideUserAgent: opts.userAgent,
|
| 688 |
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
|
| 689 |
withIframe: opts.withIframe,
|
| 690 |
+
withShadowDom: opts.withShadowDom,
|
| 691 |
locale: opts.locale,
|
| 692 |
referer: opts.referer,
|
| 693 |
};
|
backend/functions/src/dto/scrapping-options.ts
CHANGED
|
@@ -101,6 +101,16 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
|
| 101 |
in: 'header',
|
| 102 |
schema: { type: 'string' }
|
| 103 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
'X-User-Agent': {
|
| 105 |
description: `Override User-Agent.`,
|
| 106 |
in: 'header',
|
|
@@ -185,6 +195,11 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 185 |
})
|
| 186 |
withIframe!: boolean;
|
| 187 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
@Prop({
|
| 189 |
arrayOf: String,
|
| 190 |
})
|
|
@@ -283,6 +298,13 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 283 |
if (instance.withIframe) {
|
| 284 |
instance.timeout ??= null;
|
| 285 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
|
| 287 |
const cookies: CookieParam[] = [];
|
| 288 |
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
|
|
|
|
| 101 |
in: 'header',
|
| 102 |
schema: { type: 'string' }
|
| 103 |
},
|
| 104 |
+
'X-With-Iframe': {
|
| 105 |
+
description: `Enable filling iframe contents into main. (violates standards)`,
|
| 106 |
+
in: 'header',
|
| 107 |
+
schema: { type: 'string' }
|
| 108 |
+
},
|
| 109 |
+
'X-With-Shadow-Dom': {
|
| 110 |
+
description: `Enable filling shadow dom contents into main. (violates standards)`,
|
| 111 |
+
in: 'header',
|
| 112 |
+
schema: { type: 'string' }
|
| 113 |
+
},
|
| 114 |
'X-User-Agent': {
|
| 115 |
description: `Override User-Agent.`,
|
| 116 |
in: 'header',
|
|
|
|
| 195 |
})
|
| 196 |
withIframe!: boolean;
|
| 197 |
|
| 198 |
+
@Prop({
|
| 199 |
+
default: false,
|
| 200 |
+
})
|
| 201 |
+
withShadowDom!: boolean;
|
| 202 |
+
|
| 203 |
@Prop({
|
| 204 |
arrayOf: String,
|
| 205 |
})
|
|
|
|
| 298 |
if (instance.withIframe) {
|
| 299 |
instance.timeout ??= null;
|
| 300 |
}
|
| 301 |
+
const withShadowDom = ctx?.req.get('x-with-shadow-dom');
|
| 302 |
+
if (withShadowDom) {
|
| 303 |
+
instance.withShadowDom = Boolean(withShadowDom);
|
| 304 |
+
}
|
| 305 |
+
if (instance.withShadowDom) {
|
| 306 |
+
instance.timeout ??= null;
|
| 307 |
+
}
|
| 308 |
|
| 309 |
const cookies: CookieParam[] = [];
|
| 310 |
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
|
backend/functions/src/index.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import 'reflect-metadata';
|
| 2 |
-
import './shared/lib/doom-domain';
|
| 3 |
import { initializeApp } from 'firebase-admin/app';
|
| 4 |
initializeApp();
|
| 5 |
|
|
|
|
| 1 |
import 'reflect-metadata';
|
| 2 |
+
// import './shared/lib/doom-domain';
|
| 3 |
import { initializeApp } from 'firebase-admin/app';
|
| 4 |
initializeApp();
|
| 5 |
|
backend/functions/src/services/jsdom.ts
CHANGED
|
@@ -5,6 +5,7 @@ import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
|
|
| 5 |
import { Readability } from '@mozilla/readability';
|
| 6 |
import TurndownService from 'turndown';
|
| 7 |
import { Threaded } from '../shared/services/threaded';
|
|
|
|
| 8 |
|
| 9 |
const pLinkedom = import('linkedom');
|
| 10 |
|
|
@@ -27,12 +28,8 @@ export class JSDomControl extends AsyncService {
|
|
| 27 |
this.emit('ready');
|
| 28 |
}
|
| 29 |
|
| 30 |
-
async narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
|
| 31 |
-
targetSelector?
|
| 32 |
-
removeSelector?: string | string[];
|
| 33 |
-
withIframe?: boolean;
|
| 34 |
-
}) {
|
| 35 |
-
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe) {
|
| 36 |
return snapshot;
|
| 37 |
}
|
| 38 |
if (!snapshot?.html) {
|
|
@@ -43,14 +40,13 @@ export class JSDomControl extends AsyncService {
|
|
| 43 |
}
|
| 44 |
|
| 45 |
@Threaded()
|
| 46 |
-
async actualNarrowSnapshot(snapshot: PageSnapshot, options?: {
|
| 47 |
-
targetSelector?: string | string[];
|
| 48 |
-
removeSelector?: string | string[];
|
| 49 |
-
withIframe?: boolean;
|
| 50 |
-
}): Promise<PageSnapshot | undefined> {
|
| 51 |
-
|
| 52 |
const t0 = Date.now();
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
const allNodes: Node[] = [];
|
| 55 |
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
|
| 56 |
if (options?.withIframe) {
|
|
@@ -107,12 +103,12 @@ export class JSDomControl extends AsyncService {
|
|
| 107 |
|
| 108 |
return snapshot;
|
| 109 |
}
|
| 110 |
-
const
|
| 111 |
let rootDoc: Document;
|
| 112 |
if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
|
| 113 |
rootDoc = allNodes[0] as any;
|
| 114 |
if (rootDoc.body.innerText) {
|
| 115 |
-
|
| 116 |
}
|
| 117 |
} else {
|
| 118 |
rootDoc = this.linkedom.parseHTML('<html><body></body></html>').window.document;
|
|
@@ -120,10 +116,16 @@ export class JSDomControl extends AsyncService {
|
|
| 120 |
rootDoc.body.appendChild(n);
|
| 121 |
rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
|
| 122 |
if ((n as HTMLElement).innerText) {
|
| 123 |
-
|
| 124 |
}
|
| 125 |
}
|
| 126 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
let parsed;
|
| 129 |
try {
|
|
@@ -229,6 +231,14 @@ export class JSDomControl extends AsyncService {
|
|
| 229 |
snippetToElement(snippet?: string, url?: string) {
|
| 230 |
const parsed = this.linkedom.parseHTML(snippet || '<html><body></body></html>');
|
| 231 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
return parsed.window.document.documentElement;
|
| 233 |
}
|
| 234 |
|
|
|
|
| 5 |
import { Readability } from '@mozilla/readability';
|
| 6 |
import TurndownService from 'turndown';
|
| 7 |
import { Threaded } from '../shared/services/threaded';
|
| 8 |
+
import type { ExtraScrappingOptions } from '../cloud-functions/crawler';
|
| 9 |
|
| 10 |
const pLinkedom = import('linkedom');
|
| 11 |
|
|
|
|
| 28 |
this.emit('ready');
|
| 29 |
}
|
| 30 |
|
| 31 |
+
async narrowSnapshot(snapshot: PageSnapshot | undefined, options?: ExtraScrappingOptions) {
|
| 32 |
+
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe && !options?.withShadowDom) {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
return snapshot;
|
| 34 |
}
|
| 35 |
if (!snapshot?.html) {
|
|
|
|
| 40 |
}
|
| 41 |
|
| 42 |
@Threaded()
|
| 43 |
+
async actualNarrowSnapshot(snapshot: PageSnapshot, options?: ExtraScrappingOptions): Promise<PageSnapshot | undefined> {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
const t0 = Date.now();
|
| 45 |
+
let sourceHTML = snapshot.html;
|
| 46 |
+
if (options?.withShadowDom && snapshot.shadowExpanded) {
|
| 47 |
+
sourceHTML = snapshot.shadowExpanded;
|
| 48 |
+
}
|
| 49 |
+
const jsdom = this.linkedom.parseHTML(sourceHTML);
|
| 50 |
const allNodes: Node[] = [];
|
| 51 |
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
|
| 52 |
if (options?.withIframe) {
|
|
|
|
| 103 |
|
| 104 |
return snapshot;
|
| 105 |
}
|
| 106 |
+
const textNodes: HTMLElement[] = [];
|
| 107 |
let rootDoc: Document;
|
| 108 |
if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
|
| 109 |
rootDoc = allNodes[0] as any;
|
| 110 |
if (rootDoc.body.innerText) {
|
| 111 |
+
textNodes.push(rootDoc.body);
|
| 112 |
}
|
| 113 |
} else {
|
| 114 |
rootDoc = this.linkedom.parseHTML('<html><body></body></html>').window.document;
|
|
|
|
| 116 |
rootDoc.body.appendChild(n);
|
| 117 |
rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
|
| 118 |
if ((n as HTMLElement).innerText) {
|
| 119 |
+
textNodes.push(n as HTMLElement);
|
| 120 |
}
|
| 121 |
}
|
| 122 |
}
|
| 123 |
+
const textChunks = textNodes.map((x) => {
|
| 124 |
+
const clone = x.cloneNode(true) as HTMLElement;
|
| 125 |
+
clone.querySelectorAll('script,style,link,svg').forEach((s) => s.remove());
|
| 126 |
+
|
| 127 |
+
return clone.innerText;
|
| 128 |
+
});
|
| 129 |
|
| 130 |
let parsed;
|
| 131 |
try {
|
|
|
|
| 231 |
/**
 * Parse an HTML snippet with linkedom and hand back the resulting document's
 * root element, patched for downstream Markdown conversion.
 *
 * @param snippet HTML source; when falsy, an empty `<html><body></body></html>`
 *                document is parsed instead.
 * @param url     Accepted for signature compatibility; not read by this method.
 * @returns The parsed document's `documentElement`.
 */
snippetToElement(snippet?: string, url?: string) {
    const markup = snippet || '<html><body></body></html>';
    const doc = this.linkedom.parseHTML(markup).window.document;

    // Hack for turndown gfm table plugin: give every table an own, enumerable
    // `rows` property holding a snapshot array of its <tr> descendants.
    doc.querySelectorAll('table').forEach((table) => {
        const rowList = Array.from(table.querySelectorAll('tr'));
        Object.defineProperty(table, 'rows', { value: rowList, enumerable: true });
    });

    // Cloning the root element yields the root element itself.
    // NOTE(review): presumably so that consumers which clone their input keep
    // working on the patched tables above - confirm against the turndown call site.
    const rootElement = doc.documentElement;
    Object.defineProperty(rootElement, 'cloneNode', {
        value: function () { return this; },
    });

    return rootElement;
}
|
| 244 |
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -46,6 +46,7 @@ export interface PageSnapshot {
|
|
| 46 |
href: string;
|
| 47 |
rebase?: string;
|
| 48 |
html: string;
|
|
|
|
| 49 |
text: string;
|
| 50 |
status?: number;
|
| 51 |
statusText?: string;
|
|
@@ -157,6 +158,79 @@ function getMaxDepthAndCountUsingTreeWalker(root) {
|
|
| 157 |
};
|
| 158 |
}
|
| 159 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
function giveSnapshot(stopActiveSnapshot) {
|
| 161 |
if (stopActiveSnapshot) {
|
| 162 |
window.haltSnapshot = true;
|
|
@@ -174,6 +248,7 @@ function giveSnapshot(stopActiveSnapshot) {
|
|
| 174 |
href: document.location.href,
|
| 175 |
html: document.documentElement?.outerHTML,
|
| 176 |
text: document.body?.innerText,
|
|
|
|
| 177 |
parsed: parsed,
|
| 178 |
imgs: [],
|
| 179 |
maxElemDepth: domAnalysis.maxDepth,
|
|
|
|
| 46 |
href: string;
|
| 47 |
rebase?: string;
|
| 48 |
html: string;
|
| 49 |
+
shadowExpanded?: string
|
| 50 |
text: string;
|
| 51 |
status?: number;
|
| 52 |
statusText?: string;
|
|
|
|
| 158 |
};
|
| 159 |
}
|
| 160 |
|
| 161 |
+
/**
 * Build a detached copy of `rootElement`'s subtree in which the content of
 * every open shadow root is inlined as ordinary children of its host's clone.
 *
 * For each host reached by the tree walker, the shadow tree is deep-cloned,
 * its <slot> elements are replaced by clones of the host children assigned to
 * them, and the result is appended to the host's clone. This happens when the
 * host is visited, i.e. before the host's own light-DOM children are cloned,
 * so the expanded shadow content precedes them; slotted content therefore
 * appears twice (at its slot position and as a light-DOM child).
 *
 * Only nodes returned by the walker are checked for a shadow root, so hosts
 * that live inside a shadow tree are not expanded further.
 *
 * @param {Element} [rootElement=document.documentElement] Root of the subtree to copy.
 * @returns {Element} The detached clone; the live DOM is not modified.
 */
function cloneAndExpandShadowRoots(rootElement = document.documentElement) {
    // Shallow clone only: descendants are re-created one by one by the walker below.
    const clone = rootElement.cloneNode(false);

    // Inline the open shadow tree of `original` (if any) into `cloned`.
    function processShadowRoot(original, cloned) {
        if (!original.shadowRoot || original.shadowRoot.mode !== 'open') {
            return;
        }

        const shadowContent = document.createDocumentFragment();
        original.shadowRoot.childNodes.forEach((childNode) => {
            shadowContent.appendChild(childNode.cloneNode(true));
        });

        shadowContent.querySelectorAll('slot').forEach((slot) => {
            const slotName = slot.getAttribute('name') || '';
            // Only direct children of the host can be assigned to a slot; matching
            // all descendants would clone nested elements more than once. A missing
            // or empty `slot` attribute targets the default (unnamed) slot.
            // NOTE(review): attribute comparison is used instead of a selector
            // string, which also avoids quoting issues in slot names.
            const assignedElements = Array.from(original.children).filter(
                (el) => (el.getAttribute('slot') || '') === slotName
            );

            if (assignedElements.length === 0) {
                // Nothing assigned: keep the slot and its already-cloned fallback content.
                return;
            }

            const slotContent = document.createDocumentFragment();
            assignedElements.forEach((el) => {
                slotContent.appendChild(el.cloneNode(true));
            });
            slot.parentNode.replaceChild(slotContent, slot);
        });

        cloned.appendChild(shadowContent);
    }

    // Walk the original subtree in document order, mirroring elements and text nodes.
    const treeWalker = document.createTreeWalker(
        rootElement,
        NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT
    );

    // Maps each original element to its clone so children attach to the right parent.
    const elementMap = new Map([[rootElement, clone]]);

    let currentNode;
    while ((currentNode = treeWalker.nextNode())) {
        const parentClone = elementMap.get(currentNode.parentNode);
        const clonedNode = currentNode.cloneNode(false);
        parentClone.appendChild(clonedNode);

        if (currentNode.nodeType === Node.ELEMENT_NODE) {
            elementMap.set(currentNode, clonedNode);
            processShadowRoot(currentNode, clonedNode);
        }
    }

    return clone;
}
|
| 223 |
+
|
| 224 |
+
/**
 * Report whether any descendant element of `rootElement` hosts an open shadow root.
 *
 * @param {Element} [rootElement=document.documentElement] Subtree to inspect;
 *        only elements returned by `querySelectorAll('*')` are examined.
 * @returns {boolean} `true` as soon as one open shadow root is found, otherwise `false`.
 */
function shadowDomPresent(rootElement = document.documentElement) {
    const candidates = Array.from(rootElement.querySelectorAll('*'));

    return candidates.some(
        (candidate) => Boolean(candidate.shadowRoot) && candidate.shadowRoot.mode === 'open'
    );
}
|
| 233 |
+
|
| 234 |
function giveSnapshot(stopActiveSnapshot) {
|
| 235 |
if (stopActiveSnapshot) {
|
| 236 |
window.haltSnapshot = true;
|
|
|
|
| 248 |
href: document.location.href,
|
| 249 |
html: document.documentElement?.outerHTML,
|
| 250 |
text: document.body?.innerText,
|
| 251 |
+
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
|
| 252 |
parsed: parsed,
|
| 253 |
imgs: [],
|
| 254 |
maxElemDepth: domAnalysis.maxDepth,
|
backend/functions/src/services/snapshot-formatter.ts
CHANGED
|
@@ -299,12 +299,12 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 299 |
&& toBeTurnedToMd !== jsDomElementOfHTML
|
| 300 |
) {
|
| 301 |
try {
|
| 302 |
-
contentText = this.jsdomControl.runTurndown(turnDownService,
|
| 303 |
} catch (err) {
|
| 304 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 305 |
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
| 306 |
try {
|
| 307 |
-
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService,
|
| 308 |
} catch (err2) {
|
| 309 |
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
| 310 |
}
|
|
|
|
| 299 |
&& toBeTurnedToMd !== jsDomElementOfHTML
|
| 300 |
) {
|
| 301 |
try {
|
| 302 |
+
contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
| 303 |
} catch (err) {
|
| 304 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 305 |
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
| 306 |
try {
|
| 307 |
+
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML);
|
| 308 |
} catch (err2) {
|
| 309 |
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
| 310 |
}
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit fecbdd92230de5ebd0de168b43b0358d8221769f
|