nomagick committed on
Commit
57cbae8
·
unverified ·
1 Parent(s): 77c8480

fix: jsdom, cache tolerance, screenshot pricing

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -24,6 +24,7 @@ import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-a
24
  import { PDFExtractor } from '../services/pdf-extract';
25
  import { DomainBlockade } from '../db/domain-blockade';
26
  import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
 
27
 
28
  const md5Hasher = new HashManager('md5', 'hex');
29
 
@@ -74,6 +75,7 @@ export class CrawlerHost extends RPCHost {
74
  constructor(
75
  protected globalLogger: Logger,
76
  protected puppeteerControl: PuppeteerControl,
 
77
  protected altTextService: AltTextService,
78
  protected pdfExtractor: PDFExtractor,
79
  protected firebaseObjectStorage: FirebaseStorageBucketControl,
@@ -247,7 +249,7 @@ export class CrawlerHost extends RPCHost {
247
  }
248
 
249
  getGeneralSnapshotMixins(snapshot: PageSnapshot) {
250
- const inferred = this.puppeteerControl.inferSnapshot(snapshot);
251
  const mixin: any = {};
252
  if (this.threadLocal.get('withImagesSummary')) {
253
  const imageSummary = {} as { [k: string]: string; };
@@ -296,6 +298,7 @@ export class CrawlerHost extends RPCHost {
296
 
297
  return {
298
  ...this.getGeneralSnapshotMixins(snapshot),
 
299
  screenshotUrl: snapshot.screenshotUrl,
300
  toString() {
301
  return this.screenshotUrl;
@@ -353,16 +356,20 @@ export class CrawlerHost extends RPCHost {
353
  break;
354
  }
355
 
356
- let toBeTurnedToMd = snapshot.html;
 
357
  let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
358
  if (mode !== 'markdown' && snapshot.parsed?.content) {
359
- const par1 = turnDownService.turndown(snapshot.html);
360
- const par2 = turnDownService.turndown(snapshot.parsed.content);
 
361
 
362
  // If Readability did its job
363
  if (par2.length >= 0.3 * par1.length) {
364
  turnDownService = this.getTurndown({ noRules: true, url: snapshot.href, imgDataUrlToObjectUrl });
365
- toBeTurnedToMd = snapshot.parsed.content;
 
 
366
  }
367
  }
368
 
@@ -453,7 +460,7 @@ export class CrawlerHost extends RPCHost {
453
 
454
  if (
455
  !contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
456
- && toBeTurnedToMd !== snapshot.html
457
  ) {
458
  try {
459
  contentText = turnDownService.turndown(snapshot.html);
@@ -533,7 +540,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
533
  .value();
534
  }
535
  if (this.threadLocal.get('withLinksSummary')) {
536
- formatted.links = _.invert(this.puppeteerControl.inferSnapshot(snapshot).links || {});
537
  }
538
 
539
  return formatted as FormattedPage;
@@ -890,19 +897,19 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
890
  text: '',
891
  } as PageSnapshot;
892
 
893
- yield this.puppeteerControl.narrowSnapshot(fakeSnapshot, crawlOpts);
894
 
895
  return;
896
  }
897
  let cache;
898
 
899
- const cacheTolerance = crawlerOpts?.cacheTolerance || this.cacheValidMs;
900
  if (cacheTolerance && !crawlOpts?.cookies?.length) {
901
  cache = await this.queryCache(urlToCrawl, cacheTolerance);
902
  }
903
 
904
  if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
905
- yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts);
906
 
907
  return;
908
  }
@@ -910,7 +917,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
910
  try {
911
  if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe) {
912
  for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
913
- yield this.puppeteerControl.narrowSnapshot(x, crawlOpts);
914
  }
915
 
916
  return;
@@ -920,7 +927,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
920
  } catch (err: any) {
921
  if (cache && !(err instanceof SecurityCompromiseError)) {
922
  this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
923
- yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts);
924
  return;
925
  }
926
  throw err;
@@ -1051,5 +1058,4 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
1051
 
1052
  return this.formatSnapshot(mode, lastSnapshot, url);
1053
  }
1054
-
1055
  }
 
24
  import { PDFExtractor } from '../services/pdf-extract';
25
  import { DomainBlockade } from '../db/domain-blockade';
26
  import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
27
+ import { JSDomControl } from '../services/jsdom';
28
 
29
  const md5Hasher = new HashManager('md5', 'hex');
30
 
 
75
  constructor(
76
  protected globalLogger: Logger,
77
  protected puppeteerControl: PuppeteerControl,
78
+ protected jsdomControl: JSDomControl,
79
  protected altTextService: AltTextService,
80
  protected pdfExtractor: PDFExtractor,
81
  protected firebaseObjectStorage: FirebaseStorageBucketControl,
 
249
  }
250
 
251
  getGeneralSnapshotMixins(snapshot: PageSnapshot) {
252
+ const inferred = this.jsdomControl.inferSnapshot(snapshot);
253
  const mixin: any = {};
254
  if (this.threadLocal.get('withImagesSummary')) {
255
  const imageSummary = {} as { [k: string]: string; };
 
298
 
299
  return {
300
  ...this.getGeneralSnapshotMixins(snapshot),
301
+ html: snapshot.html,
302
  screenshotUrl: snapshot.screenshotUrl,
303
  toString() {
304
  return this.screenshotUrl;
 
356
  break;
357
  }
358
 
359
+ const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
360
+ let toBeTurnedToMd = jsDomElementOfHTML;
361
  let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
362
  if (mode !== 'markdown' && snapshot.parsed?.content) {
363
+ const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
364
+ const par1 = turnDownService.turndown(jsDomElementOfHTML);
365
+ const par2 = snapshot.parsed.content ? turnDownService.turndown(jsDomElementOfParsed) : '';
366
 
367
  // If Readability did its job
368
  if (par2.length >= 0.3 * par1.length) {
369
  turnDownService = this.getTurndown({ noRules: true, url: snapshot.href, imgDataUrlToObjectUrl });
370
+ if (snapshot.parsed.content) {
371
+ toBeTurnedToMd = jsDomElementOfParsed;
372
+ }
373
  }
374
  }
375
 
 
460
 
461
  if (
462
  !contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
463
+ && toBeTurnedToMd !== jsDomElementOfHTML
464
  ) {
465
  try {
466
  contentText = turnDownService.turndown(snapshot.html);
 
540
  .value();
541
  }
542
  if (this.threadLocal.get('withLinksSummary')) {
543
+ formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {});
544
  }
545
 
546
  return formatted as FormattedPage;
 
897
  text: '',
898
  } as PageSnapshot;
899
 
900
+ yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
901
 
902
  return;
903
  }
904
  let cache;
905
 
906
+ const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
907
  if (cacheTolerance && !crawlOpts?.cookies?.length) {
908
  cache = await this.queryCache(urlToCrawl, cacheTolerance);
909
  }
910
 
911
  if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
912
+ yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
913
 
914
  return;
915
  }
 
917
  try {
918
  if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe) {
919
  for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
920
+ yield this.jsdomControl.narrowSnapshot(x, crawlOpts);
921
  }
922
 
923
  return;
 
927
  } catch (err: any) {
928
  if (cache && !(err instanceof SecurityCompromiseError)) {
929
  this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
930
+ yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
931
  return;
932
  }
933
  throw err;
 
1058
 
1059
  return this.formatSnapshot(mode, lastSnapshot, url);
1060
  }
 
1061
  }
backend/functions/src/cloud-functions/searcher.ts CHANGED
@@ -173,7 +173,7 @@ export class SearcherHost extends RPCHost {
173
  }
174
 
175
  const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
176
- { ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance || this.pageCacheToleranceMs },
177
  count,
178
  );
179
 
 
173
  }
174
 
175
  const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
176
+ { ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs },
177
  count,
178
  );
179
 
backend/functions/src/services/jsdom.ts ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { container, singleton } from 'tsyringe';
2
+ import { AsyncService, marshalErrorLike } from 'civkit';
3
+ import { Logger } from '../shared/services/logger';
4
+ import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
5
+ import { JSDOM, VirtualConsole } from 'jsdom';
6
+ import { Readability } from '@mozilla/readability';
7
+
8
+ const virtualConsole = new VirtualConsole();
9
+ virtualConsole.on('error', () => void 0);
10
+
11
/**
 * JSDOM-backed helpers for post-processing page snapshots outside the browser:
 * narrowing a snapshot to selected elements, inferring links/images from its
 * HTML, and turning an HTML snippet into a DOM element.
 */
@singleton()
export class JSDomControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

    /**
     * Re-derives a snapshot from its HTML, optionally inlining child frames,
     * removing elements and keeping only the elements matched by the target
     * selector(s).
     *
     * @param snapshot Snapshot to narrow. Returned untouched when it has no HTML,
     *   when it is already parsed and no option is set, or when the target
     *   selector(s) match nothing.
     * @param options.targetSelector CSS selector(s) of the elements to keep.
     * @param options.removeSelector CSS selector(s) of the elements to drop.
     * @param options.withIframe Inline the HTML of matching `childFrames` into
     *   their `iframe`/`frame` elements.
     * @returns A new snapshot with `title`, `parsed`, `html`, `text` and `imgs`
     *   recomputed, or the input snapshot in the cases listed above.
     */
    narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
        targetSelector?: string | string[];
        removeSelector?: string | string[];
        withIframe?: boolean;
    }): PageSnapshot | undefined {
        if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe) {
            return snapshot;
        }
        if (!snapshot?.html) {
            return snapshot;
        }

        const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
        const sourceDoc = jsdom.window.document;

        if (options?.withIframe) {
            sourceDoc.querySelectorAll('iframe[src],frame[src]').forEach((frameEl) => {
                const src = frameEl.getAttribute('src');
                const frameSnapshot = snapshot.childFrames?.find((f) => f.href === src);
                if (!src || !frameSnapshot?.html) {
                    return;
                }
                frameEl.innerHTML = frameSnapshot.html;
                frameEl.querySelectorAll('script, style').forEach((s) => s.remove());
                // Inlined content is relative to the frame's URL, not the host page's.
                for (const attr of ['src', 'href']) {
                    frameEl.querySelectorAll(`[${attr}]`).forEach((el) => {
                        const raw = el.getAttribute(attr);
                        if (raw === null) {
                            return;
                        }
                        try {
                            el.setAttribute(attr, new URL(raw, src).toString());
                        } catch (_err) {
                            // Malformed URL: keep the attribute as authored rather than
                            // aborting the whole narrowing.
                        }
                    });
                }
            });
        }

        for (const selector of this.toSelectorList(options?.removeSelector)) {
            sourceDoc.querySelectorAll(selector).forEach((el) => el.remove());
        }

        // A Set keeps first-seen order while de-duplicating elements matched by
        // more than one selector.
        const selected = new Set<Node>();
        const targetSelector = options?.targetSelector;
        if (Array.isArray(targetSelector) || targetSelector) {
            for (const selector of this.toSelectorList(targetSelector)) {
                sourceDoc.querySelectorAll(selector).forEach((el) => selected.add(el));
            }
        } else {
            selected.add(sourceDoc);
        }

        if (!selected.size) {
            return snapshot;
        }
        const allNodes = Array.from(selected);

        const textChunks: string[] = [];
        let rootDoc: Document;
        if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
            rootDoc = allNodes[0] as any;
            if (rootDoc.body.textContent) {
                textChunks.push(rootDoc.body.textContent);
            }
        } else {
            // Move the selected elements into a fresh document sharing the page URL.
            rootDoc = new JSDOM('', { url: snapshot.href, virtualConsole }).window.document;
            for (const n of allNodes) {
                rootDoc.body.appendChild(n);
                rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
                if (n.textContent) {
                    textChunks.push(n.textContent);
                }
            }
        }

        let parsed;
        try {
            // Readability runs on a clone so rootDoc stays usable below.
            parsed = new Readability(rootDoc.cloneNode(true) as any).parse();
        } catch (err: any) {
            this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
        }

        // No innerText in jsdom
        // https://github.com/jsdom/jsdom/issues/1245
        const textContent = textChunks.join('\n\n');
        const cleanedText = textContent.split('\n').map((line) => line.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n');

        const imageSet = new Set<string>();
        rootDoc.querySelectorAll('img[src],img[data-src]').forEach((img) => {
            for (const attr of ['src', 'data-src']) {
                const raw = img.getAttribute(attr);
                if (!raw) {
                    // A missing attribute must not be stringified into a ".../null" URL.
                    continue;
                }
                try {
                    imageSet.add(new URL(raw, snapshot.href).toString());
                } catch (_err) {
                    // Unresolvable image URL: ignore it.
                }
            }
        });

        const r = {
            ...snapshot,
            title: snapshot.title || jsdom.window.document.title,
            parsed,
            html: rootDoc.documentElement.outerHTML,
            text: cleanedText,
            imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
        } as PageSnapshot;

        return r;
    }

    /**
     * Builds a copy of the snapshot extended with `links` (absolute URL -> anchor
     * text) and `imgs` inferred from its HTML.
     *
     * Anchors without text, with unresolvable URLs, or with `file:` /
     * `javascript:` URLs are skipped. Images without a usable URL are skipped.
     * If the HTML cannot be processed at all, the failure is logged and the
     * plain copy of the snapshot is returned.
     */
    inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
        const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
        try {
            const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
            const doc = jsdom.window.document;

            const links: { [k: string]: string; } = {};
            doc.querySelectorAll('a[href]').forEach((anchor) => {
                const text = (anchor.textContent || '').replace(/\s+/g, ' ').trim();
                if (!text) {
                    return;
                }
                try {
                    const resolved = new URL(anchor.getAttribute('href') || '', snapshot.href);
                    if (resolved.protocol === 'file:' || resolved.protocol === 'javascript:') {
                        return;
                    }
                    // Later anchors pointing at the same URL overwrite earlier ones.
                    links[resolved.toString()] = text;
                } catch (_err) {
                    // Unresolvable href: skip this anchor.
                }
            });
            extendedSnapshot.links = links;

            const imgs: { src: string; width: number; height: number; alt: string | null; }[] = [];
            doc.querySelectorAll('img[src],img[data-src]').forEach((img) => {
                let linkPreferredSrc = img.getAttribute('src') || '';
                // Fall back to data-src when src is absent or an inline data: URL.
                if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
                    const dataSrc = img.getAttribute('data-src') || '';
                    if (dataSrc && !dataSrc.startsWith('data:')) {
                        linkPreferredSrc = dataSrc;
                    }
                }
                if (!linkPreferredSrc) {
                    return;
                }

                let src: string;
                try {
                    src = new URL(linkPreferredSrc, snapshot.href).toString();
                } catch (_err) {
                    // One malformed image URL must not discard every other image.
                    return;
                }

                imgs.push({
                    src,
                    width: parseInt(img.getAttribute('width') || '0', 10),
                    height: parseInt(img.getAttribute('height') || '0', 10),
                    alt: img.getAttribute('alt') || img.getAttribute('title'),
                });
            });

            extendedSnapshot.imgs = imgs as any;
        } catch (err: any) {
            // Best effort: keep whatever was inferred so far, but leave a trace.
            this.logger.warn(`Failed to infer links and images from snapshot`, { err: marshalErrorLike(err) });
        }

        return extendedSnapshot;
    }

    /**
     * Parses an HTML snippet with JSDOM and returns the resulting document's
     * root element.
     *
     * @param snippet HTML source; an empty string is parsed when omitted.
     * @param url Passed to JSDOM as the document URL.
     */
    snippetToElement(snippet?: string, url?: string) {
        const parsed = new JSDOM(snippet || '', { url, virtualConsole });

        return parsed.window.document.documentElement;
    }

    /** Normalizes a selector option (absent, single, or list) to a list. */
    protected toSelectorList(selector?: string | string[]): string[] {
        if (Array.isArray(selector)) {
            return selector;
        }

        return selector ? [selector] : [];
    }
}
203
+
204
+ const jsdomControl = container.resolve(JSDomControl);
205
+
206
+ export default jsdomControl;
backend/functions/src/services/puppeteer.ts CHANGED
@@ -3,7 +3,6 @@ import fs from 'fs';
3
  import { container, singleton } from 'tsyringe';
4
  import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
5
  import { Logger } from '../shared/services/logger';
6
- import { JSDOM, VirtualConsole } from 'jsdom';
7
 
8
  import type { Browser, CookieParam, Page } from 'puppeteer';
9
  import puppeteer from 'puppeteer-extra';
@@ -11,16 +10,12 @@ import puppeteer from 'puppeteer-extra';
11
  import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
12
  import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
13
  import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
14
- import { Readability } from '@mozilla/readability';
15
  import { TimeoutError } from 'puppeteer';
16
  const tldExtract = require('tld-extract');
17
 
18
  const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
19
 
20
 
21
- const virtualConsole = new VirtualConsole();
22
- virtualConsole.on('error', () => void 0);
23
-
24
  export interface ImgBrief {
25
  src: string;
26
  loaded?: boolean;
@@ -685,175 +680,6 @@ document.addEventListener('load', handlePageLoad);
685
  return r.filter(Boolean);
686
  }
687
 
688
- narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
689
- targetSelector?: string | string[];
690
- removeSelector?: string | string[];
691
- withIframe?: boolean;
692
- }): PageSnapshot | undefined {
693
- if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector) {
694
- return snapshot;
695
- }
696
- if (!snapshot?.html) {
697
- return snapshot;
698
- }
699
-
700
- const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
701
- const allNodes: Node[] = [];
702
- if (options?.withIframe) {
703
- jsdom.window.document.querySelectorAll('iframe[src]').forEach((x) => {
704
- const src = x.getAttribute('src');
705
- const thisSnapshot = snapshot.childFrames?.find((f) => f.href === src);
706
- if (thisSnapshot?.html) {
707
- x.innerHTML = thisSnapshot.html;
708
- x.querySelectorAll('script, style').forEach((s) => s.remove());
709
- x.querySelectorAll('[src]').forEach((el) => {
710
- el.setAttribute('src', new URL(el.getAttribute('src')!, src!).toString());
711
- });
712
- x.querySelectorAll('[href]').forEach((el) => {
713
- el.setAttribute('href', new URL(el.getAttribute('href')!, src!).toString());
714
- });
715
- }
716
- });
717
- }
718
-
719
- if (Array.isArray(options?.removeSelector)) {
720
- for (const rl of options!.removeSelector) {
721
- jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
722
- }
723
- } else if (options?.removeSelector) {
724
- jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
725
- }
726
-
727
- if (Array.isArray(options?.targetSelector)) {
728
- for (const x of options!.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
729
- x.forEach((el) => {
730
- if (!allNodes.includes(el)) {
731
- allNodes.push(el);
732
- }
733
- });
734
- }
735
- } else if (options?.targetSelector) {
736
- jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
737
- if (!allNodes.includes(el)) {
738
- allNodes.push(el);
739
- }
740
- });
741
- } else {
742
- allNodes.push(jsdom.window.document);
743
- }
744
-
745
- if (!allNodes.length) {
746
- return snapshot;
747
- }
748
- const textChunks: string[] = [];
749
- let rootDoc: Document;
750
- if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
751
- rootDoc = allNodes[0] as any;
752
- if (rootDoc.body.textContent) {
753
- textChunks.push(rootDoc.body.textContent);
754
- }
755
- } else {
756
- rootDoc = new JSDOM('', { url: snapshot.href, virtualConsole }).window.document;
757
- for (const n of allNodes) {
758
- rootDoc.body.appendChild(n);
759
- rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
760
- if (n.textContent) {
761
- textChunks.push(n.textContent);
762
- }
763
- }
764
- }
765
-
766
- let parsed;
767
- try {
768
- parsed = new Readability(rootDoc.cloneNode(true) as any).parse();
769
- } catch (err: any) {
770
- this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
771
- }
772
-
773
- // No innerText in jsdom
774
- // https://github.com/jsdom/jsdom/issues/1245
775
- const textContent = textChunks.join('\n\n');
776
- const cleanedText = textContent?.split('\n').map((x: any) => x.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n');
777
-
778
- const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
779
- .map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
780
- .flat()
781
- .map((x) => {
782
- try {
783
- return new URL(x, snapshot.href).toString();
784
- } catch (err) {
785
- return null;
786
- }
787
- })
788
- .filter(Boolean);
789
-
790
- const imageSet = new Set(imageTags);
791
-
792
- const r = {
793
- ...snapshot,
794
- title: snapshot.title || jsdom.window.document.title,
795
- parsed,
796
- html: rootDoc.documentElement.outerHTML,
797
- text: cleanedText,
798
- imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
799
- } as PageSnapshot;
800
-
801
- return r;
802
- }
803
-
804
- inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
805
- const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
806
- try {
807
- const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
808
- const links = Array.from(jsdom.window.document.querySelectorAll('a[href]'))
809
- .map((x: any) => [x.getAttribute('href'), x.textContent.replace(/\s+/g, ' ').trim()])
810
- .map(([href, text]) => {
811
- if (!text) {
812
- return undefined;
813
- }
814
- try {
815
- const parsed = new URL(href, snapshot.href);
816
- if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') {
817
- return undefined;
818
- }
819
- return [parsed.toString(), text] as const;
820
- } catch (err) {
821
- return undefined;
822
- }
823
- })
824
- .filter(Boolean)
825
- .reduce((acc, pair) => {
826
- acc[pair![0]] = pair![1];
827
- return acc;
828
- }, {} as { [k: string]: string; });
829
-
830
- extendedSnapshot.links = links;
831
-
832
- const imgs = Array.from(jsdom.window.document.querySelectorAll('img[src],img[data-src]'))
833
- .map((x: any) => {
834
- let linkPreferredSrc = x.getAttribute('src') || '';
835
- if (linkPreferredSrc.startsWith('data:')) {
836
- const dataSrc = x.getAttribute('data-src') || '';
837
- if (dataSrc && !dataSrc.startsWith('data:')) {
838
- linkPreferredSrc = dataSrc;
839
- }
840
- }
841
-
842
- return {
843
- src: new URL(linkPreferredSrc, snapshot.href).toString(),
844
- width: parseInt(x.getAttribute('width') || '0'),
845
- height: parseInt(x.getAttribute('height') || '0'),
846
- alt: x.getAttribute('alt') || x.getAttribute('title'),
847
- };
848
- });
849
-
850
- extendedSnapshot.imgs = imgs as any;
851
- } catch (_err) {
852
- void 0;
853
- }
854
-
855
- return extendedSnapshot;
856
- }
857
  }
858
 
859
  const puppeteerControl = container.resolve(PuppeteerControl);
 
3
  import { container, singleton } from 'tsyringe';
4
  import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
5
  import { Logger } from '../shared/services/logger';
 
6
 
7
  import type { Browser, CookieParam, Page } from 'puppeteer';
8
  import puppeteer from 'puppeteer-extra';
 
10
  import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
11
  import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
12
  import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
 
13
  import { TimeoutError } from 'puppeteer';
14
  const tldExtract = require('tld-extract');
15
 
16
  const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
17
 
18
 
 
 
 
19
  export interface ImgBrief {
20
  src: string;
21
  loaded?: boolean;
 
680
  return r.filter(Boolean);
681
  }
682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
683
  }
684
 
685
  const puppeteerControl = container.resolve(PuppeteerControl);