nomagick committed on
Commit
9415c6a
·
unverified ·
1 Parent(s): 6027963

fix: track if snapshot html modified by js

Browse files
src/api/crawler.ts CHANGED
@@ -15,7 +15,7 @@ import { Defer } from 'civkit/defer';
15
  import { retryWith } from 'civkit/decorators';
16
  import { FancyFile } from 'civkit/fancy-file';
17
 
18
- import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/crawler-options';
19
 
20
  import { Crawled } from '../db/crawled';
21
  import { DomainBlockade } from '../db/domain-blockade';
@@ -585,6 +585,7 @@ export class CrawlerHost extends RPCHost {
585
  url: urlToCrawl.toString(),
586
  createdAt: nowDate,
587
  expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
 
588
  urlPathDigest: digest,
589
  });
590
 
@@ -732,6 +733,12 @@ export class CrawlerHost extends RPCHost {
732
  cache = await this.queryCache(urlToCrawl, cacheTolerance);
733
  }
734
 
 
 
 
 
 
 
735
  if (cache?.isFresh &&
736
  (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
737
  (_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
 
15
  import { retryWith } from 'civkit/decorators';
16
  import { FancyFile } from 'civkit/fancy-file';
17
 
18
+ import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE, RESPOND_TIMING } from '../dto/crawler-options';
19
 
20
  import { Crawled } from '../db/crawled';
21
  import { DomainBlockade } from '../db/domain-blockade';
 
585
  url: urlToCrawl.toString(),
586
  createdAt: nowDate,
587
  expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
588
+ htmlModifiedByJs: snapshot.htmlModifiedByJs,
589
  urlPathDigest: digest,
590
  });
591
 
 
733
  cache = await this.queryCache(urlToCrawl, cacheTolerance);
734
  }
735
 
736
+ if (cache?.htmlModifiedByJs === false) {
737
+ if (crawlerOpts) {
738
+ crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML;
739
+ }
740
+ }
741
+
742
  if (cache?.isFresh &&
743
  (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
744
  (_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
src/db/crawled.ts CHANGED
@@ -21,6 +21,9 @@ export class Crawled extends FirestoreRecord {
21
  })
22
  urlPathDigest!: string;
23
 
 
 
 
24
  @Prop()
25
  snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };
26
 
 
21
  })
22
  urlPathDigest!: string;
23
 
24
+ @Prop()
25
+ htmlModifiedByJs?: boolean;
26
+
27
  @Prop()
28
  snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };
29
 
src/dto/crawler-options.ts CHANGED
@@ -584,13 +584,6 @@ export class CrawlerOptions extends AutoCastable {
584
  if (respondTiming) {
585
  instance.respondTiming ??= respondTiming as RESPOND_TIMING;
586
  }
587
- if (instance.timeout) {
588
- instance.respondTiming ??= RESPOND_TIMING.NETWORK_IDLE;
589
- }
590
- if (instance.respondWith.includes('shot') || instance.respondWith.includes('vlm')) {
591
- instance.respondTiming ??= RESPOND_TIMING.MEDIA_IDLE;
592
- }
593
- instance.respondTiming ??= RESPOND_TIMING.RESOURCE_IDLE;
594
 
595
  if (instance.cacheTolerance) {
596
  instance.cacheTolerance = instance.cacheTolerance * 1000;
@@ -603,14 +596,29 @@ export class CrawlerOptions extends AutoCastable {
603
  return instance;
604
  }
605
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
606
  isSnapshotAcceptableForEarlyResponse(snapshot: PageSnapshot) {
607
  if (this.waitForSelector?.length) {
608
  return false;
609
  }
610
- if (this.respondTiming === RESPOND_TIMING.HTML && snapshot.html) {
 
611
  return true;
612
  }
613
- if (this.respondTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) {
614
  const now = Date.now();
615
  if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
616
  return true;
@@ -622,7 +630,7 @@ export class CrawlerOptions extends AutoCastable {
622
  if ((this.respondWith.includes('vlm') || this.respondWith.includes('screenshot')) && !snapshot.screenshot) {
623
  return false;
624
  }
625
- if (this.respondTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded && snapshot.lastMutationIdle) {
626
  const now = Date.now();
627
  if ((snapshot.lastContentResourceLoaded + 500) < now) {
628
  return true;
@@ -632,10 +640,10 @@ export class CrawlerOptions extends AutoCastable {
632
  if (this.injectFrameScript?.length || this.injectPageScript?.length) {
633
  return false;
634
  }
635
- if (this.respondTiming === RESPOND_TIMING.NETWORK_IDLE) {
636
  return false;
637
  }
638
- if (this.respondTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) {
639
  return true;
640
  }
641
  if (this.respondWith.includes('lm')) {
 
584
  if (respondTiming) {
585
  instance.respondTiming ??= respondTiming as RESPOND_TIMING;
586
  }
 
 
 
 
 
 
 
587
 
588
  if (instance.cacheTolerance) {
589
  instance.cacheTolerance = instance.cacheTolerance * 1000;
 
596
  return instance;
597
  }
598
 
599
+ get presumedRespondTiming() {
600
+ if (this.respondTiming) {
601
+ return this.respondTiming;
602
+ }
603
+ if (this.timeout) {
604
+ return RESPOND_TIMING.NETWORK_IDLE;
605
+ }
606
+ if (this.respondWith.includes('shot') || this.respondWith.includes('vlm')) {
607
+ return RESPOND_TIMING.MEDIA_IDLE;
608
+ }
609
+
610
+ return RESPOND_TIMING.RESOURCE_IDLE;
611
+ }
612
+
613
  isSnapshotAcceptableForEarlyResponse(snapshot: PageSnapshot) {
614
  if (this.waitForSelector?.length) {
615
  return false;
616
  }
617
+ const presumedTiming = this.presumedRespondTiming;
618
+ if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
619
  return true;
620
  }
621
+ if (presumedTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) {
622
  const now = Date.now();
623
  if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
624
  return true;
 
630
  if ((this.respondWith.includes('vlm') || this.respondWith.includes('screenshot')) && !snapshot.screenshot) {
631
  return false;
632
  }
633
+ if (presumedTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded && snapshot.lastMutationIdle) {
634
  const now = Date.now();
635
  if ((snapshot.lastContentResourceLoaded + 500) < now) {
636
  return true;
 
640
  if (this.injectFrameScript?.length || this.injectPageScript?.length) {
641
  return false;
642
  }
643
+ if (presumedTiming === RESPOND_TIMING.NETWORK_IDLE) {
644
  return false;
645
  }
646
+ if (presumedTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) {
647
  return true;
648
  }
649
  if (this.respondWith.includes('lm')) {
src/services/puppeteer.ts CHANGED
@@ -55,6 +55,7 @@ export interface PageSnapshot {
55
  href: string;
56
  rebase?: string;
57
  html: string;
 
58
  shadowExpanded?: string;
59
  text: string;
60
  status?: number;
@@ -377,9 +378,11 @@ function shadowDomPresent(rootElement = document.documentElement) {
377
  }
378
 
379
  let lastMutationIdle = 0;
 
380
  document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
381
 
382
  function giveSnapshot(stopActiveSnapshot) {
 
383
  if (stopActiveSnapshot) {
384
  window.haltSnapshot = true;
385
  }
@@ -395,6 +398,7 @@ function giveSnapshot(stopActiveSnapshot) {
395
  description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
396
  href: document.location.href,
397
  html: document.documentElement?.outerHTML,
 
398
  text: document.body?.innerText,
399
  shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
400
  parsed: parsed,
@@ -403,6 +407,9 @@ function giveSnapshot(stopActiveSnapshot) {
403
  elemCount: domAnalysis.elementCount,
404
  lastMutationIdle,
405
  };
 
 
 
406
  if (document.baseURI !== r.href) {
407
  r.rebase = document.baseURI;
408
  }
 
55
  href: string;
56
  rebase?: string;
57
  html: string;
58
+ htmlModifiedByJs?: boolean;
59
  shadowExpanded?: string;
60
  text: string;
61
  status?: number;
 
378
  }
379
 
380
  let lastMutationIdle = 0;
381
+ let initialHTML;
382
  document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
383
 
384
  function giveSnapshot(stopActiveSnapshot) {
385
+ initialHTML ??= document.documentElement?.outerHTML;
386
  if (stopActiveSnapshot) {
387
  window.haltSnapshot = true;
388
  }
 
398
  description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
399
  href: document.location.href,
400
  html: document.documentElement?.outerHTML,
401
+ htmlModifiedByJs: false,
402
  text: document.body?.innerText,
403
  shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
404
  parsed: parsed,
 
407
  elemCount: domAnalysis.elementCount,
408
  lastMutationIdle,
409
  };
410
+ if (initialHTML) {
411
+ r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded;
412
+ }
413
  if (document.baseURI !== r.href) {
414
  r.rebase = document.baseURI;
415
  }