Spaces:
Build error
Build error
fix: track if snapshot html modified by js
Browse files
- src/api/crawler.ts +8 -1
- src/db/crawled.ts +3 -0
- src/dto/crawler-options.ts +20 -12
- src/services/puppeteer.ts +7 -0
src/api/crawler.ts
CHANGED
|
@@ -15,7 +15,7 @@ import { Defer } from 'civkit/defer';
|
|
| 15 |
import { retryWith } from 'civkit/decorators';
|
| 16 |
import { FancyFile } from 'civkit/fancy-file';
|
| 17 |
|
| 18 |
-
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/crawler-options';
|
| 19 |
|
| 20 |
import { Crawled } from '../db/crawled';
|
| 21 |
import { DomainBlockade } from '../db/domain-blockade';
|
|
@@ -585,6 +585,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 585 |
url: urlToCrawl.toString(),
|
| 586 |
createdAt: nowDate,
|
| 587 |
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
|
|
|
|
| 588 |
urlPathDigest: digest,
|
| 589 |
});
|
| 590 |
|
|
@@ -732,6 +733,12 @@ export class CrawlerHost extends RPCHost {
|
|
| 732 |
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
| 733 |
}
|
| 734 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 735 |
if (cache?.isFresh &&
|
| 736 |
(!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
|
| 737 |
(_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
|
|
|
|
| 15 |
import { retryWith } from 'civkit/decorators';
|
| 16 |
import { FancyFile } from 'civkit/fancy-file';
|
| 17 |
|
| 18 |
+
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE, RESPOND_TIMING } from '../dto/crawler-options';
|
| 19 |
|
| 20 |
import { Crawled } from '../db/crawled';
|
| 21 |
import { DomainBlockade } from '../db/domain-blockade';
|
|
|
|
| 585 |
url: urlToCrawl.toString(),
|
| 586 |
createdAt: nowDate,
|
| 587 |
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
|
| 588 |
+
htmlModifiedByJs: snapshot.htmlModifiedByJs,
|
| 589 |
urlPathDigest: digest,
|
| 590 |
});
|
| 591 |
|
|
|
|
| 733 |
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
| 734 |
}
|
| 735 |
|
| 736 |
+
if (cache?.htmlModifiedByJs === false) {
|
| 737 |
+
if (crawlerOpts) {
|
| 738 |
+
crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML;
|
| 739 |
+
}
|
| 740 |
+
}
|
| 741 |
+
|
| 742 |
if (cache?.isFresh &&
|
| 743 |
(!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
|
| 744 |
(_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
|
src/db/crawled.ts
CHANGED
|
@@ -21,6 +21,9 @@ export class Crawled extends FirestoreRecord {
|
|
| 21 |
})
|
| 22 |
urlPathDigest!: string;
|
| 23 |
|
|
|
|
|
|
|
|
|
|
| 24 |
@Prop()
|
| 25 |
snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };
|
| 26 |
|
|
|
|
| 21 |
})
|
| 22 |
urlPathDigest!: string;
|
| 23 |
|
| 24 |
+
@Prop()
|
| 25 |
+
htmlModifiedByJs?: boolean;
|
| 26 |
+
|
| 27 |
@Prop()
|
| 28 |
snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };
|
| 29 |
|
src/dto/crawler-options.ts
CHANGED
|
@@ -584,13 +584,6 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 584 |
if (respondTiming) {
|
| 585 |
instance.respondTiming ??= respondTiming as RESPOND_TIMING;
|
| 586 |
}
|
| 587 |
-
if (instance.timeout) {
|
| 588 |
-
instance.respondTiming ??= RESPOND_TIMING.NETWORK_IDLE;
|
| 589 |
-
}
|
| 590 |
-
if (instance.respondWith.includes('shot') || instance.respondWith.includes('vlm')) {
|
| 591 |
-
instance.respondTiming ??= RESPOND_TIMING.MEDIA_IDLE;
|
| 592 |
-
}
|
| 593 |
-
instance.respondTiming ??= RESPOND_TIMING.RESOURCE_IDLE;
|
| 594 |
|
| 595 |
if (instance.cacheTolerance) {
|
| 596 |
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
|
@@ -603,14 +596,29 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 603 |
return instance;
|
| 604 |
}
|
| 605 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 606 |
isSnapshotAcceptableForEarlyResponse(snapshot: PageSnapshot) {
|
| 607 |
if (this.waitForSelector?.length) {
|
| 608 |
return false;
|
| 609 |
}
|
| 610 |
-
if (this.respondTiming === RESPOND_TIMING.HTML && snapshot.html) {
|
|
|
|
| 611 |
return true;
|
| 612 |
}
|
| 613 |
-
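if (this.respondTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) {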
|
| 614 |
const now = Date.now();
|
| 615 |
if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
|
| 616 |
return true;
|
|
@@ -622,7 +630,7 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 622 |
if ((this.respondWith.includes('vlm') || this.respondWith.includes('screenshot')) && !snapshot.screenshot) {
|
| 623 |
return false;
|
| 624 |
}
|
| 625 |
-
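if (this.respondTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded && snapshot.lastMutationIdle) {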
|
| 626 |
const now = Date.now();
|
| 627 |
if ((snapshot.lastContentResourceLoaded + 500) < now) {
|
| 628 |
return true;
|
|
@@ -632,10 +640,10 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 632 |
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
| 633 |
return false;
|
| 634 |
}
|
| 635 |
-
if (this.respondTiming === RESPOND_TIMING.NETWORK_IDLE) {
|
| 636 |
return false;
|
| 637 |
}
|
| 638 |
-
if (this.respondTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) {
|
| 639 |
return true;
|
| 640 |
}
|
| 641 |
if (this.respondWith.includes('lm')) {
|
|
|
|
| 584 |
if (respondTiming) {
|
| 585 |
instance.respondTiming ??= respondTiming as RESPOND_TIMING;
|
| 586 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 587 |
|
| 588 |
if (instance.cacheTolerance) {
|
| 589 |
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
|
|
|
| 596 |
return instance;
|
| 597 |
}
|
| 598 |
|
| 599 |
+
get presumedRespondTiming() {
|
| 600 |
+
if (this.respondTiming) {
|
| 601 |
+
return this.respondTiming;
|
| 602 |
+
}
|
| 603 |
+
if (this.timeout) {
|
| 604 |
+
return RESPOND_TIMING.NETWORK_IDLE;
|
| 605 |
+
}
|
| 606 |
+
if (this.respondWith.includes('shot') || this.respondWith.includes('vlm')) {
|
| 607 |
+
return RESPOND_TIMING.MEDIA_IDLE;
|
| 608 |
+
}
|
| 609 |
+
|
| 610 |
+
return RESPOND_TIMING.RESOURCE_IDLE;
|
| 611 |
+
}
|
| 612 |
+
|
| 613 |
isSnapshotAcceptableForEarlyResponse(snapshot: PageSnapshot) {
|
| 614 |
if (this.waitForSelector?.length) {
|
| 615 |
return false;
|
| 616 |
}
|
| 617 |
+
const presumedTiming = this.presumedRespondTiming;
|
| 618 |
+
if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
|
| 619 |
return true;
|
| 620 |
}
|
| 621 |
+
if (presumedTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) {
|
| 622 |
const now = Date.now();
|
| 623 |
if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
|
| 624 |
return true;
|
|
|
|
| 630 |
if ((this.respondWith.includes('vlm') || this.respondWith.includes('screenshot')) && !snapshot.screenshot) {
|
| 631 |
return false;
|
| 632 |
}
|
| 633 |
+
if (presumedTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded && snapshot.lastMutationIdle) {
|
| 634 |
const now = Date.now();
|
| 635 |
if ((snapshot.lastContentResourceLoaded + 500) < now) {
|
| 636 |
return true;
|
|
|
|
| 640 |
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
| 641 |
return false;
|
| 642 |
}
|
| 643 |
+
if (presumedTiming === RESPOND_TIMING.NETWORK_IDLE) {
|
| 644 |
return false;
|
| 645 |
}
|
| 646 |
+
if (presumedTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) {
|
| 647 |
return true;
|
| 648 |
}
|
| 649 |
if (this.respondWith.includes('lm')) {
|
src/services/puppeteer.ts
CHANGED
|
@@ -55,6 +55,7 @@ export interface PageSnapshot {
|
|
| 55 |
href: string;
|
| 56 |
rebase?: string;
|
| 57 |
html: string;
|
|
|
|
| 58 |
shadowExpanded?: string;
|
| 59 |
text: string;
|
| 60 |
status?: number;
|
|
@@ -377,9 +378,11 @@ function shadowDomPresent(rootElement = document.documentElement) {
|
|
| 377 |
}
|
| 378 |
|
| 379 |
let lastMutationIdle = 0;
|
|
|
|
| 380 |
document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
|
| 381 |
|
| 382 |
function giveSnapshot(stopActiveSnapshot) {
|
|
|
|
| 383 |
if (stopActiveSnapshot) {
|
| 384 |
window.haltSnapshot = true;
|
| 385 |
}
|
|
@@ -395,6 +398,7 @@ function giveSnapshot(stopActiveSnapshot) {
|
|
| 395 |
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
|
| 396 |
href: document.location.href,
|
| 397 |
html: document.documentElement?.outerHTML,
|
|
|
|
| 398 |
text: document.body?.innerText,
|
| 399 |
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
|
| 400 |
parsed: parsed,
|
|
@@ -403,6 +407,9 @@ function giveSnapshot(stopActiveSnapshot) {
|
|
| 403 |
elemCount: domAnalysis.elementCount,
|
| 404 |
lastMutationIdle,
|
| 405 |
};
|
|
|
|
|
|
|
|
|
|
| 406 |
if (document.baseURI !== r.href) {
|
| 407 |
r.rebase = document.baseURI;
|
| 408 |
}
|
|
|
|
| 55 |
href: string;
|
| 56 |
rebase?: string;
|
| 57 |
html: string;
|
| 58 |
+
htmlModifiedByJs?: boolean;
|
| 59 |
shadowExpanded?: string;
|
| 60 |
text: string;
|
| 61 |
status?: number;
|
|
|
|
| 378 |
}
|
| 379 |
|
| 380 |
let lastMutationIdle = 0;
|
| 381 |
+
let initialHTML;
|
| 382 |
document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
|
| 383 |
|
| 384 |
function giveSnapshot(stopActiveSnapshot) {
|
| 385 |
+
initialHTML ??= document.documentElement?.outerHTML;
|
| 386 |
if (stopActiveSnapshot) {
|
| 387 |
window.haltSnapshot = true;
|
| 388 |
}
|
|
|
|
| 398 |
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
|
| 399 |
href: document.location.href,
|
| 400 |
html: document.documentElement?.outerHTML,
|
| 401 |
+
htmlModifiedByJs: false,
|
| 402 |
text: document.body?.innerText,
|
| 403 |
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
|
| 404 |
parsed: parsed,
|
|
|
|
| 407 |
elemCount: domAnalysis.elementCount,
|
| 408 |
lastMutationIdle,
|
| 409 |
};
|
| 410 |
+
if (initialHTML) {
|
| 411 |
+
r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded;
|
| 412 |
+
}
|
| 413 |
if (document.baseURI !== r.href) {
|
| 414 |
r.rebase = document.baseURI;
|
| 415 |
}
|