Spaces:
Build error
Build error
fix: bring back content based respond timing
Browse files
- src/api/crawler.ts +10 -7
- src/dto/crawler-options.ts +11 -6
src/api/crawler.ts
CHANGED
|
@@ -793,7 +793,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 793 |
if (!sideLoaded.file) {
|
| 794 |
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
|
| 795 |
}
|
| 796 |
-
|
| 797 |
urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName
|
| 798 |
).catch((err) => {
|
| 799 |
if (err instanceof ApplicationError) {
|
|
@@ -809,6 +809,9 @@ export class CrawlerHost extends RPCHost {
|
|
| 809 |
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
|
| 810 |
draftSnapshot.title ??= analyzed.title;
|
| 811 |
draftSnapshot.isIntermediate = true;
|
|
|
|
|
|
|
|
|
|
| 812 |
let fallbackProxyIsUsed = false;
|
| 813 |
if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
|
| 814 |
(analyzed.tokens < 42 || sideLoaded.status !== 200)
|
|
@@ -825,19 +828,19 @@ export class CrawlerHost extends RPCHost {
|
|
| 825 |
}
|
| 826 |
return Promise.reject(err);
|
| 827 |
});
|
|
|
|
|
|
|
| 828 |
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
|
| 829 |
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
|
| 830 |
-
|
| 831 |
-
|
|
|
|
|
|
|
| 832 |
sideLoaded = proxyLoaded;
|
| 833 |
fallbackProxyIsUsed = true;
|
| 834 |
}
|
| 835 |
}
|
| 836 |
|
| 837 |
-
if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
|
| 838 |
-
yield draftSnapshot;
|
| 839 |
-
}
|
| 840 |
-
|
| 841 |
if (crawlOpts && (sideLoaded.status === 200 || analyzed.tokens >= 200 || crawlOpts.allocProxy)) {
|
| 842 |
this.logger.info(`Side load seems to work, applying to crawler.`, { url: urlToCrawl.href });
|
| 843 |
crawlOpts.sideLoad ??= sideLoaded.sideLoadOpts;
|
|
|
|
| 793 |
if (!sideLoaded.file) {
|
| 794 |
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
|
| 795 |
}
|
| 796 |
+
const draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(
|
| 797 |
urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName
|
| 798 |
).catch((err) => {
|
| 799 |
if (err instanceof ApplicationError) {
|
|
|
|
| 809 |
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
|
| 810 |
draftSnapshot.title ??= analyzed.title;
|
| 811 |
draftSnapshot.isIntermediate = true;
|
| 812 |
+
if (crawlerOpts?.browserIsNotRequired()) {
|
| 813 |
+
yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
|
| 814 |
+
}
|
| 815 |
let fallbackProxyIsUsed = false;
|
| 816 |
if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
|
| 817 |
(analyzed.tokens < 42 || sideLoaded.status !== 200)
|
|
|
|
| 828 |
}
|
| 829 |
return Promise.reject(err);
|
| 830 |
});
|
| 831 |
+
if (proxyLoaded.status === 200 && crawlerOpts?.browserIsNotRequired()) {
|
| 832 |
+
}
|
| 833 |
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
|
| 834 |
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
|
| 835 |
+
proxySnapshot.isIntermediate = true;
|
| 836 |
+
if (crawlerOpts?.browserIsNotRequired()) {
|
| 837 |
+
yield this.jsdomControl.narrowSnapshot(proxySnapshot, crawlOpts);
|
| 838 |
+
}
|
| 839 |
sideLoaded = proxyLoaded;
|
| 840 |
fallbackProxyIsUsed = true;
|
| 841 |
}
|
| 842 |
}
|
| 843 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 844 |
if (crawlOpts && (sideLoaded.status === 200 || analyzed.tokens >= 200 || crawlOpts.allocProxy)) {
|
| 845 |
this.logger.info(`Side load seems to work, applying to crawler.`, { url: urlToCrawl.href });
|
| 846 |
crawlOpts.sideLoad ??= sideLoaded.sideLoadOpts;
|
src/dto/crawler-options.ts
CHANGED
|
@@ -25,6 +25,7 @@ export enum ENGINE_TYPE {
|
|
| 25 |
|
| 26 |
export enum RESPOND_TIMING {
|
| 27 |
HTML = 'html',
|
|
|
|
| 28 |
MUTATION_IDLE = 'mutation-idle',
|
| 29 |
RESOURCE_IDLE = 'resource-idle',
|
| 30 |
MEDIA_IDLE = 'media-idle',
|
|
@@ -222,11 +223,12 @@ class Viewport extends AutoCastable {
|
|
| 222 |
},
|
| 223 |
'X-Respond-Timing': {
|
| 224 |
description: `Explicitly specify the respond timing. One of the following:\n\n` +
|
| 225 |
-
`- html:
|
|
|
|
| 226 |
`- mutation-idle: wait for DOM mutations to settle and remain unchanged for at least 0.2s\n` +
|
| 227 |
-
`- resource-idle: wait for no additional resources that would affect page logic and content SUCCEEDED loading
|
| 228 |
-
`- media-idle: wait for no additional resources, including media resources, SUCCEEDED loading
|
| 229 |
-
`- network-idle: wait for full load of webpage, as
|
| 230 |
in: 'header',
|
| 231 |
schema: { type: 'string' }
|
| 232 |
},
|
|
@@ -600,7 +602,7 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 600 |
if (this.respondTiming) {
|
| 601 |
return this.respondTiming;
|
| 602 |
}
|
| 603 |
-
if (this.timeout) {
|
| 604 |
return RESPOND_TIMING.NETWORK_IDLE;
|
| 605 |
}
|
| 606 |
if (this.respondWith.includes('shot') || this.respondWith.includes('vlm')) {
|
|
@@ -636,6 +638,9 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 636 |
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
| 637 |
return false;
|
| 638 |
}
|
|
|
|
|
|
|
|
|
|
| 639 |
if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
|
| 640 |
return true;
|
| 641 |
}
|
|
@@ -677,7 +682,7 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 677 |
}
|
| 678 |
|
| 679 |
browserIsNotRequired() {
|
| 680 |
-
if (this.respondTiming &&
|
| 681 |
return false;
|
| 682 |
}
|
| 683 |
if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
|
|
|
|
| 25 |
|
| 26 |
export enum RESPOND_TIMING {
|
| 27 |
HTML = 'html',
|
| 28 |
+
VISIBLE_CONTENT = 'visible-content',
|
| 29 |
MUTATION_IDLE = 'mutation-idle',
|
| 30 |
RESOURCE_IDLE = 'resource-idle',
|
| 31 |
MEDIA_IDLE = 'media-idle',
|
|
|
|
| 223 |
},
|
| 224 |
'X-Respond-Timing': {
|
| 225 |
description: `Explicitly specify the respond timing. One of the following:\n\n` +
|
| 226 |
+
`- html: directly return unrendered HTML\n` +
|
| 227 |
+
`- visible-content: return immediately when any content becomes available\n` +
|
| 228 |
`- mutation-idle: wait for DOM mutations to settle and remain unchanged for at least 0.2s\n` +
|
| 229 |
+
`- resource-idle: wait for no additional resources that would affect page logic and content has SUCCEEDED loading in 0.5s\n` +
|
| 230 |
+
`- media-idle: wait for no additional resources, including media resources, has SUCCEEDED loading in 0.5s\n` +
|
| 231 |
+
`- network-idle: wait for full load of webpage, also known as networkidle0.\n\n`,
|
| 232 |
in: 'header',
|
| 233 |
schema: { type: 'string' }
|
| 234 |
},
|
|
|
|
| 602 |
if (this.respondTiming) {
|
| 603 |
return this.respondTiming;
|
| 604 |
}
|
| 605 |
+
if (this.timeout && this.timeout >= 20) {
|
| 606 |
return RESPOND_TIMING.NETWORK_IDLE;
|
| 607 |
}
|
| 608 |
if (this.respondWith.includes('shot') || this.respondWith.includes('vlm')) {
|
|
|
|
| 638 |
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
| 639 |
return false;
|
| 640 |
}
|
| 641 |
+
if (presumedTiming === RESPOND_TIMING.VISIBLE_CONTENT && snapshot.parsed?.content) {
|
| 642 |
+
return true;
|
| 643 |
+
}
|
| 644 |
if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
|
| 645 |
return true;
|
| 646 |
}
|
|
|
|
| 682 |
}
|
| 683 |
|
| 684 |
browserIsNotRequired() {
|
| 685 |
+
if (this.respondTiming && ![RESPOND_TIMING.HTML, RESPOND_TIMING.VISIBLE_CONTENT].includes(this.respondTiming)) {
|
| 686 |
return false;
|
| 687 |
}
|
| 688 |
if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
|