Spaces:
Build error
Build error
fix: revert screenshot behavior and introduce pageshot
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -45,6 +45,8 @@ export interface FormattedPage {
|
|
| 45 |
text?: string;
|
| 46 |
screenshotUrl?: string;
|
| 47 |
screenshot?: Buffer;
|
|
|
|
|
|
|
| 48 |
links?: { [k: string]: string; };
|
| 49 |
images?: { [k: string]: string; };
|
| 50 |
|
|
@@ -282,8 +284,9 @@ export class CrawlerHost extends RPCHost {
|
|
| 282 |
return mixin;
|
| 283 |
}
|
| 284 |
|
| 285 |
-
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
|
| 286 |
screenshotUrl?: string;
|
|
|
|
| 287 |
}, nominalUrl?: URL) {
|
| 288 |
if (mode === 'screenshot') {
|
| 289 |
if (snapshot.screenshot && !snapshot.screenshotUrl) {
|
|
@@ -305,6 +308,26 @@ export class CrawlerHost extends RPCHost {
|
|
| 305 |
}
|
| 306 |
} as FormattedPage;
|
| 307 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
if (mode === 'html') {
|
| 309 |
return {
|
| 310 |
...this.getGeneralSnapshotMixins(snapshot),
|
|
@@ -761,6 +784,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 761 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
| 762 |
);
|
| 763 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 764 |
|
| 765 |
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
| 766 |
}
|
|
@@ -778,6 +807,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 778 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
| 779 |
);
|
| 780 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 781 |
|
| 782 |
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
| 783 |
}
|
|
@@ -810,6 +845,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 810 |
|
| 811 |
let snapshot: PageSnapshot | undefined;
|
| 812 |
let screenshotUrl: string | undefined;
|
|
|
|
| 813 |
const preparations = [
|
| 814 |
this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => {
|
| 815 |
snapshot = JSON.parse(r.toString('utf-8'));
|
|
@@ -818,6 +854,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 818 |
this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
|
| 819 |
screenshotUrl = r;
|
| 820 |
}) :
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 821 |
Promise.resolve(undefined)
|
| 822 |
];
|
| 823 |
try {
|
|
@@ -833,8 +874,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 833 |
snapshot: {
|
| 834 |
...snapshot,
|
| 835 |
screenshot: undefined,
|
|
|
|
| 836 |
screenshotUrl,
|
| 837 |
-
|
|
|
|
| 838 |
};
|
| 839 |
}
|
| 840 |
|
|
@@ -878,6 +921,14 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 878 |
});
|
| 879 |
cache.screenshotAvailable = true;
|
| 880 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 881 |
await savingOfSnapshot;
|
| 882 |
const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
|
| 883 |
this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
|
|
@@ -1013,7 +1064,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 1013 |
const crawlOpts: ExtraScrappingOptions = {
|
| 1014 |
proxyUrl: opts.proxyUrl,
|
| 1015 |
cookies: opts.setCookies,
|
| 1016 |
-
favorScreenshot: opts.respondWith === 'screenshot',
|
| 1017 |
removeSelector: opts.removeSelector,
|
| 1018 |
targetSelector: opts.targetSelector,
|
| 1019 |
waitForSelector: opts.waitForSelector,
|
|
|
|
| 45 |
text?: string;
|
| 46 |
screenshotUrl?: string;
|
| 47 |
screenshot?: Buffer;
|
| 48 |
+
pageshotUrl?: string;
|
| 49 |
+
pageshot?: Buffer;
|
| 50 |
links?: { [k: string]: string; };
|
| 51 |
images?: { [k: string]: string; };
|
| 52 |
|
|
|
|
| 284 |
return mixin;
|
| 285 |
}
|
| 286 |
|
| 287 |
+
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & {
|
| 288 |
screenshotUrl?: string;
|
| 289 |
+
pageshotUrl?: string;
|
| 290 |
}, nominalUrl?: URL) {
|
| 291 |
if (mode === 'screenshot') {
|
| 292 |
if (snapshot.screenshot && !snapshot.screenshotUrl) {
|
|
|
|
| 308 |
}
|
| 309 |
} as FormattedPage;
|
| 310 |
}
|
| 311 |
+
if (mode === 'pageshot') {
|
| 312 |
+
if (snapshot.pageshot && !snapshot.pageshotUrl) {
|
| 313 |
+
const fid = `instant-screenshots/${randomUUID()}`;
|
| 314 |
+
await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, {
|
| 315 |
+
metadata: {
|
| 316 |
+
contentType: 'image/png',
|
| 317 |
+
}
|
| 318 |
+
});
|
| 319 |
+
snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
|
| 320 |
+
}
|
| 321 |
+
|
| 322 |
+
return {
|
| 323 |
+
...this.getGeneralSnapshotMixins(snapshot),
|
| 324 |
+
html: snapshot.html,
|
| 325 |
+
pageshotUrl: snapshot.pageshotUrl,
|
| 326 |
+
toString() {
|
| 327 |
+
return this.pageshotUrl;
|
| 328 |
+
}
|
| 329 |
+
} as FormattedPage;
|
| 330 |
+
}
|
| 331 |
if (mode === 'html') {
|
| 332 |
return {
|
| 333 |
...this.getGeneralSnapshotMixins(snapshot),
|
|
|
|
| 784 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
| 785 |
);
|
| 786 |
}
|
| 787 |
+
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
| 788 |
+
|
| 789 |
+
return assignTransferProtocolMeta(`${formatted}`,
|
| 790 |
+
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
| 791 |
+
);
|
| 792 |
+
}
|
| 793 |
|
| 794 |
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
| 795 |
}
|
|
|
|
| 807 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
| 808 |
);
|
| 809 |
}
|
| 810 |
+
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
| 811 |
+
|
| 812 |
+
return assignTransferProtocolMeta(`${formatted}`,
|
| 813 |
+
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
| 814 |
+
);
|
| 815 |
+
}
|
| 816 |
|
| 817 |
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
| 818 |
}
|
|
|
|
| 845 |
|
| 846 |
let snapshot: PageSnapshot | undefined;
|
| 847 |
let screenshotUrl: string | undefined;
|
| 848 |
+
let pageshotUrl: string | undefined;
|
| 849 |
const preparations = [
|
| 850 |
this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => {
|
| 851 |
snapshot = JSON.parse(r.toString('utf-8'));
|
|
|
|
| 854 |
this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
|
| 855 |
screenshotUrl = r;
|
| 856 |
}) :
|
| 857 |
+
Promise.resolve(undefined),
|
| 858 |
+
cache.pageshotAvailable ?
|
| 859 |
+
this.firebaseObjectStorage.signDownloadUrl(`pageshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
|
| 860 |
+
pageshotUrl = r;
|
| 861 |
+
}) :
|
| 862 |
Promise.resolve(undefined)
|
| 863 |
];
|
| 864 |
try {
|
|
|
|
| 874 |
snapshot: {
|
| 875 |
...snapshot,
|
| 876 |
screenshot: undefined,
|
| 877 |
+
pageshot: undefined,
|
| 878 |
screenshotUrl,
|
| 879 |
+
pageshotUrl,
|
| 880 |
+
} as PageSnapshot & { screenshotUrl?: string; pageshotUrl?: string; }
|
| 881 |
};
|
| 882 |
}
|
| 883 |
|
|
|
|
| 921 |
});
|
| 922 |
cache.screenshotAvailable = true;
|
| 923 |
}
|
| 924 |
+
if (snapshot.pageshot) {
|
| 925 |
+
await this.firebaseObjectStorage.saveFile(`pageshots/${cache._id}`, snapshot.pageshot, {
|
| 926 |
+
metadata: {
|
| 927 |
+
contentType: 'image/png',
|
| 928 |
+
}
|
| 929 |
+
});
|
| 930 |
+
cache.pageshotAvailable = true;
|
| 931 |
+
}
|
| 932 |
await savingOfSnapshot;
|
| 933 |
const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
|
| 934 |
this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
|
|
|
|
| 1064 |
const crawlOpts: ExtraScrappingOptions = {
|
| 1065 |
proxyUrl: opts.proxyUrl,
|
| 1066 |
cookies: opts.setCookies,
|
| 1067 |
+
favorScreenshot: ['screenshot', 'pageshot'].includes(opts.respondWith),
|
| 1068 |
removeSelector: opts.removeSelector,
|
| 1069 |
targetSelector: opts.targetSelector,
|
| 1070 |
waitForSelector: opts.waitForSelector,
|
backend/functions/src/db/crawled.ts
CHANGED
|
@@ -22,11 +22,14 @@ export class Crawled extends FirestoreRecord {
|
|
| 22 |
urlPathDigest!: string;
|
| 23 |
|
| 24 |
@Prop()
|
| 25 |
-
snapshot?: PageSnapshot & { screenshot: never; };
|
| 26 |
|
| 27 |
@Prop()
|
| 28 |
screenshotAvailable?: boolean;
|
| 29 |
|
|
|
|
|
|
|
|
|
|
| 30 |
@Prop()
|
| 31 |
snapshotAvailable?: boolean;
|
| 32 |
|
|
|
|
| 22 |
urlPathDigest!: string;
|
| 23 |
|
| 24 |
@Prop()
|
| 25 |
+
snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };
|
| 26 |
|
| 27 |
@Prop()
|
| 28 |
screenshotAvailable?: boolean;
|
| 29 |
|
| 30 |
+
@Prop()
|
| 31 |
+
pageshotAvailable?: boolean;
|
| 32 |
+
|
| 33 |
@Prop()
|
| 34 |
snapshotAvailable?: boolean;
|
| 35 |
|
backend/functions/src/dto/scrapping-options.ts
CHANGED
|
@@ -34,6 +34,7 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
|
| 34 |
`- markdown\n` +
|
| 35 |
`- html\n` +
|
| 36 |
`- text\n` +
|
|
|
|
| 37 |
`- screenshot\n`
|
| 38 |
,
|
| 39 |
in: 'header',
|
|
|
|
| 34 |
`- markdown\n` +
|
| 35 |
`- html\n` +
|
| 36 |
`- text\n` +
|
| 37 |
+
`- pageshot\n` +
|
| 38 |
`- screenshot\n`
|
| 39 |
,
|
| 40 |
in: 'header',
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -46,6 +46,7 @@ export interface PageSnapshot {
|
|
| 46 |
text: string;
|
| 47 |
parsed?: Partial<ReadabilityParsed> | null;
|
| 48 |
screenshot?: Buffer;
|
|
|
|
| 49 |
imgs?: ImgBrief[];
|
| 50 |
pdfs?: string[];
|
| 51 |
maxElemDepth?: number;
|
|
@@ -448,6 +449,7 @@ document.addEventListener('load', handlePageLoad);
|
|
| 448 |
|
| 449 |
let snapshot: PageSnapshot | undefined;
|
| 450 |
let screenshot: Buffer | undefined;
|
|
|
|
| 451 |
const page = await this.getNextPage();
|
| 452 |
const sn = this.snMap.get(page);
|
| 453 |
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
|
@@ -524,7 +526,7 @@ document.addEventListener('load', handlePageLoad);
|
|
| 524 |
try {
|
| 525 |
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 526 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 527 |
-
screenshot = await page.screenshot({ fullPage: true });
|
| 528 |
if (snapshot) {
|
| 529 |
snapshot.childFrames = await pSubFrameSnapshots;
|
| 530 |
}
|
|
@@ -547,7 +549,8 @@ document.addEventListener('load', handlePageLoad);
|
|
| 547 |
if (salvaged) {
|
| 548 |
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 549 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 550 |
-
screenshot = await page.screenshot({ fullPage: true });
|
|
|
|
| 551 |
if (snapshot) {
|
| 552 |
snapshot.childFrames = await pSubFrameSnapshots;
|
| 553 |
}
|
|
@@ -562,7 +565,7 @@ document.addEventListener('load', handlePageLoad);
|
|
| 562 |
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
| 563 |
this.emit(
|
| 564 |
'crawled',
|
| 565 |
-
{ ...snapshot, screenshot },
|
| 566 |
{ ...options, url: parsedUrl }
|
| 567 |
);
|
| 568 |
}
|
|
@@ -581,7 +584,8 @@ document.addEventListener('load', handlePageLoad);
|
|
| 581 |
.then(async () => {
|
| 582 |
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 583 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 584 |
-
screenshot = await page.screenshot({ fullPage: true });
|
|
|
|
| 585 |
if (snapshot) {
|
| 586 |
snapshot.childFrames = await pSubFrameSnapshots;
|
| 587 |
}
|
|
@@ -614,15 +618,16 @@ document.addEventListener('load', handlePageLoad);
|
|
| 614 |
}
|
| 615 |
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
| 616 |
}
|
| 617 |
-
yield { ...snapshot, screenshot } as PageSnapshot;
|
| 618 |
break;
|
| 619 |
}
|
| 620 |
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
| 621 |
-
screenshot = await page.screenshot({ fullPage: true });
|
|
|
|
| 622 |
lastHTML = snapshot.html;
|
| 623 |
}
|
| 624 |
if (snapshot || screenshot) {
|
| 625 |
-
yield { ...snapshot, screenshot } as PageSnapshot;
|
| 626 |
}
|
| 627 |
if (error) {
|
| 628 |
throw error;
|
|
|
|
| 46 |
text: string;
|
| 47 |
parsed?: Partial<ReadabilityParsed> | null;
|
| 48 |
screenshot?: Buffer;
|
| 49 |
+
pageshot?: Buffer;
|
| 50 |
imgs?: ImgBrief[];
|
| 51 |
pdfs?: string[];
|
| 52 |
maxElemDepth?: number;
|
|
|
|
| 449 |
|
| 450 |
let snapshot: PageSnapshot | undefined;
|
| 451 |
let screenshot: Buffer | undefined;
|
| 452 |
+
let pageshot: Buffer | undefined;
|
| 453 |
const page = await this.getNextPage();
|
| 454 |
const sn = this.snMap.get(page);
|
| 455 |
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
|
|
|
| 526 |
try {
|
| 527 |
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 528 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 529 |
+
screenshot = await page.screenshot();
|
| 530 |
if (snapshot) {
|
| 531 |
snapshot.childFrames = await pSubFrameSnapshots;
|
| 532 |
}
|
|
|
|
| 549 |
if (salvaged) {
|
| 550 |
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 551 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 552 |
+
screenshot = await page.screenshot();
|
| 553 |
+
pageshot = await page.screenshot({ fullPage: true });
|
| 554 |
if (snapshot) {
|
| 555 |
snapshot.childFrames = await pSubFrameSnapshots;
|
| 556 |
}
|
|
|
|
| 565 |
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
| 566 |
this.emit(
|
| 567 |
'crawled',
|
| 568 |
+
{ ...snapshot, screenshot, pageshot },
|
| 569 |
{ ...options, url: parsedUrl }
|
| 570 |
);
|
| 571 |
}
|
|
|
|
| 584 |
.then(async () => {
|
| 585 |
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 586 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 587 |
+
screenshot = await page.screenshot();
|
| 588 |
+
pageshot = await page.screenshot({ fullPage: true });
|
| 589 |
if (snapshot) {
|
| 590 |
snapshot.childFrames = await pSubFrameSnapshots;
|
| 591 |
}
|
|
|
|
| 618 |
}
|
| 619 |
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
| 620 |
}
|
| 621 |
+
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
|
| 622 |
break;
|
| 623 |
}
|
| 624 |
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
| 625 |
+
screenshot = await page.screenshot();
|
| 626 |
+
pageshot = await page.screenshot({ fullPage: true });
|
| 627 |
lastHTML = snapshot.html;
|
| 628 |
}
|
| 629 |
if (snapshot || screenshot) {
|
| 630 |
+
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
|
| 631 |
}
|
| 632 |
if (error) {
|
| 633 |
throw error;
|