Spaces:
Build error
Build error
fix
Browse files
- backend/functions/firebase-export-1712748362961bSfwZx/firestore_export/firestore_export.overall_export_metadata +0 -0
- backend/functions/package.json +1 -1
- backend/functions/src/cloud-functions/crawler.ts +98 -25
- backend/functions/src/db/crawled.ts +59 -0
- backend/functions/src/index.ts +14 -15
- backend/functions/src/services/puppeteer.ts +131 -33
backend/functions/firebase-export-1712748362961bSfwZx/firestore_export/firestore_export.overall_export_metadata
DELETED
|
Binary file (15 Bytes)
|
|
|
backend/functions/package.json
CHANGED
|
@@ -32,7 +32,7 @@
|
|
| 32 |
"archiver": "^6.0.1",
|
| 33 |
"axios": "^1.3.3",
|
| 34 |
"bcrypt": "^5.1.0",
|
| 35 |
-
"civkit": "^0.6.5-
|
| 36 |
"cors": "^2.8.5",
|
| 37 |
"dayjs": "^1.11.9",
|
| 38 |
"express": "^4.19.2",
|
|
|
|
| 32 |
"archiver": "^6.0.1",
|
| 33 |
"axios": "^1.3.3",
|
| 34 |
"bcrypt": "^5.1.0",
|
| 35 |
+
"civkit": "^0.6.5-326469b",
|
| 36 |
"cors": "^2.8.5",
|
| 37 |
"dayjs": "^1.11.9",
|
| 38 |
"express": "^4.19.2",
|
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -1,9 +1,10 @@
|
|
| 1 |
-
import { marshalErrorLike, RPCHost, RPCReflection } from 'civkit';
|
| 2 |
import { singleton } from 'tsyringe';
|
| 3 |
-
import { CloudHTTPv2, Logger, OutputServerEventStream,
|
| 4 |
import _ from 'lodash';
|
| 5 |
-
import { PuppeteerControl } from '../services/puppeteer';
|
| 6 |
import TurnDownService from 'turndown';
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
@singleton()
|
|
@@ -25,43 +26,115 @@ export class CrawlerHost extends RPCHost {
|
|
| 25 |
this.emit('ready');
|
| 26 |
}
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
@CloudHTTPv2({
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
| 30 |
httpMethod: ['get', 'post'],
|
| 31 |
-
returnType: OutputServerEventStream,
|
| 32 |
})
|
| 33 |
async crawl(
|
| 34 |
@RPCReflect() rpcReflect: RPCReflection,
|
| 35 |
-
@
|
|
|
|
|
|
|
|
|
|
| 36 |
) {
|
| 37 |
-
|
| 38 |
-
const
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
-
|
|
|
|
|
|
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
}
|
| 48 |
-
|
|
|
|
| 49 |
sseStream.write({
|
| 50 |
-
event: '
|
| 51 |
-
data:
|
| 52 |
});
|
| 53 |
}
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
data: err,
|
| 59 |
-
});
|
| 60 |
}
|
| 61 |
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
-
|
| 65 |
}
|
| 66 |
|
| 67 |
|
|
|
|
| 1 |
+
import { assignTransferProtocolMeta, marshalErrorLike, RPCHost, RPCReflection } from 'civkit';
|
| 2 |
import { singleton } from 'tsyringe';
|
| 3 |
+
import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
| 4 |
import _ from 'lodash';
|
| 5 |
+
import { PageSnapshot, PuppeteerControl } from '../services/puppeteer';
|
| 6 |
import TurnDownService from 'turndown';
|
| 7 |
+
import { Request, Response } from 'express';
|
| 8 |
|
| 9 |
|
| 10 |
@singleton()
|
|
|
|
| 26 |
this.emit('ready');
|
| 27 |
}
|
| 28 |
|
| 29 |
+
/**
 * Renders a page snapshot as the text document handed back to clients:
 * a title line, the source URL, then the page body as markdown.
 *
 * @param snapshot - Snapshot of the crawled page.
 * @returns The formatted text block.
 */
formatSnapshot(snapshot: PageSnapshot) {
    // Prefer the extracted article HTML (converted to markdown through turndown);
    // fall back to the page's plain text when no article content is available.
    const readableHtml = snapshot.parsed?.content;
    let body: string;
    if (readableHtml) {
        body = this.turnDownService.turndown(readableHtml);
    } else {
        body = snapshot.text;
    }

    const heading = (snapshot.parsed?.title || snapshot.title || '').trim();
    const source = snapshot.href.trim();

    // The template below is emitted verbatim, which is why it stays flush-left.
    return `Title: ${heading}

URL Source: ${source}

Markdown Content:
${body.trim()}
`;
}
|
| 44 |
+
|
| 45 |
/**
 * HTTP entry point: crawls the URL embedded in the request path and returns
 * its content in a format chosen from the request's `Accept` header.
 *
 * The request path is read as `/<host>/<path...>`. The target URL is rebuilt
 * from it using the inbound request's protocol, and the inbound query string
 * is carried over to the target.
 *
 * Response selection:
 *  - client does not accept `text/plain` but accepts `text/event-stream`:
 *    a server-sent event stream with a `data` event per snapshot (preceded by
 *    a `screenshot` event when one is available) and an `error` event on failure;
 *  - client does not accept `text/plain` but accepts `text/json` or
 *    `application/json`: the first snapshot with readable content, either as
 *    the formatted string or, when a screenshot exists, as an
 *    `[image_url, text]` pair;
 *  - otherwise: the first snapshot with readable content, as plain text.
 *
 * @param rpcReflect - RPC reflection handle, used to hand the SSE stream back early.
 * @param ctx - The express request/response pair of the current call.
 * @throws Error when the path carries no target host, or when no snapshot
 *         with readable content was produced.
 */
@CloudHTTPv2({
    runtime: {
        memory: '4GiB',
        timeoutSeconds: 540,
    },
    httpMethod: ['get', 'post'],
    returnType: [String, OutputServerEventStream],
})
async crawl(
    @RPCReflect() rpcReflect: RPCReflection,
    @Ctx() ctx: {
        req: Request,
        res: Response,
    },
) {
    const url = new URL(ctx.req.url, `${ctx.req.protocol}://${ctx.req.headers.host}`);
    const rawPath = url.pathname.split('/').filter(Boolean);
    const host = rawPath.shift();
    if (!host) {
        // Without this guard an empty path would be crawled as `<protocol>://undefined/`.
        throw new Error('No URL to crawl: expected the target as /<host>/<path>');
    }
    const urlToCrawl = new URL(`${ctx.req.protocol}://${host}/${rawPath.join('/')}`);
    urlToCrawl.search = url.search;
    const target = urlToCrawl.toString();

    if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
        const sseStream = new OutputServerEventStream();
        // Hand the stream back right away so events are delivered while crawling continues.
        rpcReflect.return(sseStream);

        try {
            for await (const scrapped of this.puppeteerControl.scrap(target)) {
                if (!scrapped) {
                    continue;
                }

                const formatted = this.formatSnapshot(scrapped);

                if (scrapped.screenshot) {
                    sseStream.write({
                        event: 'screenshot',
                        data: scrapped.screenshot.toString('base64'),
                    });
                }

                sseStream.write({
                    event: 'data',
                    data: formatted,
                });
            }
        } catch (err: any) {
            // Log the crawl target, not the inbound request URL.
            this.logger.error(`Failed to crawl ${urlToCrawl}`, { err: marshalErrorLike(err) });
            sseStream.write({
                event: 'error',
                data: marshalErrorLike(err),
            });
        }

        sseStream.end();

        return sseStream;
    }

    if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
        for await (const scrapped of this.puppeteerControl.scrap(target)) {
            if (!scrapped?.parsed?.content) {
                continue;
            }

            const formatted = this.formatSnapshot(scrapped);

            if (scrapped.screenshot) {

                return [
                    {
                        type: 'image_url', image_url: {
                            url: `data:image/jpeg;base64,${scrapped.screenshot.toString('base64')}`,
                        }
                    },
                    { type: 'text', content: formatted },
                ];
            }

            return formatted;
        }
    }

    for await (const scrapped of this.puppeteerControl.scrap(target)) {
        if (!scrapped?.parsed?.content) {
            continue;
        }

        const formatted = this.formatSnapshot(scrapped);

        return assignTransferProtocolMeta(formatted, { contentType: 'text/plain', envelope: null });
    }

    // Reached when the scraper finished without any snapshot carrying readable content.
    throw new Error(`No readable content could be extracted from ${target}`);
}
|
| 139 |
|
| 140 |
|
backend/functions/src/db/crawled.ts
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { Also, parseJSONText, Prop } from 'civkit';
|
| 2 |
+
import { FirestoreRecord } from '../shared/lib/firestore';
|
| 3 |
+
import _ from 'lodash';
|
| 4 |
+
|
| 5 |
+
/**
 * Firestore record holding one crawl result for a URL.
 *
 * The fields listed in `patchedFields` are kept as JSON strings in Firestore:
 * `degradeForFireStore()` stringifies them before writing and `from()` parses
 * them back when a record is loaded.
 */
@Also({
    dictOf: Object
})
export class Crawled extends FirestoreRecord {
    static override collectionName = 'crawled';

    override _id!: string;

    /** URL that was crawled. */
    @Prop({
        required: true
    })
    url!: string;

    /** Digest used to look up records for the same URL. */
    @Prop({
        required: true
    })
    urlPathDigest!: string;

    /** Crawl result; an object in memory, a JSON string once degraded for Firestore. */
    @Prop()
    snapshot!: any;

    @Prop()
    createdAt!: Date;

    @Prop()
    expireAt!: Date;

    /** Names of the fields that are (de)serialized as JSON text around Firestore. */
    static patchedFields = [
        'snapshot'
    ];

    /**
     * Builds a record from raw input, parsing any patched field that arrives
     * as a JSON string.
     *
     * The caller's object is left untouched: fields are parsed on a shallow
     * copy, which is only created when something actually needs parsing.
     *
     * @param input - Raw record data, e.g. as read from Firestore.
     * @returns The constructed record.
     */
    static override from(input: any) {
        let patched = input;

        for (const field of this.patchedFields) {
            if (typeof input[field] === 'string') {
                if (patched === input) {
                    patched = { ...input };
                }
                patched[field] = parseJSONText(input[field]);
            }
        }

        return super.from(patched) as Crawled;
    }

    /**
     * Produces a shallow copy of this record in which every patched field
     * holding an object is replaced by its JSON string.
     *
     * @returns The copy to be written to Firestore.
     */
    override degradeForFireStore() {
        const copy: any = { ...this };

        for (const field of (this.constructor as typeof Crawled).patchedFields) {
            if (typeof copy[field] === 'object') {
                copy[field] = JSON.stringify(copy[field]) as any;
            }
        }

        return copy;
    }

    [k: string]: any;
}
|
backend/functions/src/index.ts
CHANGED
|
@@ -1,32 +1,31 @@
|
|
| 1 |
import 'reflect-metadata';
|
| 2 |
-
import * as functions from 'firebase-functions';
|
| 3 |
import { initializeApp } from 'firebase-admin/app';
|
| 4 |
initializeApp();
|
| 5 |
|
| 6 |
-
import secretExposer from './shared/services/secrets';
|
| 7 |
|
| 8 |
-
export const onUserCreated = functions
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
|
| 16 |
-
export const onUserLogin = functions
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
|
| 24 |
import { loadModulesDynamically, registry } from './shared';
|
| 25 |
import path from 'path';
|
| 26 |
loadModulesDynamically(path.resolve(__dirname, 'cloud-functions'));
|
| 27 |
|
|
|
|
| 28 |
Object.assign(exports, registry.exportGrouped({
|
| 29 |
-
memory: '
|
| 30 |
timeoutSeconds: 540,
|
| 31 |
}));
|
| 32 |
registry.title = 'url2text';
|
|
|
|
| 1 |
import 'reflect-metadata';
|
|
|
|
| 2 |
import { initializeApp } from 'firebase-admin/app';
|
| 3 |
initializeApp();
|
| 4 |
|
|
|
|
| 5 |
|
| 6 |
+
// export const onUserCreated = functions
|
| 7 |
+
// .runWith({ secrets: [...secretExposer.bundle], memory: '512MB' })
|
| 8 |
+
// .auth.user()
|
| 9 |
+
// .onCreate(async (user) => {
|
| 10 |
|
| 11 |
+
// return null;
|
| 12 |
+
// });
|
| 13 |
|
| 14 |
+
// export const onUserLogin = functions
|
| 15 |
+
// .runWith({ secrets: [...secretExposer.bundle], memory: '512MB' })
|
| 16 |
+
// .auth.user()
|
| 17 |
+
// .beforeSignIn(async (user, _ctx) => {
|
| 18 |
|
| 19 |
+
// return;
|
| 20 |
+
// });
|
| 21 |
|
| 22 |
import { loadModulesDynamically, registry } from './shared';
|
| 23 |
import path from 'path';
|
| 24 |
loadModulesDynamically(path.resolve(__dirname, 'cloud-functions'));
|
| 25 |
|
| 26 |
+
Object.assign(exports, registry.exportAll());
|
| 27 |
Object.assign(exports, registry.exportGrouped({
|
| 28 |
+
memory: '4GiB',
|
| 29 |
timeoutSeconds: 540,
|
| 30 |
}));
|
| 31 |
registry.title = 'url2text';
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -1,14 +1,36 @@
|
|
| 1 |
-
import { AsyncService, Defer } from 'civkit';
|
| 2 |
import { container, singleton } from 'tsyringe';
|
| 3 |
import puppeteer, { Browser } from 'puppeteer';
|
| 4 |
import { Logger } from '../shared/services/logger';
|
| 5 |
import genericPool from 'generic-pool';
|
| 6 |
import os from 'os';
|
| 7 |
import fs from 'fs';
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
@singleton()
|
| 13 |
export class PuppeteerControl extends AsyncService {
|
| 14 |
|
|
@@ -24,11 +46,14 @@ export class PuppeteerControl extends AsyncService {
|
|
| 24 |
await page.browserContext().close();
|
| 25 |
},
|
| 26 |
validate: async (page) => {
|
| 27 |
-
return
|
| 28 |
}
|
| 29 |
}, {
|
| 30 |
-
max: Math.
|
| 31 |
-
min:
|
|
|
|
|
|
|
|
|
|
| 32 |
});
|
| 33 |
|
| 34 |
constructor(protected globalLogger: Logger) {
|
|
@@ -39,7 +64,11 @@ export class PuppeteerControl extends AsyncService {
|
|
| 39 |
await this.dependencyReady();
|
| 40 |
|
| 41 |
if (this.browser) {
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
}
|
| 44 |
this.browser = await puppeteer.launch({
|
| 45 |
headless: true,
|
|
@@ -49,6 +78,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 49 |
this.logger.warn(`Browser disconnected`);
|
| 50 |
this.emit('crippled');
|
| 51 |
});
|
|
|
|
| 52 |
|
| 53 |
this.emit('ready');
|
| 54 |
}
|
|
@@ -58,26 +88,33 @@ export class PuppeteerControl extends AsyncService {
|
|
| 58 |
const dedicatedContext = await this.browser.createBrowserContext();
|
| 59 |
|
| 60 |
const page = await dedicatedContext.newPage();
|
| 61 |
-
|
| 62 |
-
await page.setViewport({ width: 1920, height: 1080 });
|
| 63 |
-
await page.exposeFunction('reportSnapshot', (snapshot: any) => {
|
| 64 |
-
page.emit('snapshot', snapshot);
|
| 65 |
-
});
|
| 66 |
-
|
| 67 |
-
await page.evaluateOnNewDocument(READABILITY_JS);
|
| 68 |
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
let aftershot: any;
|
| 75 |
const handlePageLoad = () => {
|
| 76 |
// @ts-expect-error
|
| 77 |
if (document.readyState !== 'complete' && document.readyState !== 'interactive') {
|
| 78 |
return;
|
| 79 |
}
|
| 80 |
-
|
| 81 |
const parsed = giveSnapshot();
|
| 82 |
if (parsed) {
|
| 83 |
// @ts-expect-error
|
|
@@ -97,16 +134,50 @@ export class PuppeteerControl extends AsyncService {
|
|
| 97 |
document.addEventListener('readystatechange', handlePageLoad);
|
| 98 |
// @ts-expect-error
|
| 99 |
document.addEventListener('load', handlePageLoad);
|
| 100 |
-
});
|
|
|
|
|
|
|
| 101 |
|
| 102 |
// TODO: further setup the page;
|
| 103 |
|
| 104 |
return page;
|
| 105 |
}
|
| 106 |
|
| 107 |
-
async *scrap(url: string) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
const page = await this.pagePool.acquire();
|
| 109 |
-
let snapshot: unknown;
|
| 110 |
let nextSnapshotDeferred = Defer();
|
| 111 |
let finalized = false;
|
| 112 |
const hdl = (s: any) => {
|
|
@@ -118,30 +189,57 @@ export class PuppeteerControl extends AsyncService {
|
|
| 118 |
nextSnapshotDeferred = Defer();
|
| 119 |
};
|
| 120 |
page.on('snapshot', hdl);
|
| 121 |
-
|
| 122 |
-
gotoPromise.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
try {
|
| 125 |
while (true) {
|
| 126 |
await Promise.race([nextSnapshotDeferred.promise, gotoPromise]);
|
| 127 |
-
const screenshot = await page.screenshot();
|
| 128 |
if (finalized) {
|
| 129 |
await gotoPromise;
|
| 130 |
-
|
| 131 |
-
yield { snapshot, screenshot };
|
| 132 |
break;
|
| 133 |
}
|
| 134 |
-
yield
|
| 135 |
}
|
| 136 |
-
} catch (_err) {
|
| 137 |
-
void 0;
|
| 138 |
} finally {
|
| 139 |
-
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
}
|
| 142 |
-
|
| 143 |
}
|
| 144 |
-
|
| 145 |
}
|
| 146 |
|
| 147 |
const puppeteerControl = container.resolve(PuppeteerControl);
|
|
|
|
| 1 |
+
import { AsyncService, Defer, HashManager, marshalErrorLike } from 'civkit';
|
| 2 |
import { container, singleton } from 'tsyringe';
|
| 3 |
import puppeteer, { Browser } from 'puppeteer';
|
| 4 |
import { Logger } from '../shared/services/logger';
|
| 5 |
import genericPool from 'generic-pool';
|
| 6 |
import os from 'os';
|
| 7 |
import fs from 'fs';
|
| 8 |
+
import { Crawled } from '../db/crawled';
|
| 9 |
|
| 10 |
|
| 11 |
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
| 12 |
|
| 13 |
+
/** Article fields as returned by Readability's `parse()` inside the page. */
interface ReadableArticle {
    title: string;
    /** Article markup; the crawler converts this to markdown. */
    content: string;
    textContent: string;
    length: number;
    excerpt: string;
    byline: string;
    dir: string;
    siteName: string;
    lang: string;
    publishedTime: string;
}

/** State of a page captured in the browser, optionally with a screenshot. */
export interface PageSnapshot {
    /** `document.title` at capture time. */
    title: string;
    /** `document.location.href` at capture time. */
    href: string;
    /** Outer HTML of the document element. */
    html: string;
    /** `innerText` of the document body. */
    text: string;
    /** Readability result, or `null` when nothing was extracted. */
    parsed: ReadableArticle | null;
    /** Screenshot image data, when one was taken. */
    screenshot?: Buffer;
}
|
| 32 |
+
// Shared hasher used by scrap() to turn a normalized URL into the digest stored
// as `urlPathDigest`. Constructed with 'md5' and 'hex' — presumably the hash
// algorithm and output encoding; confirm against civkit's HashManager.
const md5Hasher = new HashManager('md5', 'hex');
|
| 33 |
+
|
| 34 |
@singleton()
|
| 35 |
export class PuppeteerControl extends AsyncService {
|
| 36 |
|
|
|
|
| 46 |
await page.browserContext().close();
|
| 47 |
},
|
| 48 |
validate: async (page) => {
|
| 49 |
+
return page.browser().connected && !page.isClosed();
|
| 50 |
}
|
| 51 |
}, {
|
| 52 |
+
max: 1 + Math.floor(os.freemem() / 1024 * 1024 * 1024),
|
| 53 |
+
min: 1,
|
| 54 |
+
acquireTimeoutMillis: 15_000,
|
| 55 |
+
testOnBorrow: true,
|
| 56 |
+
testOnReturn: true,
|
| 57 |
});
|
| 58 |
|
| 59 |
constructor(protected globalLogger: Logger) {
|
|
|
|
| 64 |
await this.dependencyReady();
|
| 65 |
|
| 66 |
if (this.browser) {
|
| 67 |
+
if (this.browser.connected) {
|
| 68 |
+
await this.browser.close();
|
| 69 |
+
} else {
|
| 70 |
+
this.browser.process()?.kill();
|
| 71 |
+
}
|
| 72 |
}
|
| 73 |
this.browser = await puppeteer.launch({
|
| 74 |
headless: true,
|
|
|
|
| 78 |
this.logger.warn(`Browser disconnected`);
|
| 79 |
this.emit('crippled');
|
| 80 |
});
|
| 81 |
+
this.logger.info(`Browser launched: ${this.browser.process()?.pid}`);
|
| 82 |
|
| 83 |
this.emit('ready');
|
| 84 |
}
|
|
|
|
| 88 |
const dedicatedContext = await this.browser.createBrowserContext();
|
| 89 |
|
| 90 |
const page = await dedicatedContext.newPage();
|
| 91 |
+
const preparations = [];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
+
preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
|
| 94 |
+
preparations.push(page.setViewport({ width: 1920, height: 1080 }));
|
| 95 |
+
preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
|
| 96 |
+
page.emit('snapshot', snapshot);
|
| 97 |
+
}));
|
| 98 |
+
preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
|
| 99 |
+
preparations.push(page.evaluateOnNewDocument(`
|
| 100 |
+
function giveSnapshot() {
|
| 101 |
+
return {
|
| 102 |
+
title: document.title,
|
| 103 |
+
href: document.location.href,
|
| 104 |
+
html: document.documentElement.outerHTML,
|
| 105 |
+
text: document.body.innerText,
|
| 106 |
+
parsed: new Readability(document.cloneNode(true)).parse(),
|
| 107 |
+
};
|
| 108 |
+
}
|
| 109 |
+
`));
|
| 110 |
+
preparations.push(page.evaluateOnNewDocument(() => {
|
| 111 |
let aftershot: any;
|
| 112 |
const handlePageLoad = () => {
|
| 113 |
// @ts-expect-error
|
| 114 |
if (document.readyState !== 'complete' && document.readyState !== 'interactive') {
|
| 115 |
return;
|
| 116 |
}
|
| 117 |
+
// @ts-expect-error
|
| 118 |
const parsed = giveSnapshot();
|
| 119 |
if (parsed) {
|
| 120 |
// @ts-expect-error
|
|
|
|
| 134 |
document.addEventListener('readystatechange', handlePageLoad);
|
| 135 |
// @ts-expect-error
|
| 136 |
document.addEventListener('load', handlePageLoad);
|
| 137 |
+
}));
|
| 138 |
+
|
| 139 |
+
await Promise.all(preparations);
|
| 140 |
|
| 141 |
// TODO: further setup the page;
|
| 142 |
|
| 143 |
return page;
|
| 144 |
}
|
| 145 |
|
| 146 |
+
async *scrap(url: string, noCache: string | boolean = false) {
|
| 147 |
+
const parsedUrl = new URL(url);
|
| 148 |
+
parsedUrl.search = '';
|
| 149 |
+
parsedUrl.hash = '';
|
| 150 |
+
const normalizedUrl = parsedUrl.toString().toLowerCase();
|
| 151 |
+
const digest = md5Hasher.hash(normalizedUrl);
|
| 152 |
+
this.logger.info(`Scraping ${url}, normalized digest: ${digest}`, { url, digest });
|
| 153 |
+
|
| 154 |
+
let snapshot: PageSnapshot | undefined;
|
| 155 |
+
let screenshot: Buffer | undefined;
|
| 156 |
+
|
| 157 |
+
if (!noCache) {
|
| 158 |
+
const cached = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
|
| 159 |
+
|
| 160 |
+
if (cached && cached.createdAt.valueOf() > (Date.now() - 1000 * 300)) {
|
| 161 |
+
const age = Date.now() - cached.createdAt.valueOf();
|
| 162 |
+
this.logger.info(`Cache hit for ${url}, normalized digest: ${digest}, ${age}ms old`, { url, digest, age });
|
| 163 |
+
snapshot = {
|
| 164 |
+
...cached.snapshot
|
| 165 |
+
};
|
| 166 |
+
if (snapshot) {
|
| 167 |
+
delete snapshot.screenshot;
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
screenshot = cached.snapshot?.screenshot ? Buffer.from(cached.snapshot.screenshot, 'base64') : undefined;
|
| 171 |
+
yield {
|
| 172 |
+
...cached.snapshot,
|
| 173 |
+
screenshot: cached.snapshot?.screenshot ? Buffer.from(cached.snapshot.screenshot, 'base64') : undefined
|
| 174 |
+
};
|
| 175 |
+
|
| 176 |
+
return;
|
| 177 |
+
}
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
const page = await this.pagePool.acquire();
|
|
|
|
| 181 |
let nextSnapshotDeferred = Defer();
|
| 182 |
let finalized = false;
|
| 183 |
const hdl = (s: any) => {
|
|
|
|
| 189 |
nextSnapshotDeferred = Defer();
|
| 190 |
};
|
| 191 |
page.on('snapshot', hdl);
|
| 192 |
+
|
| 193 |
+
const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
|
| 194 |
+
.then(async (r) => {
|
| 195 |
+
screenshot = await page.screenshot({
|
| 196 |
+
type: 'jpeg',
|
| 197 |
+
quality: 85,
|
| 198 |
+
});
|
| 199 |
+
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
| 200 |
+
this.logger.info(`Snapshot of ${url} done`, { url, digest, title: snapshot?.title, href: snapshot?.href });
|
| 201 |
+
const nowDate = new Date();
|
| 202 |
+
Crawled.save(
|
| 203 |
+
Crawled.from({
|
| 204 |
+
url,
|
| 205 |
+
createdAt: nowDate,
|
| 206 |
+
expireAt: new Date(nowDate.valueOf() + 1000 * 3600 * 24 * 7),
|
| 207 |
+
urlPathDigest: digest,
|
| 208 |
+
snapshot: { ...snapshot, screenshot: screenshot?.toString('base64') || '' },
|
| 209 |
+
}).degradeForFireStore()
|
| 210 |
+
).catch((err) => {
|
| 211 |
+
this.logger.warn(`Failed to save snapshot`, { err: marshalErrorLike(err) });
|
| 212 |
+
});
|
| 213 |
+
|
| 214 |
+
return r;
|
| 215 |
+
});
|
| 216 |
+
|
| 217 |
+
gotoPromise.catch((err) => {
|
| 218 |
+
this.logger.warn(`Browsing of ${url} not fully done`, { err: marshalErrorLike(err) });
|
| 219 |
+
}).finally(() => {
|
| 220 |
+
finalized = true;
|
| 221 |
+
});
|
| 222 |
|
| 223 |
try {
|
| 224 |
while (true) {
|
| 225 |
await Promise.race([nextSnapshotDeferred.promise, gotoPromise]);
|
|
|
|
| 226 |
if (finalized) {
|
| 227 |
await gotoPromise;
|
| 228 |
+
|
| 229 |
+
yield { ...snapshot, screenshot };
|
| 230 |
break;
|
| 231 |
}
|
| 232 |
+
yield snapshot;
|
| 233 |
}
|
|
|
|
|
|
|
| 234 |
} finally {
|
| 235 |
+
gotoPromise.finally(() => {
|
| 236 |
+
page.off('snapshot', hdl);
|
| 237 |
+
this.pagePool.destroy(page).catch((err) => {
|
| 238 |
+
this.logger.warn(`Failed to destroy page`, { err: marshalErrorLike(err) });
|
| 239 |
+
});
|
| 240 |
+
});
|
| 241 |
}
|
|
|
|
| 242 |
}
|
|
|
|
| 243 |
}
|
| 244 |
|
| 245 |
const puppeteerControl = container.resolve(PuppeteerControl);
|