nomagick committed on
Commit
53bc91c
·
unverified ·
1 Parent(s): 22647a0

feat: compound response

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -310,6 +310,13 @@ export class CrawlerHost extends RPCHost {
310
  return formatted;
311
  }
312
 
 
 
 
 
 
 
 
313
  for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
314
  lastScrapped = scrapped;
315
  if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
@@ -600,22 +607,22 @@ export class CrawlerHost extends RPCHost {
600
  return 0;
601
  }
602
 
603
- const textContent = formatted?.content || formatted?.description || formatted?.text || formatted?.html;
604
  let amount = 0;
605
- do {
606
- if (typeof textContent === 'string') {
607
- amount = estimateToken(textContent);
608
- break;
609
- }
610
-
611
- const imageContent = formatted.screenshotUrl || formatted.screenshot;
612
-
613
- if (imageContent) {
614
- // OpenAI image token count for 1024x1024 image
615
- amount = 765;
616
- break;
617
- }
618
- } while (false);
 
619
 
620
  Object.assign(formatted, { usage: { tokens: amount } });
621
 
@@ -684,7 +691,7 @@ export class CrawlerHost extends RPCHost {
684
  const crawlOpts: ExtraScrappingOptions = {
685
  proxyUrl: opts.proxyUrl,
686
  cookies: opts.setCookies,
687
- favorScreenshot: ['screenshot', 'pageshot'].includes(opts.respondWith),
688
  removeSelector: opts.removeSelector,
689
  targetSelector: opts.targetSelector,
690
  waitForSelector: opts.waitForSelector,
 
310
  return formatted;
311
  }
312
 
313
+ if (crawlerOptions.isRequestingCompoundContentFormat()) {
314
+ throw new ParamValidationError({
315
+ path: 'respondWith',
316
+ message: `You are requesting compound content format, please explicitly accept 'text/event-stream' or 'application/json' in header.`
317
+ });
318
+ }
319
+
320
  for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
321
  lastScrapped = scrapped;
322
  if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
 
607
  return 0;
608
  }
609
 
 
610
  let amount = 0;
611
+ if (formatted.content) {
612
+ amount += estimateToken(formatted.content);
613
+ } else if (formatted.description) {
614
+ amount += estimateToken(formatted.description);
615
+ }
616
+ if (formatted.text) {
617
+ amount += estimateToken(formatted.text);
618
+ }
619
+ if (formatted.html) {
620
+ amount += estimateToken(formatted.html);
621
+ }
622
+ if (formatted.screenshotUrl || formatted.screenshot) {
623
+ // OpenAI image token count for 1024x1024 image
624
+ amount += 765;
625
+ }
626
 
627
  Object.assign(formatted, { usage: { tokens: amount } });
628
 
 
691
  const crawlOpts: ExtraScrappingOptions = {
692
  proxyUrl: opts.proxyUrl,
693
  cookies: opts.setCookies,
694
+ favorScreenshot: ['screenshot', 'pageshot'].some((x) => opts.respondWith.includes(x)),
695
  removeSelector: opts.removeSelector,
696
  targetSelector: opts.targetSelector,
697
  waitForSelector: opts.waitForSelector,
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -3,6 +3,16 @@ import type { Request, Response } from 'express';
3
  import type { CookieParam } from 'puppeteer';
4
  import { parseString as parseSetCookieString } from 'set-cookie-parser';
5
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  @Also({
8
  openapi: {
@@ -35,7 +45,10 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
35
  `- html\n` +
36
  `- text\n` +
37
  `- pageshot\n` +
38
- `- screenshot\n`
 
 
 
39
  ,
40
  in: 'header',
41
  schema: { type: 'string' }
@@ -149,7 +162,8 @@ export class CrawlerOptions extends AutoCastable {
149
  pdf?: string;
150
 
151
  @Prop({
152
- default: 'default',
 
153
  })
154
  respondWith!: string;
155
 
@@ -372,6 +386,10 @@ export class CrawlerOptions extends AutoCastable {
372
 
373
  return true;
374
  }
 
 
 
 
375
  }
376
 
377
  export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
 
3
  import type { CookieParam } from 'puppeteer';
4
  import { parseString as parseSetCookieString } from 'set-cookie-parser';
5
 
6
+ export enum CONTENT_FORMAT {
7
+ CONTENT = 'content',
8
+ MARKDOWN = 'markdown',
9
+ HTML = 'html',
10
+ TEXT = 'text',
11
+ PAGESHOT = 'pageshot',
12
+ SCREENSHOT = 'screenshot',
13
+ }
14
+
15
+ const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
16
 
17
  @Also({
18
  openapi: {
 
45
  `- html\n` +
46
  `- text\n` +
47
  `- pageshot\n` +
48
+ `- screenshot\n` +
49
+ `- content\n` +
50
+ `- any combination of the above\n\n` +
51
+ `Default: content\n`
52
  ,
53
  in: 'header',
54
  schema: { type: 'string' }
 
162
  pdf?: string;
163
 
164
  @Prop({
165
+ default: CONTENT_FORMAT.CONTENT,
166
+ type: [CONTENT_FORMAT, String]
167
  })
168
  respondWith!: string;
169
 
 
386
 
387
  return true;
388
  }
389
+
390
+ isRequestingCompoundContentFormat() {
391
+ return !CONTENT_FORMAT_VALUES.has(this.respondWith);
392
+ }
393
  }
394
 
395
  export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
backend/functions/src/services/snapshot-formatter.ts CHANGED
@@ -73,7 +73,10 @@ export class SnapshotFormatter extends AsyncService {
73
  pageshotUrl?: string;
74
  }, nominalUrl?: URL, urlValidMs = 3600 * 1000 * 4) {
75
  const t0 = Date.now();
76
- if (mode === 'screenshot') {
 
 
 
77
  if (snapshot.screenshot && !snapshot.screenshotUrl) {
78
  const fid = `instant-screenshots/${randomUUID()}`;
79
  await this.firebaseObjectStorage.saveFile(fid, snapshot.screenshot, {
@@ -84,20 +87,13 @@ export class SnapshotFormatter extends AsyncService {
84
  snapshot.screenshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + urlValidMs);
85
  }
86
 
87
- const f = {
88
- ...this.getGeneralSnapshotMixins(snapshot),
89
- // html: snapshot.html,
90
  screenshotUrl: snapshot.screenshotUrl,
91
- };
92
-
93
- Object.defineProperty(f, 'textRepresentation', { value: `${f.screenshotUrl}\n`, enumerable: false });
94
-
95
- const dt = Date.now() - t0;
96
- this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
97
 
98
- return f as FormattedPage;
99
  }
100
- if (mode === 'pageshot') {
101
  if (snapshot.pageshot && !snapshot.pageshotUrl) {
102
  const fid = `instant-screenshots/${randomUUID()}`;
103
  await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, {
@@ -108,31 +104,18 @@ export class SnapshotFormatter extends AsyncService {
108
  snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + urlValidMs);
109
  }
110
 
111
- const f = {
112
- ...this.getGeneralSnapshotMixins(snapshot),
113
  html: snapshot.html,
114
  pageshotUrl: snapshot.pageshotUrl,
115
- } as FormattedPage;
116
-
117
- Object.defineProperty(f, 'textRepresentation', { value: `${f.pageshotUrl}\n`, enumerable: false });
118
-
119
- const dt = Date.now() - t0;
120
- this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
121
-
122
- return f;
123
  }
124
- if (mode === 'html') {
125
- const f = {
126
- ...this.getGeneralSnapshotMixins(snapshot),
127
  html: snapshot.html,
128
- } as FormattedPage;
129
-
130
- Object.defineProperty(f, 'textRepresentation', { value: snapshot.html, enumerable: false });
131
-
132
- const dt = Date.now() - t0;
133
- this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
134
 
135
- return f;
136
  }
137
 
138
  let pdfMode = false;
@@ -157,19 +140,20 @@ export class SnapshotFormatter extends AsyncService {
157
  }
158
  }
159
 
160
- if (mode === 'text') {
161
- const f = {
162
- ...this.getGeneralSnapshotMixins(snapshot),
163
  text: snapshot.text,
164
- } as FormattedPage;
165
-
166
- Object.defineProperty(f, 'textRepresentation', { value: snapshot.text, enumerable: false });
167
 
 
168
  const dt = Date.now() - t0;
169
  this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
170
 
171
  return f;
172
  }
 
173
  const imgDataUrlToObjectUrl = !Boolean(this.threadLocal.get('keepImgDataUrl'));
174
 
175
  let contentText = '';
@@ -178,7 +162,7 @@ export class SnapshotFormatter extends AsyncService {
178
  const uid = this.threadLocal.get('uid');
179
  do {
180
  if (pdfMode) {
181
- contentText = snapshot.parsed?.content || snapshot.text;
182
  break;
183
  }
184
 
@@ -188,14 +172,14 @@ export class SnapshotFormatter extends AsyncService {
188
  snapshot.elemCount! > 70_000
189
  ) {
190
  this.logger.warn('Degrading to text to protect the server', { url: snapshot.href });
191
- contentText = snapshot.text;
192
  break;
193
  }
194
 
195
  const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
196
  let toBeTurnedToMd = jsDomElementOfHTML;
197
  let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
198
- if (mode !== 'markdown' && snapshot.parsed?.content) {
199
  const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
200
  const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
201
  const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
@@ -300,29 +284,27 @@ export class SnapshotFormatter extends AsyncService {
300
  ) {
301
  toBeTurnedToMd = jsDomElementOfHTML;
302
  try {
303
- contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
304
  } catch (err) {
305
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
306
  const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
307
  try {
308
- contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML);
309
  } catch (err2) {
310
  this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
311
  }
312
  }
313
  }
314
  if (this.isPoorlyTransformed(contentText, toBeTurnedToMd)) {
315
- contentText = snapshot.text;
316
  }
317
  } while (false);
318
 
319
- const cleanText = contentText?.includes('return') ? contentText.trimEnd() : (contentText || '').trim();
320
-
321
  const formatted: FormattedPage = {
322
  title: (snapshot.parsed?.title || snapshot.title || '').trim(),
323
  description: (snapshot.description || '').trim(),
324
  url: nominalUrl?.toString() || snapshot.href?.trim(),
325
- content: cleanText,
326
  publishedTime: snapshot.parsed?.publishedTime || undefined,
327
  [Symbol.dispose]: () => { },
328
  };
@@ -351,8 +333,10 @@ export class SnapshotFormatter extends AsyncService {
351
  formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {});
352
  }
353
 
 
 
354
  const textRepresentation = (function (this: typeof formatted) {
355
- if (mode === 'markdown') {
356
  return this.content as string;
357
  }
358
 
@@ -395,12 +379,12 @@ ${this.content}
395
  ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
396
  }).call(formatted);
397
 
398
- Object.defineProperty(formatted, 'textRepresentation', { value: textRepresentation, enumerable: false });
399
 
400
  const dt = Date.now() - t0;
401
  this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
402
 
403
- return formatted as FormattedPage;
404
  }
405
 
406
  getGeneralSnapshotMixins(snapshot: PageSnapshot) {
 
73
  pageshotUrl?: string;
74
  }, nominalUrl?: URL, urlValidMs = 3600 * 1000 * 4) {
75
  const t0 = Date.now();
76
+ const f = {
77
+ ...this.getGeneralSnapshotMixins(snapshot),
78
+ };
79
+ if (mode.includes('screenshot')) {
80
  if (snapshot.screenshot && !snapshot.screenshotUrl) {
81
  const fid = `instant-screenshots/${randomUUID()}`;
82
  await this.firebaseObjectStorage.saveFile(fid, snapshot.screenshot, {
 
87
  snapshot.screenshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + urlValidMs);
88
  }
89
 
90
+ Object.assign(f, {
 
 
91
  screenshotUrl: snapshot.screenshotUrl,
92
+ });
 
 
 
 
 
93
 
94
+ Object.defineProperty(f, 'textRepresentation', { value: `${f.screenshotUrl}\n`, enumerable: false, configurable: true });
95
  }
96
+ if (mode.includes('pageshot')) {
97
  if (snapshot.pageshot && !snapshot.pageshotUrl) {
98
  const fid = `instant-screenshots/${randomUUID()}`;
99
  await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, {
 
104
  snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + urlValidMs);
105
  }
106
 
107
+ Object.assign(f, {
 
108
  html: snapshot.html,
109
  pageshotUrl: snapshot.pageshotUrl,
110
+ });
111
+ Object.defineProperty(f, 'textRepresentation', { value: `${f.pageshotUrl}\n`, enumerable: false, configurable: true });
 
 
 
 
 
 
112
  }
113
+ if (mode.includes('html')) {
114
+ Object.assign(f, {
 
115
  html: snapshot.html,
116
+ });
 
 
 
 
 
117
 
118
+ Object.defineProperty(f, 'textRepresentation', { value: snapshot.html, enumerable: false, configurable: true });
119
  }
120
 
121
  let pdfMode = false;
 
140
  }
141
  }
142
 
143
+ if (mode.includes('text')) {
144
+ Object.assign(f, {
 
145
  text: snapshot.text,
146
+ });
147
+ Object.defineProperty(f, 'textRepresentation', { value: snapshot.text, enumerable: false, configurable: true });
148
+ }
149
 
150
+ if (!mode.includes('markdown') && !mode.includes('content')) {
151
  const dt = Date.now() - t0;
152
  this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
153
 
154
  return f;
155
  }
156
+
157
  const imgDataUrlToObjectUrl = !Boolean(this.threadLocal.get('keepImgDataUrl'));
158
 
159
  let contentText = '';
 
162
  const uid = this.threadLocal.get('uid');
163
  do {
164
  if (pdfMode) {
165
+ contentText = (snapshot.parsed?.content || snapshot.text || '').trim();
166
  break;
167
  }
168
 
 
172
  snapshot.elemCount! > 70_000
173
  ) {
174
  this.logger.warn('Degrading to text to protect the server', { url: snapshot.href });
175
+ contentText = (snapshot.text || '').trimEnd();
176
  break;
177
  }
178
 
179
  const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
180
  let toBeTurnedToMd = jsDomElementOfHTML;
181
  let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
182
+ if (!mode.includes('markdown') && snapshot.parsed?.content) {
183
  const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
184
  const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
185
  const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
 
284
  ) {
285
  toBeTurnedToMd = jsDomElementOfHTML;
286
  try {
287
+ contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML).trim();
288
  } catch (err) {
289
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
290
  const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
291
  try {
292
+ contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML).trim();
293
  } catch (err2) {
294
  this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
295
  }
296
  }
297
  }
298
  if (this.isPoorlyTransformed(contentText, toBeTurnedToMd)) {
299
+ contentText = (snapshot.text || '').trimEnd();
300
  }
301
  } while (false);
302
 
 
 
303
  const formatted: FormattedPage = {
304
  title: (snapshot.parsed?.title || snapshot.title || '').trim(),
305
  description: (snapshot.description || '').trim(),
306
  url: nominalUrl?.toString() || snapshot.href?.trim(),
307
+ content: contentText,
308
  publishedTime: snapshot.parsed?.publishedTime || undefined,
309
  [Symbol.dispose]: () => { },
310
  };
 
333
  formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {});
334
  }
335
 
336
+ Object.assign(f, formatted);
337
+
338
  const textRepresentation = (function (this: typeof formatted) {
339
+ if (mode.includes('markdown')) {
340
  return this.content as string;
341
  }
342
 
 
379
  ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
380
  }).call(formatted);
381
 
382
+ Object.defineProperty(f, 'textRepresentation', { value: textRepresentation, enumerable: false });
383
 
384
  const dt = Date.now() - t0;
385
  this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
386
 
387
+ return f as FormattedPage;
388
  }
389
 
390
  getGeneralSnapshotMixins(snapshot: PageSnapshot) {
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 196d86ad8bc0cb6a9fee3c75df28a6a660c8f17e
 
1
+ Subproject commit 06cc23b16cafdd17c7e7db996f2167a39cc6d1eb