nomagick committed on
Commit
165cce6
·
unverified ·
1 Parent(s): f0668a9

refactor: options dto

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -470,90 +470,6 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
470
  concurrency: 22,
471
  maxInstances: 455,
472
  },
473
- openapi: {
474
- operation: {
475
- parameters: {
476
- 'Accept': {
477
- description: `Specifies your preference for the response format.\n\n` +
478
- `Supported formats: \n` +
479
- `- text/event-stream\n` +
480
- `- application/json or text/json\n` +
481
- `- text/plain`
482
- ,
483
- in: 'header',
484
- schema: { type: 'string' }
485
- },
486
- 'X-Cache-Tolerance': {
487
- description: `Sets internal cache tolerance in seconds if this header is specified with a integer.`,
488
- in: 'header',
489
- schema: { type: 'string' }
490
- },
491
- 'X-No-Cache': {
492
- description: `Ignores internal cache if this header is specified with a value.\n\nEquivalent to X-Cache-Tolerance: 0`,
493
- in: 'header',
494
- schema: { type: 'string' }
495
- },
496
- 'X-Respond-With': {
497
- description: `Specifies the (non-default) form factor of the crawled data you prefer.\n\n` +
498
- `Supported formats: \n` +
499
- `- markdown\n` +
500
- `- html\n` +
501
- `- text\n` +
502
- `- screenshot\n`
503
- ,
504
- in: 'header',
505
- schema: { type: 'string' }
506
- },
507
- 'X-Wait-For-Selector': {
508
- description: `Specifies a CSS selector to wait for the appearance of such an element before returning.\n\n` +
509
- 'Example: `X-Wait-For-Selector: .content-block`\n'
510
- ,
511
- in: 'header',
512
- schema: { type: 'string' }
513
- },
514
- 'X-Target-Selector': {
515
- description: `Specifies a CSS selector for return target instead of the full html.\n\n` +
516
- 'Implies `X-Wait-For-Selector: (same selector)`'
517
- ,
518
- in: 'header',
519
- schema: { type: 'string' }
520
- },
521
- 'X-Proxy-Url': {
522
- description: `Specifies your custom proxy if you prefer to use one.\n\n` +
523
- `Supported protocols: \n` +
524
- `- http\n` +
525
- `- https\n` +
526
- `- socks4\n` +
527
- `- socks5\n\n` +
528
- `For authentication, https://user:pass@host:port`,
529
- in: 'header',
530
- schema: { type: 'string' }
531
- },
532
- 'X-Set-Cookie': {
533
- description: `Sets cookie(s) to the headless browser for your request. \n\n` +
534
- `Syntax is the same with standard Set-Cookie`,
535
- in: 'header',
536
- schema: { type: 'string' }
537
- },
538
- 'X-With-Generated-Alt': {
539
- description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` +
540
- `Note: Does not work when \`X-Respond-With\` is specified`,
541
- in: 'header',
542
- schema: { type: 'string' }
543
- },
544
- 'X-With-Images-Summary': {
545
- description: `Enable dedicated summary section for images on the page.`,
546
- in: 'header',
547
- schema: { type: 'string' }
548
- },
549
- 'X-With-links-Summary': {
550
- description: `Enable dedicated summary section for hyper links on the page.`,
551
- in: 'header',
552
- schema: { type: 'string' }
553
- },
554
- }
555
- }
556
- },
557
  tags: ['Crawler'],
558
  httpMethod: ['get', 'post'],
559
  returnType: [String, OutputServerEventStream],
@@ -953,6 +869,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
953
  this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
954
  this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
955
  this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
 
956
 
957
  const crawlOpts: ExtraScrappingOptions = {
958
  proxyUrl: opts.proxyUrl,
@@ -960,6 +877,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
960
  favorScreenshot: opts.respondWith === 'screenshot',
961
  waitForSelector: opts.waitForSelector,
962
  targetSelector: opts.targetSelector,
 
963
  };
964
 
965
  return crawlOpts;
 
470
  concurrency: 22,
471
  maxInstances: 455,
472
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
  tags: ['Crawler'],
474
  httpMethod: ['get', 'post'],
475
  returnType: [String, OutputServerEventStream],
 
869
  this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
870
  this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
871
  this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
872
+ this.threadLocal.set('userAgent', opts.userAgent);
873
 
874
  const crawlOpts: ExtraScrappingOptions = {
875
  proxyUrl: opts.proxyUrl,
 
877
  favorScreenshot: opts.respondWith === 'screenshot',
878
  waitForSelector: opts.waitForSelector,
879
  targetSelector: opts.targetSelector,
880
+ overrideUserAgent: opts.userAgent,
881
  };
882
 
883
  return crawlOpts;
backend/functions/src/cloud-functions/searcher.ts CHANGED
@@ -71,71 +71,6 @@ export class SearcherHost extends RPCHost {
71
  concurrency: 6,
72
  maxInstances: 200,
73
  },
74
- openapi: {
75
- operation: {
76
- parameters: {
77
- 'Accept': {
78
- description: `Specifies your preference for the response format. \n\n` +
79
- `Supported formats:\n` +
80
- `- text/event-stream\n` +
81
- `- application/json or text/json\n` +
82
- `- text/plain`
83
- ,
84
- in: 'header',
85
- schema: { type: 'string' }
86
- },
87
- 'X-No-Cache': {
88
- description: `Ignores internal cache if this header is specified with a value.`,
89
- in: 'header',
90
- schema: { type: 'string' }
91
- },
92
- 'X-Respond-With': {
93
- description: `Specifies the (non-default) form factor of the crawled data you prefer. \n\n` +
94
- `Supported formats:\n` +
95
- `- markdown\n` +
96
- `- html\n` +
97
- `- text\n` +
98
- `- screenshot\n`
99
- ,
100
- in: 'header',
101
- schema: { type: 'string' }
102
- },
103
- 'X-Proxy-Url': {
104
- description: `Specifies your custom proxy if you prefer to use one. \n\n` +
105
- `Supported protocols:\n` +
106
- `- http\n` +
107
- `- https\n` +
108
- `- socks4\n` +
109
- `- socks5\n\n` +
110
- `For authentication, https://user:pass@host:port`,
111
- in: 'header',
112
- schema: { type: 'string' }
113
- },
114
- 'X-Set-Cookie': {
115
- description: `Sets cookie(s) to the headless browser for your request. \n\n` +
116
- `Syntax is the same with standard Set-Cookie`,
117
- in: 'header',
118
- schema: { type: 'string' }
119
- },
120
- 'X-With-Generated-Alt': {
121
- description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` +
122
- `Note: Does not work when \`X-Respond-With\` is specified`,
123
- in: 'header',
124
- schema: { type: 'string' }
125
- },
126
- 'X-With-Images-Summary': {
127
- description: `Enable dedicated summary section for images on the page.`,
128
- in: 'header',
129
- schema: { type: 'string' }
130
- },
131
- 'X-With-links-Summary': {
132
- description: `Enable dedicated summary section for hyper links on the page.`,
133
- in: 'header',
134
- schema: { type: 'string' }
135
- },
136
- }
137
- }
138
- },
139
  tags: ['Searcher'],
140
  httpMethod: ['get', 'post'],
141
  returnType: [String, OutputServerEventStream],
 
71
  concurrency: 6,
72
  maxInstances: 200,
73
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  tags: ['Searcher'],
75
  httpMethod: ['get', 'post'],
76
  returnType: [String, OutputServerEventStream],
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -1,8 +1,100 @@
1
- import { AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
2
  import type { Request, Response } from 'express';
3
  import type { CookieParam } from 'puppeteer';
4
  import { parseString as parseSetCookieString } from 'set-cookie-parser';
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  export class CrawlerOptions extends AutoCastable {
7
 
8
  @Prop({
@@ -47,6 +139,9 @@ export class CrawlerOptions extends AutoCastable {
47
  @Prop()
48
  proxyUrl?: string;
49
 
 
 
 
50
  static override from(input: any) {
51
  const instance = super.from(input) as CrawlerOptions;
52
  const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
@@ -87,6 +182,8 @@ export class CrawlerOptions extends AutoCastable {
87
  instance.targetSelector ??= targetSelector;
88
  const waitForSelector = ctx?.req.get('x-wait-for-selector');
89
  instance.waitForSelector ??= waitForSelector || instance.targetSelector;
 
 
90
 
91
  const cookies: CookieParam[] = [];
92
  const setCookieHeaders = ctx?.req.headers['x-set-cookie'] || (instance.setCookies as any as string[]);
 
1
+ import { Also, AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
2
  import type { Request, Response } from 'express';
3
  import type { CookieParam } from 'puppeteer';
4
  import { parseString as parseSetCookieString } from 'set-cookie-parser';
5
 
6
+
7
+ @Also({
8
+ openapi: {
9
+ operation: {
10
+ parameters: {
11
+ 'Accept': {
12
+ description: `Specifies your preference for the response format.\n\n` +
13
+ `Supported formats: \n` +
14
+ `- text/event-stream\n` +
15
+ `- application/json or text/json\n` +
16
+ `- text/plain`
17
+ ,
18
+ in: 'header',
19
+ schema: { type: 'string' }
20
+ },
21
+ 'X-Cache-Tolerance': {
22
+ description: `Sets internal cache tolerance in seconds if this header is specified with a integer.`,
23
+ in: 'header',
24
+ schema: { type: 'string' }
25
+ },
26
+ 'X-No-Cache': {
27
+ description: `Ignores internal cache if this header is specified with a value.\n\nEquivalent to X-Cache-Tolerance: 0`,
28
+ in: 'header',
29
+ schema: { type: 'string' }
30
+ },
31
+ 'X-Respond-With': {
32
+ description: `Specifies the (non-default) form factor of the crawled data you prefer.\n\n` +
33
+ `Supported formats: \n` +
34
+ `- markdown\n` +
35
+ `- html\n` +
36
+ `- text\n` +
37
+ `- screenshot\n`
38
+ ,
39
+ in: 'header',
40
+ schema: { type: 'string' }
41
+ },
42
+ 'X-Wait-For-Selector': {
43
+ description: `Specifies a CSS selector to wait for the appearance of such an element before returning.\n\n` +
44
+ 'Example: `X-Wait-For-Selector: .content-block`\n'
45
+ ,
46
+ in: 'header',
47
+ schema: { type: 'string' }
48
+ },
49
+ 'X-Target-Selector': {
50
+ description: `Specifies a CSS selector for return target instead of the full html.\n\n` +
51
+ 'Implies `X-Wait-For-Selector: (same selector)`'
52
+ ,
53
+ in: 'header',
54
+ schema: { type: 'string' }
55
+ },
56
+ 'X-Proxy-Url': {
57
+ description: `Specifies your custom proxy if you prefer to use one.\n\n` +
58
+ `Supported protocols: \n` +
59
+ `- http\n` +
60
+ `- https\n` +
61
+ `- socks4\n` +
62
+ `- socks5\n\n` +
63
+ `For authentication, https://user:pass@host:port`,
64
+ in: 'header',
65
+ schema: { type: 'string' }
66
+ },
67
+ 'X-Set-Cookie': {
68
+ description: `Sets cookie(s) to the headless browser for your request. \n\n` +
69
+ `Syntax is the same with standard Set-Cookie`,
70
+ in: 'header',
71
+ schema: { type: 'string' }
72
+ },
73
+ 'X-With-Generated-Alt': {
74
+ description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` +
75
+ `Note: Does not work when \`X-Respond-With\` is specified`,
76
+ in: 'header',
77
+ schema: { type: 'string' }
78
+ },
79
+ 'X-With-Images-Summary': {
80
+ description: `Enable dedicated summary section for images on the page.`,
81
+ in: 'header',
82
+ schema: { type: 'string' }
83
+ },
84
+ 'X-With-links-Summary': {
85
+ description: `Enable dedicated summary section for hyper links on the page.`,
86
+ in: 'header',
87
+ schema: { type: 'string' }
88
+ },
89
+ 'X-User-Agent': {
90
+ description: `Override User-Agent.`,
91
+ in: 'header',
92
+ schema: { type: 'string' }
93
+ },
94
+ }
95
+ }
96
+ }
97
+ })
98
  export class CrawlerOptions extends AutoCastable {
99
 
100
  @Prop({
 
139
  @Prop()
140
  proxyUrl?: string;
141
 
142
+ @Prop()
143
+ userAgent?: string;
144
+
145
  static override from(input: any) {
146
  const instance = super.from(input) as CrawlerOptions;
147
  const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
 
182
  instance.targetSelector ??= targetSelector;
183
  const waitForSelector = ctx?.req.get('x-wait-for-selector');
184
  instance.waitForSelector ??= waitForSelector || instance.targetSelector;
185
+ const overrideUserAgent = ctx?.req.get('x-user-agent');
186
+ instance.userAgent ??= overrideUserAgent;
187
 
188
  const cookies: CookieParam[] = [];
189
  const setCookieHeaders = ctx?.req.headers['x-set-cookie'] || (instance.setCookies as any as string[]);
backend/functions/src/services/puppeteer.ts CHANGED
@@ -65,6 +65,7 @@ export interface ScrappingOptions {
65
  favorScreenshot?: boolean;
66
  waitForSelector?: string;
67
  minIntervalMs?: number;
 
68
  }
69
 
70
 
@@ -417,6 +418,9 @@ document.addEventListener('load', handlePageLoad);
417
  if (options?.cookies) {
418
  await page.setCookie(...options.cookies);
419
  }
 
 
 
420
 
421
  let nextSnapshotDeferred = Defer();
422
  const crippleListener = () => nextSnapshotDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` }));
 
65
  favorScreenshot?: boolean;
66
  waitForSelector?: string;
67
  minIntervalMs?: number;
68
+ overrideUserAgent?: string;
69
  }
70
 
71
 
 
418
  if (options?.cookies) {
419
  await page.setCookie(...options.cookies);
420
  }
421
+ if (options?.overrideUserAgent) {
422
+ await page.setUserAgent(options.overrideUserAgent);
423
+ }
424
 
425
  let nextSnapshotDeferred = Defer();
426
  const crippleListener = () => nextSnapshotDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` }));
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit d360d01c19b34499e564315b5b5935df17c62cc1
 
1
+ Subproject commit a6116b73e99e3d335b0cd4cfcae8f4f0c7e72f6d