nomagick committed on
Commit
5f83d86
·
unverified ·
1 Parent(s): b6ac178

fix: encoding of from-file snapshots

Browse files
src/services/snapshot-formatter.ts CHANGED
@@ -14,7 +14,7 @@ import { cleanAttribute } from '../utils/misc';
14
  import _ from 'lodash';
15
  import { STATUS_CODES } from 'http';
16
  import type { CrawlerOptions } from '../dto/crawler-options';
17
- import { readFile } from 'fs/promises';
18
  import { pathToFileURL } from 'url';
19
  import { countGPTToken } from '../shared/utils/openai';
20
 
@@ -804,7 +804,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
804
  overrideContentType = undefined;
805
  }
806
 
807
- const contentType = (overrideContentType || await file.mimeType).toLowerCase();
808
  const fileName = overrideFileName || `${url.origin}${url.pathname}`;
809
  const snapshot: PageSnapshot = {
810
  title: '',
@@ -821,11 +821,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
821
  return snapshot;
822
  }
823
  try {
 
824
  if (contentType.startsWith('text/html')) {
825
  if ((await file.size) > 1024 * 1024 * 32) {
826
  throw new AssertionFailureError(`Failed to access ${url}: file too large`);
827
  }
828
- snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' });
829
 
830
  return snapshot;
831
  }
@@ -833,7 +834,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
833
  if ((await file.size) > 1024 * 1024 * 32) {
834
  throw new AssertionFailureError(`Failed to access ${url}: file too large`);
835
  }
836
- snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' });
837
  snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
838
 
839
  return snapshot;
 
14
  import _ from 'lodash';
15
  import { STATUS_CODES } from 'http';
16
  import type { CrawlerOptions } from '../dto/crawler-options';
17
+ import { readFile } from '../utils/encoding';
18
  import { pathToFileURL } from 'url';
19
  import { countGPTToken } from '../shared/utils/openai';
20
 
 
804
  overrideContentType = undefined;
805
  }
806
 
807
+ const contentType: string = (overrideContentType || await file.mimeType).toLowerCase();
808
  const fileName = overrideFileName || `${url.origin}${url.pathname}`;
809
  const snapshot: PageSnapshot = {
810
  title: '',
 
821
  return snapshot;
822
  }
823
  try {
824
+ const encoding: string | undefined = contentType.includes('charset=') ? contentType.split('charset=')[1]?.trim().toLowerCase() : 'utf-8';
825
  if (contentType.startsWith('text/html')) {
826
  if ((await file.size) > 1024 * 1024 * 32) {
827
  throw new AssertionFailureError(`Failed to access ${url}: file too large`);
828
  }
829
+ snapshot.html = await readFile(await file.filePath, encoding);
830
 
831
  return snapshot;
832
  }
 
834
  if ((await file.size) > 1024 * 1024 * 32) {
835
  throw new AssertionFailureError(`Failed to access ${url}: file too large`);
836
  }
837
+ snapshot.text = await readFile(await file.filePath, encoding);
838
  snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
839
 
840
  return snapshot;
src/utils/encoding.ts ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { createReadStream } from 'fs';
2
+ import { Readable } from 'stream';
3
+ import { TextDecoderStream } from 'stream/web';
4
+
5
+ export async function decodeFileStream(
6
+ fileStream: Readable,
7
+ encoding: string = 'utf-8',
8
+ ): Promise<string> {
9
+ const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });
10
+ Readable.toWeb(fileStream).pipeThrough(decodeStream);
11
+ const chunks = [];
12
+
13
+ for await (const chunk of decodeStream.readable) {
14
+ chunks.push(chunk);
15
+ }
16
+
17
+ return chunks.join('');
18
+ }
19
+
20
+
21
+ export async function readFile(
22
+ filePath: string,
23
+ encoding: string = 'utf-8',
24
+ ): Promise<string> {
25
+ const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });
26
+ Readable.toWeb(createReadStream(filePath)).pipeThrough(decodeStream);
27
+ const chunks = [];
28
+
29
+ for await (const chunk of decodeStream.readable) {
30
+ chunks.push(chunk);
31
+ }
32
+
33
+ return chunks.join('');
34
+ }