nomagick committed on
Commit
5f83d86
·
unverified ·
1 Parent(s): b6ac178

fix: encoding of from-file snapshots

Browse files
src/services/snapshot-formatter.ts CHANGED
@@ -14,7 +14,7 @@ import { cleanAttribute } from '../utils/misc';
14
  import _ from 'lodash';
15
  import { STATUS_CODES } from 'http';
16
  import type { CrawlerOptions } from '../dto/crawler-options';
17
- import { readFile } from 'fs/promises';
18
  import { pathToFileURL } from 'url';
19
  import { countGPTToken } from '../shared/utils/openai';
20
 
@@ -804,7 +804,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
804
  overrideContentType = undefined;
805
  }
806
 
807
- const contentType = (overrideContentType || await file.mimeType).toLowerCase();
808
  const fileName = overrideFileName || `${url.origin}${url.pathname}`;
809
  const snapshot: PageSnapshot = {
810
  title: '',
@@ -821,11 +821,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
821
  return snapshot;
822
  }
823
  try {
 
824
  if (contentType.startsWith('text/html')) {
825
  if ((await file.size) > 1024 * 1024 * 32) {
826
  throw new AssertionFailureError(`Failed to access ${url}: file too large`);
827
  }
828
- snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' });
829
 
830
  return snapshot;
831
  }
@@ -833,7 +834,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
833
  if ((await file.size) > 1024 * 1024 * 32) {
834
  throw new AssertionFailureError(`Failed to access ${url}: file too large`);
835
  }
836
- snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' });
837
  snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
838
 
839
  return snapshot;
 
14
  import _ from 'lodash';
15
  import { STATUS_CODES } from 'http';
16
  import type { CrawlerOptions } from '../dto/crawler-options';
17
+ import { readFile } from '../utils/encoding';
18
  import { pathToFileURL } from 'url';
19
  import { countGPTToken } from '../shared/utils/openai';
20
 
 
804
  overrideContentType = undefined;
805
  }
806
 
807
+ const contentType: string = (overrideContentType || await file.mimeType).toLowerCase();
808
  const fileName = overrideFileName || `${url.origin}${url.pathname}`;
809
  const snapshot: PageSnapshot = {
810
  title: '',
 
821
  return snapshot;
822
  }
823
  try {
824
+ const encoding: string | undefined = contentType.includes('charset=') ? contentType.split('charset=')[1]?.trim().toLowerCase() : 'utf-8';
825
  if (contentType.startsWith('text/html')) {
826
  if ((await file.size) > 1024 * 1024 * 32) {
827
  throw new AssertionFailureError(`Failed to access ${url}: file too large`);
828
  }
829
+ snapshot.html = await readFile(await file.filePath, encoding);
830
 
831
  return snapshot;
832
  }
 
834
  if ((await file.size) > 1024 * 1024 * 32) {
835
  throw new AssertionFailureError(`Failed to access ${url}: file too large`);
836
  }
837
+ snapshot.text = await readFile(await file.filePath, encoding);
838
  snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
839
 
840
  return snapshot;
src/utils/encoding.ts ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { createReadStream } from 'fs';
2
+ import { Readable } from 'stream';
3
+ import { TextDecoderStream } from 'stream/web';
4
+
5
+ export async function decodeFileStream(
6
+ fileStream: Readable,
7
+ encoding: string = 'utf-8',
8
+ ): Promise<string> {
9
+ const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });
10
+ Readable.toWeb(fileStream).pipeThrough(decodeStream);
11
+ const chunks = [];
12
+
13
+ for await (const chunk of decodeStream.readable) {
14
+ chunks.push(chunk);
15
+ }
16
+
17
+ return chunks.join('');
18
+ }
19
+
20
+
21
+ export async function readFile(
22
+ filePath: string,
23
+ encoding: string = 'utf-8',
24
+ ): Promise<string> {
25
+ const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });
26
+ Readable.toWeb(createReadStream(filePath)).pipeThrough(decodeStream);
27
+ const chunks = [];
28
+
29
+ for await (const chunk of decodeStream.readable) {
30
+ chunks.push(chunk);
31
+ }
32
+
33
+ return chunks.join('');
34
+ }