Spaces:
Build error
Build error
fix: encoding of from-file snapshots
Browse files- src/services/snapshot-formatter.ts +5 -4
- src/utils/encoding.ts +34 -0
src/services/snapshot-formatter.ts
CHANGED
|
@@ -14,7 +14,7 @@ import { cleanAttribute } from '../utils/misc';
|
|
| 14 |
import _ from 'lodash';
|
| 15 |
import { STATUS_CODES } from 'http';
|
| 16 |
import type { CrawlerOptions } from '../dto/crawler-options';
|
| 17 |
-
import { readFile } from 'fs/promises';
|
| 18 |
import { pathToFileURL } from 'url';
|
| 19 |
import { countGPTToken } from '../shared/utils/openai';
|
| 20 |
|
|
@@ -804,7 +804,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 804 |
overrideContentType = undefined;
|
| 805 |
}
|
| 806 |
|
| 807 |
-
const contentType = (overrideContentType || await file.mimeType).toLowerCase();
|
| 808 |
const fileName = overrideFileName || `${url.origin}${url.pathname}`;
|
| 809 |
const snapshot: PageSnapshot = {
|
| 810 |
title: '',
|
|
@@ -821,11 +821,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 821 |
return snapshot;
|
| 822 |
}
|
| 823 |
try {
|
|
|
|
| 824 |
if (contentType.startsWith('text/html')) {
|
| 825 |
if ((await file.size) > 1024 * 1024 * 32) {
|
| 826 |
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
|
| 827 |
}
|
| 828 |
-
snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' });
|
| 829 |
|
| 830 |
return snapshot;
|
| 831 |
}
|
|
@@ -833,7 +834,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 833 |
if ((await file.size) > 1024 * 1024 * 32) {
|
| 834 |
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
|
| 835 |
}
|
| 836 |
-
snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' });
|
| 837 |
snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
|
| 838 |
|
| 839 |
return snapshot;
|
|
|
|
| 14 |
import _ from 'lodash';
|
| 15 |
import { STATUS_CODES } from 'http';
|
| 16 |
import type { CrawlerOptions } from '../dto/crawler-options';
|
| 17 |
+
import { readFile } from '../utils/encoding';
|
| 18 |
import { pathToFileURL } from 'url';
|
| 19 |
import { countGPTToken } from '../shared/utils/openai';
|
| 20 |
|
|
|
|
| 804 |
overrideContentType = undefined;
|
| 805 |
}
|
| 806 |
|
| 807 |
+
const contentType: string = (overrideContentType || await file.mimeType).toLowerCase();
|
| 808 |
const fileName = overrideFileName || `${url.origin}${url.pathname}`;
|
| 809 |
const snapshot: PageSnapshot = {
|
| 810 |
title: '',
|
|
|
|
| 821 |
return snapshot;
|
| 822 |
}
|
| 823 |
try {
|
| 824 |
+
const encoding: string | undefined = contentType.includes('charset=') ? contentType.split('charset=')[1]?.trim().toLowerCase() : 'utf-8';
|
| 825 |
if (contentType.startsWith('text/html')) {
|
| 826 |
if ((await file.size) > 1024 * 1024 * 32) {
|
| 827 |
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
|
| 828 |
}
|
| 829 |
+
snapshot.html = await readFile(await file.filePath, encoding);
|
| 830 |
|
| 831 |
return snapshot;
|
| 832 |
}
|
|
|
|
| 834 |
if ((await file.size) > 1024 * 1024 * 32) {
|
| 835 |
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
|
| 836 |
}
|
| 837 |
+
snapshot.text = await readFile(await file.filePath, encoding);
|
| 838 |
snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
|
| 839 |
|
| 840 |
return snapshot;
|
src/utils/encoding.ts
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { createReadStream } from 'fs';
|
| 2 |
+
import { Readable } from 'stream';
|
| 3 |
+
import { TextDecoderStream } from 'stream/web';
|
| 4 |
+
|
| 5 |
+
export async function decodeFileStream(
|
| 6 |
+
fileStream: Readable,
|
| 7 |
+
encoding: string = 'utf-8',
|
| 8 |
+
): Promise<string> {
|
| 9 |
+
const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });
|
| 10 |
+
Readable.toWeb(fileStream).pipeThrough(decodeStream);
|
| 11 |
+
const chunks = [];
|
| 12 |
+
|
| 13 |
+
for await (const chunk of decodeStream.readable) {
|
| 14 |
+
chunks.push(chunk);
|
| 15 |
+
}
|
| 16 |
+
|
| 17 |
+
return chunks.join('');
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
export async function readFile(
|
| 22 |
+
filePath: string,
|
| 23 |
+
encoding: string = 'utf-8',
|
| 24 |
+
): Promise<string> {
|
| 25 |
+
const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });
|
| 26 |
+
Readable.toWeb(createReadStream(filePath)).pipeThrough(decodeStream);
|
| 27 |
+
const chunks = [];
|
| 28 |
+
|
| 29 |
+
for await (const chunk of decodeStream.readable) {
|
| 30 |
+
chunks.push(chunk);
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
return chunks.join('');
|
| 34 |
+
}
|