| import { logger } from '@librechat/data-schemas'; |
| import type { Tiktoken } from 'tiktoken'; |
| import Tokenizer from './tokenizer'; |
|
|
| jest.mock('@librechat/data-schemas', () => ({ |
| logger: { |
| error: jest.fn(), |
| }, |
| })); |
|
|
| describe('Tokenizer', () => { |
| it('should be a singleton (same instance)', async () => { |
| const AnotherTokenizer = await import('./tokenizer'); |
| expect(Tokenizer).toBe(AnotherTokenizer.default); |
| }); |
|
|
| describe('getTokenizer', () => { |
| it('should create an encoder for an explicit model name (e.g., "gpt-4")', () => { |
| |
| |
| const tokenizer = Tokenizer.getTokenizer('gpt-4', true); |
|
|
| |
| expect(tokenizer).toBeDefined(); |
| |
| |
| }); |
|
|
| it('should create an encoder for a known encoding (e.g., "cl100k_base")', () => { |
| |
| |
| const tokenizer = Tokenizer.getTokenizer('cl100k_base', false); |
|
|
| expect(tokenizer).toBeDefined(); |
| |
| }); |
|
|
| it('should return cached tokenizer if previously fetched', () => { |
| const tokenizer1 = Tokenizer.getTokenizer('cl100k_base', false); |
| const tokenizer2 = Tokenizer.getTokenizer('cl100k_base', false); |
| |
| expect(tokenizer1).toBe(tokenizer2); |
| }); |
| }); |
|
|
| describe('freeAndResetAllEncoders', () => { |
| beforeEach(() => { |
| jest.clearAllMocks(); |
| }); |
|
|
| it('should free all encoders and reset tokenizerCallsCount to 1', () => { |
| |
| Tokenizer.getTokenizer('cl100k_base', false); |
| Tokenizer.getTokenizer('r50k_base', false); |
|
|
| |
| Tokenizer.freeAndResetAllEncoders(); |
|
|
| |
| expect(Tokenizer.tokenizersCache['cl100k_base']).toBeUndefined(); |
| expect(Tokenizer.tokenizersCache['r50k_base']).toBeUndefined(); |
|
|
| |
| expect(Tokenizer.tokenizerCallsCount).toBe(1); |
| }); |
|
|
| it('should catch and log errors if freeing fails', () => { |
| |
| const mockLoggerError = jest.spyOn(logger, 'error'); |
|
|
| |
| Tokenizer.tokenizersCache['cl100k_base'] = { |
| free() { |
| throw new Error('Intentional free error'); |
| }, |
| } as unknown as Tiktoken; |
|
|
| |
| Tokenizer.freeAndResetAllEncoders(); |
|
|
| |
| expect(mockLoggerError).toHaveBeenCalledWith( |
| '[Tokenizer] Free and reset encoders error', |
| expect.any(Error), |
| ); |
|
|
| |
| mockLoggerError.mockRestore(); |
| Tokenizer.tokenizersCache = {}; |
| }); |
| }); |
|
|
| describe('getTokenCount', () => { |
| beforeEach(() => { |
| jest.clearAllMocks(); |
| Tokenizer.freeAndResetAllEncoders(); |
| }); |
|
|
| it('should return the number of tokens in the given text', () => { |
| const text = 'Hello, world!'; |
| const count = Tokenizer.getTokenCount(text, 'cl100k_base'); |
| expect(count).toBeGreaterThan(0); |
| }); |
|
|
| it('should reset encoders if an error is thrown', () => { |
| |
| const tokenizer = Tokenizer.getTokenizer('cl100k_base', false); |
| const originalEncode = tokenizer.encode; |
| tokenizer.encode = () => { |
| throw new Error('Forced error'); |
| }; |
|
|
| |
| const count = Tokenizer.getTokenCount('Hello again', 'cl100k_base'); |
| expect(count).toBeGreaterThan(0); |
|
|
| |
| tokenizer.encode = originalEncode; |
| }); |
|
|
| it('should reset tokenizers after 25 calls', () => { |
| |
| const resetSpy = jest.spyOn(Tokenizer, 'freeAndResetAllEncoders'); |
|
|
| |
| for (let i = 0; i < 24; i++) { |
| Tokenizer.getTokenCount('test text', 'cl100k_base'); |
| } |
| expect(resetSpy).not.toHaveBeenCalled(); |
|
|
| |
| Tokenizer.getTokenCount('the 25th call!', 'cl100k_base'); |
| expect(resetSpy).toHaveBeenCalledTimes(1); |
| }); |
| }); |
| }); |
|
|