| import { logger } from '@librechat/data-schemas'; |
| import type { Tiktoken } from 'tiktoken'; |
| import Tokenizer from './tokenizer'; |
|
|
| jest.mock('@librechat/data-schemas', () => ({ |
| logger: { |
| error: jest.fn(), |
| }, |
| })); |
|
|
| describe('Tokenizer', () => { |
| it('should be a singleton (same instance)', async () => { |
| const AnotherTokenizer = await import('./tokenizer'); |
| expect(Tokenizer).toBe(AnotherTokenizer.default); |
| }); |
|
|
| describe('getTokenizer', () => { |
| it('should create an encoder for an explicit model name (e.g., "gpt-4")', () => { |
| |
| |
| const tokenizer = Tokenizer.getTokenizer('gpt-4', true); |
|
|
| |
| expect(tokenizer).toBeDefined(); |
| |
| |
| }); |
|
|
| it('should create an encoder for a known encoding (e.g., "cl100k_base")', () => { |
| |
| |
| const tokenizer = Tokenizer.getTokenizer('cl100k_base', false); |
|
|
| expect(tokenizer).toBeDefined(); |
| |
| }); |
|
|
| it('should return cached tokenizer if previously fetched', () => { |
| const tokenizer1 = Tokenizer.getTokenizer('cl100k_base', false); |
| const tokenizer2 = Tokenizer.getTokenizer('cl100k_base', false); |
| |
| expect(tokenizer1).toBe(tokenizer2); |
| }); |
| }); |
|
|
| describe('freeAndResetAllEncoders', () => { |
| beforeEach(() => { |
| jest.clearAllMocks(); |
| }); |
|
|
| it('should free all encoders and reset tokenizerCallsCount to 1', () => { |
| |
| Tokenizer.getTokenizer('cl100k_base', false); |
| Tokenizer.getTokenizer('r50k_base', false); |
|
|
| |
| Tokenizer.freeAndResetAllEncoders(); |
|
|
| |
| expect(Tokenizer.tokenizersCache['cl100k_base']).toBeUndefined(); |
| expect(Tokenizer.tokenizersCache['r50k_base']).toBeUndefined(); |
|
|
| |
| expect(Tokenizer.tokenizerCallsCount).toBe(1); |
| }); |
|
|
| it('should catch and log errors if freeing fails', () => { |
| |
| const mockLoggerError = jest.spyOn(logger, 'error'); |
|
|
| |
| Tokenizer.tokenizersCache['cl100k_base'] = { |
| free() { |
| throw new Error('Intentional free error'); |
| }, |
| } as unknown as Tiktoken; |
|
|
| |
| Tokenizer.freeAndResetAllEncoders(); |
|
|
| |
| expect(mockLoggerError).toHaveBeenCalledWith( |
| '[Tokenizer] Free and reset encoders error', |
| expect.any(Error), |
| ); |
|
|
| |
| mockLoggerError.mockRestore(); |
| Tokenizer.tokenizersCache = {}; |
| }); |
| }); |
|
|
| describe('getTokenCount', () => { |
| beforeEach(() => { |
| jest.clearAllMocks(); |
| Tokenizer.freeAndResetAllEncoders(); |
| }); |
|
|
| it('should return the number of tokens in the given text', () => { |
| const text = 'Hello, world!'; |
| const count = Tokenizer.getTokenCount(text, 'cl100k_base'); |
| expect(count).toBeGreaterThan(0); |
| }); |
|
|
| it('should reset encoders if an error is thrown', () => { |
| |
| const tokenizer = Tokenizer.getTokenizer('cl100k_base', false); |
| const originalEncode = tokenizer.encode; |
| tokenizer.encode = () => { |
| throw new Error('Forced error'); |
| }; |
|
|
| |
| const count = Tokenizer.getTokenCount('Hello again', 'cl100k_base'); |
| expect(count).toBeGreaterThan(0); |
|
|
| |
| tokenizer.encode = originalEncode; |
| }); |
|
|
| it('should reset tokenizers after 25 calls', () => { |
| |
| const resetSpy = jest.spyOn(Tokenizer, 'freeAndResetAllEncoders'); |
|
|
| |
| for (let i = 0; i < 24; i++) { |
| Tokenizer.getTokenCount('test text', 'cl100k_base'); |
| } |
| expect(resetSpy).not.toHaveBeenCalled(); |
|
|
| |
| Tokenizer.getTokenCount('the 25th call!', 'cl100k_base'); |
| expect(resetSpy).toHaveBeenCalledTimes(1); |
| }); |
| }); |
| }); |
|
|