| import os |
|
|
| import nltk |
| import pytest |
|
|
| from tests.utils import wrap_test_forked |
|
|
|
|
def nltkTokenize(text):
    """Split *text* into word tokens with NLTK's default word tokenizer."""
    return nltk.word_tokenize(text)
|
|
|
|
import re


# Precompiled pattern matching maximal runs of word characters ([A-Za-z0-9_]
# plus Unicode word chars); compiled once at import time for reuse in loops.
WORD = re.compile(r'\w+')
|
|
|
|
def regTokenize(text):
    """Return all word-character runs in *text* as a list of tokens.

    Equivalent to matching the module-level ``\\w+`` pattern; ``re`` caches
    compiled patterns, so the inline form has the same behavior.
    """
    return re.findall(r'\w+', text)
|
|
|
|
| import time |
|
|
|
|
@pytest.mark.skipif(not os.getenv('MEASURE'),
                    reason="For checking token length for various methods: MEASURE=1 pytest -s -v tests/test_tokenizer.py")
@wrap_test_forked
def test_tokenizer1():
    # Benchmark driver: feeds a representative LLM-style prompt (code + prose)
    # to run_tokenizer1, which prints token counts and timings per tokenizer.
    # Opt-in via MEASURE=1 because it downloads models and prints to stdout.
    prompt = """Here is an example of how to write a Python program to generate the Fibonacci sequence:




def fib(n):
    a, b = 0, 1
    if n == 0 or n == 1:
        return a
    for i in range(n-2):
        a, b = b, a+b
    return b

for i in range(10):
    print(fib(i))
This program defines a function called fib that takes an integer n as input and returns the nth Fibonacci number. The function uses two variables a and b to keep track of the current and previous Fibonacci numbers.

The first two lines of the function check if n is either 0 or 1, in which case the function returns 0 or 1 respectively. If n is greater than 1, the function iterates over the range of integers from 2 to n-1, adding the previous two Fibonacci numbers to get the current Fibonacci number. Finally, the function returns the last Fibonacci number calculated.

In the main part of the program, we use a for loop to call the fib function with different"""

    # PROMPT env var lets the user benchmark their own text instead.
    prompt = os.getenv('PROMPT', prompt)
    run_tokenizer1(prompt)
|
|
|
|
def run_tokenizer1(prompt):
    """Print token count and elapsed time for *prompt* under several tokenizers.

    Loads two HuggingFace tokenizers (distilgpt2 slow-style, h2ogpt fast) and
    an Instructor embedding model, then times a regex tokenizer, NLTK, both
    HF tokenizers, and the embedding model's tokenizer on the same prompt.
    Model loading happens up front so only tokenization itself is timed.
    """
    from transformers import AutoTokenizer

    slow_tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
    llm_tokenizer = AutoTokenizer.from_pretrained('h2oai/h2ogpt-oig-oasst1-512-6_9b')

    from InstructorEmbedding import INSTRUCTOR
    emb = INSTRUCTOR('hkunlp/instructor-large')

    def report(label, count_tokens):
        # Time one tokenization pass and print "<label> <count> <seconds>".
        start = time.time()
        count = count_tokens()
        print(label, count, time.time() - start)

    report("Regexp Tokenizer", lambda: len(regTokenize(prompt)))
    report("NLTK Tokenizer", lambda: len(nltkTokenize(prompt)))
    report("Slow Tokenizer", lambda: len(slow_tokenizer(prompt)['input_ids']))
    report("Fast Tokenizer LLM", lambda: len(llm_tokenizer(prompt)['input_ids']))
    report("Instruct Embedding", lambda: emb.tokenize([prompt])['input_ids'].shape[1])
|
|
|
|
@wrap_test_forked
def test_fake_tokenizer():
    """Check FakeTokenizer token counting and rejection of special tokens.

    Verifies fixed token counts for a plain string and for the literal
    '<|endoftext|>' marker, and that encoding the special token directly
    raises ValueError mentioning "disallowed special token".
    """
    from src.utils import FakeTokenizer
    t = FakeTokenizer()
    assert t.num_tokens_from_string('How are you?') == 4
    assert t.num_tokens_from_string('<|endoftext|>') == 7
    # pytest.raises replaces the fragile manual try/raise-RuntimeError/except
    # pattern and also asserts on the error message (match= does a regex search).
    with pytest.raises(ValueError, match="disallowed special token"):
        t.encoding.encode('<|endoftext|>')
|
|
|
|
@wrap_test_forked
def test_tokenizer_base_model1():
    # Loading a model with a different tokenizer_base_model should give a
    # Mistral-architecture model paired with the MistralLite tokenizer while
    # preserving the requested prompt_type.
    from tests.test_langchain_units import get_test_model
    model, tokenizer, base_model, prompt_type = get_test_model(base_model='HuggingFaceH4/zephyr-7b-beta',
                                                               tokenizer_base_model='amazon/MistralLite',
                                                               prompt_type='human_bot')
    assert 'MistralForCausalLM' in str(model)
    assert 'amazon/MistralLite' in str(tokenizer)
    assert prompt_type == 'human_bot'
    print("here")
|
|
|
|
@wrap_test_forked
def test_tokenizer_base_model2():
    # Same tokenizer-override scenario but against a vLLM inference server:
    # the returned "model" is then a client config dict (base_url), not a
    # loaded HF model, while the tokenizer override still applies.
    from tests.test_langchain_units import get_test_model
    model, tokenizer, base_model, prompt_type = get_test_model(base_model='HuggingFaceH4/zephyr-7b-omega',
                                                               tokenizer_base_model='amazon/MistralLite',
                                                               prompt_type='human_bot',
                                                               inference_server="vllm:localhost:8080",
                                                               max_seq_len=4096)
    assert model['base_url'] == 'http://localhost:8080/v1'
    assert 'amazon/MistralLite' in str(tokenizer)
    assert prompt_type == 'human_bot'
    print("here")
|
|
|
|
if __name__ == '__main__':
    # Allow running the tokenizer timing comparison directly as a script
    # (the skipif marker only takes effect under pytest).
    test_tokenizer1()
|
|