import os
from transformers import AutoTokenizer

# Silence fork-safety warnings from the Rust tokenizers backend.
os.environ['TOKENIZERS_PARALLELISM'] = "false"
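
# Hugging Face repositories whose tokenizers we run the test sentences through.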
list_repo_hf = ["databricks/dolly-v2-3b",
                "gpt2",
                "uer/gpt2-chinese-cluecorpussmall",
                "EleutherAI/gpt-j-6b",
                "EleutherAI/gpt-neox-20b",
                "EleutherAI/polyglot-ko-1.3b",
                "rinna/japanese-gpt-neox-3.6b",

                "replit/replit-code-v1-3b",
                "bigcode/starcoder",
                "openai/whisper-tiny"]
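
# Map each HF repo to its ggml counterpart's name, which is also the output file stem.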
repo2ggml = {"databricks/dolly-v2-3b" : "dolly-v2",
             "gpt2" : "gpt-2",
             "uer/gpt2-chinese-cluecorpussmall" : "gpt-2-chinese",
             "EleutherAI/gpt-j-6b" : "gpt-j",
             "EleutherAI/gpt-neox-20b" : "gpt-neox",
             "EleutherAI/polyglot-ko-1.3b" : "polyglot-ko",
             "rinna/japanese-gpt-neox-3.6b" : "gpt-neox-japanese",
             "replit/replit-code-v1-3b" : "replit",
             "bigcode/starcoder" : "starcoder",
             "openai/whisper-tiny" : "whisper"}
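
# Primary language of each model, used to select the matching test sentences.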
repo2language = {"databricks/dolly-v2-3b" : "english",
                 "gpt2" : "english",
                 "uer/gpt2-chinese-cluecorpussmall" : "chinese",
                 "EleutherAI/gpt-j-6b" : "english",
                 "EleutherAI/gpt-neox-20b" : "english",
                 "EleutherAI/polyglot-ko-1.3b" : "korean",
                 "rinna/japanese-gpt-neox-3.6b" : "japanese",
                 "replit/replit-code-v1-3b" : "english",
                 "bigcode/starcoder" : "english",
                 "openai/whisper-tiny" : "english"}
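
# Read test-cases.txt, where each line has the form "<language>: <sentence>".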
delimiter = ": "
test_sentences = []
with open("test-cases.txt", "r") as f:
    lines = [l.rstrip() for l in f.readlines()]
    for l in lines:
        if delimiter in l:
            language, sentence = l.split(delimiter, 1)
            test_sentences.append((language.lower(), sentence))
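
# For each repo: load the HF tokenizer, tokenize the sentences in the model's
# language, and write the expected token ids to "<ggml-name>.txt".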
for repo in list_repo_hf:
    target_language = repo2language[repo]

    # trust_remote_code is needed for repos that ship custom tokenizer code (e.g. replit).
    tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)

    # Tokenize every test sentence written in the model's language.
    tokens_hf = []
    for language, sentence in test_sentences:
        if language == target_language:
            tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
            tokens_hf.append((sentence, tokens))

    # One line per sentence: "<sentence> => <comma-separated token ids>".
    save_txt = repo2ggml[repo] + ".txt"
    with open(save_txt, "w") as f:
        f.writelines([sentence + " => " + ",".join(str(t) for t in tokens) + "\n" for sentence, tokens in tokens_hf])