| from transformers import AutoTokenizer |
| from transformers import AutoModelForCausalLM |
| import json |
| |
| |
|
|
|
|
|
|
|
|
def gen_special_tokens_json(num_bins=8, path='./special_tokens.json'):
    """Generate a JSON file mapping bin indices to "remaining budget" marker tokens.

    Produces ``num_bins - 1`` entries: key ``"i"`` (a string, i = 0..num_bins-2)
    maps to the literal token text ``"\\n<remaining>{i+1}/{num_bins}</remaining>\\n"``.
    The final bin (``num_bins/num_bins``) is intentionally not emitted, matching
    the original 7-of-8 behavior.

    Args:
        num_bins: Total number of budget bins (default 8, the original hard-coded value).
        path: Output file path (default ``./special_tokens.json``, as before).
    """
    # Dict comprehension replaces the manual loop-and-assign; keys stay strings
    # because the file is consumed via json.load, which only has string keys.
    special_tokens_list = {
        f"{i}": f"\n<remaining>{i + 1}/{num_bins}</remaining>\n"
        for i in range(num_bins - 1)
    }
    print(special_tokens_list)

    with open(path, 'w') as f:
        json.dump(special_tokens_list, f)
    print('special_tokens.json has been generated.')
|
|
| if __name__ == "__main__": |
| |
| ori_model_path = '/path/to/your/ori/model' |
| new_model_path = '/path/to/your/new/model' |
| |
| model = AutoModelForCausalLM.from_pretrained(ori_model_path) |
| tokenizer = AutoTokenizer.from_pretrained(ori_model_path) |
| print(model.get_input_embeddings()) |
| print(model.lm_head) |
| print(len(tokenizer)) |
|
|
| gen_special_tokens_json() |
| with open('./special_tokens.json') as f: |
| special_tokens = json.load(f) |
| |
| bins_tokens = [ |
| special_tokens[f"{i}"] for i in range(7) |
| ] |
|
|
| tokenizer.add_special_tokens({'additional_special_tokens': bins_tokens}) |
| model.resize_token_embeddings(len(tokenizer)) |
| print('Vocab size after adding special tokens:', len(tokenizer)) |
|
|
|
|
|
|
| tokenizer.save_pretrained(new_model_path) |
| model.save_pretrained(new_model_path) |
| model = AutoModelForCausalLM.from_pretrained(new_model_path) |
| tokenizer = AutoTokenizer.from_pretrained(new_model_path) |
| print(model.get_input_embeddings()) |
| print(model.lm_head) |
| print(len(tokenizer)) |
|
|