# Minimal interactive chat loop around a LLaMA-style causal language model.
import os

import torch
import transformers
hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
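# os.getenv returns None when the variable is unset; downloading gated or
# private checkpoints is then likely to fail with an authentication error.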
|
|
# Populated by load_model() below.
model = None
tokenizer = None
generator = None
|
|
def load_model(model_name, eight_bit=0, device_map="auto"):
    global model, tokenizer, generator

    print("Loading " + model_name + "...")

    # "zero" is shorthand for accelerate's balanced_low_0 placement strategy,
    # which keeps most of the model off GPU 0.
    if device_map == "zero":
        device_map = "balanced_low_0"

    gpu_count = torch.cuda.device_count()
    print("gpu_count", gpu_count)
|
|
    tokenizer = transformers.LlamaTokenizer.from_pretrained(model_name, token=hf_token)
    model = transformers.LlamaForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        load_in_8bit=bool(eight_bit),  # honor the eight_bit flag instead of hard-coding False
        device_map=device_map,  # let accelerate place the weights; replaces the bare .cuda() call
        cache_dir="cache",
        token=hf_token,
    )

    generator = model.generate
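# Illustrative alternative calls (model name is a placeholder):
#   load_model(name, eight_bit=1)         # 8-bit weights, requires bitsandbytes
#   load_model(name, device_map="zero")   # accelerate's balanced_low_0 placement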
|
|
load_model("Muhammadidrees/JayConverstionalModel")
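# The checkpoint is fetched and loaded at import time; for a gated or private
# repo this call depends on the HUGGINGFACE_HUB_TOKEN read above.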
|
|
# Running transcript of the conversation, one "Speaker: text" entry per turn.
history = []
|
|
def go():
    invitation = "Assistant: "
    human_invitation = "Human: "

    msg = input(human_invitation)
    print("")

    history.append(human_invitation + msg)

    fulltext = "\n\n".join(history) + "\n\n" + invitation
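    # After two exchanges, fulltext looks like (illustrative):
    #
    #   Human: hi
    #
    #   Assistant: hello there
    #
    #   Human: how are you?
    #
    #   Assistant: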
|
|
    gen_in = tokenizer(fulltext, return_tensors="pt").input_ids.to(model.device)
    in_tokens = gen_in.shape[-1]  # prompt length in tokens; len(gen_in) would give the batch size
    with torch.no_grad():
        generated_ids = generator(
            gen_in,
            max_new_tokens=200,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id,
            num_return_sequences=1,
            do_sample=True,
            repetition_penalty=1.1,
            temperature=0.5,
            top_k=50,
            top_p=1.0,
        )
    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
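    # generate() returns prompt + completion for decoder-only models, so the
    # prompt prefix is sliced off below to isolate the newly generated text.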
|
|
    text_without_prompt = generated_text[len(fulltext):]

    response = text_without_prompt

    # Truncate where the model starts writing the next "Human: " turn itself.
    response = response.split(human_invitation)[0]

    # str.strip() returns a new string, so the result must be reassigned.
    response = response.strip()

    print(invitation + response)
    print("")

    history.append(invitation + response)


while True:
    go()
|
|