# Minimal interactive chat loop around a LLaMA-style causal language model.
import os

import torch
import transformers
hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
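# os.getenv returns None when the variable is unset; downloading gated or
# private checkpoints is then likely to fail with an authentication error.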
|
|
# Populated by load_model() below.
model = None
tokenizer = None
generator = None
|
|
def load_model(model_name, eight_bit=0, device_map="auto"):
    global model, tokenizer, generator

    print("Loading " + model_name + "...")

    # "zero" is shorthand for accelerate's balanced_low_0 placement strategy,
    # which keeps most of the model off GPU 0.
    if device_map == "zero":
        device_map = "balanced_low_0"

    gpu_count = torch.cuda.device_count()
    print("gpu_count", gpu_count)
|
|
    tokenizer = transformers.LlamaTokenizer.from_pretrained(model_name, token=hf_token)
    model = transformers.LlamaForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        load_in_8bit=bool(eight_bit),  # honor the eight_bit flag instead of hard-coding False
        device_map=device_map,  # let accelerate place the weights; replaces the bare .cuda() call
        cache_dir="cache",
        token=hf_token,
    )

    generator = model.generate
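# Illustrative alternative calls (model name is a placeholder):
#   load_model(name, eight_bit=1)         # 8-bit weights, requires bitsandbytes
#   load_model(name, device_map="zero")   # accelerate's balanced_low_0 placement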
|
|
load_model("Muhammadidrees/JayConverstionalModel")
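# The checkpoint is fetched and loaded at import time; for a gated or private
# repo this call depends on the HUGGINGFACE_HUB_TOKEN read above.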
|
|
# Running transcript of the conversation, one "Speaker: text" entry per turn.
history = []
|
|
def go():
    invitation = "Assistant: "
    human_invitation = "Human: "

    msg = input(human_invitation)
    print("")

    history.append(human_invitation + msg)

    fulltext = "\n\n".join(history) + "\n\n" + invitation
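    # After two exchanges, fulltext looks like (illustrative):
    #
    #   Human: hi
    #
    #   Assistant: hello there
    #
    #   Human: how are you?
    #
    #   Assistant: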
|
|
    gen_in = tokenizer(fulltext, return_tensors="pt").input_ids.to(model.device)
    in_tokens = gen_in.shape[-1]  # prompt length in tokens; len(gen_in) would give the batch size
    with torch.no_grad():
        generated_ids = generator(
            gen_in,
            max_new_tokens=200,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id,
            num_return_sequences=1,
            do_sample=True,
            repetition_penalty=1.1,
            temperature=0.5,
            top_k=50,
            top_p=1.0,
        )
    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
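    # generate() returns prompt + completion for decoder-only models, so the
    # prompt prefix is sliced off below to isolate the newly generated text.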
|
|
    text_without_prompt = generated_text[len(fulltext):]

    response = text_without_prompt

    # Truncate where the model starts writing the next "Human: " turn itself.
    response = response.split(human_invitation)[0]

    # str.strip() returns a new string, so the result must be reassigned.
    response = response.strip()

    print(invitation + response)
    print("")

    history.append(invitation + response)


while True:
    go()
|
|