YongganFu committed on
Commit
030b010
·
verified ·
1 Parent(s): 7c1ba37

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +57 -3
README.md CHANGED
@@ -62,7 +62,7 @@ history = []
62
  user_input = input("User: ").strip()
63
  history.append({"role": "user", "content": user_input})
64
 
65
- prompt = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
66
  prompt_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device='cuda')
67
  out_ids, nfe = model.ar_generate(inputs.input_ids, max_new_tokens=512)
68
 
@@ -72,7 +72,7 @@ print(f"[Num Function Eval (NFE)={nfe}]")
72
  ```
73
 
74
 
75
- ## Chat with Our Model in Self-Speculation Mode
76
 
77
  ```
78
  from transformers import AutoModel, AutoTokenizer, AutoConfig
@@ -92,7 +92,7 @@ history = []
92
  user_input = input("User: ").strip()
93
  history.append({"role": "user", "content": user_input})
94
 
95
- prompt = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
96
 
97
  inputs = tokenizer(prompt, return_tensors="pt")
98
  inputs = inputs.to("cuda")
@@ -104,3 +104,57 @@ tokenized_out = tokenizer.batch_decode(out_ids[:, inputs.input_ids.shape[1]:], s
104
  print(f"Model: {tokenized_out}")
105
  print(f"[Num Function Eval (NFE)={nfe}]")
106
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  user_input = input("User: ").strip()
63
  history.append({"role": "user", "content": user_input})
64
 
65
+ prompt = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True, enable_thinking=False)
66
  prompt_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device='cuda')
67
  out_ids, nfe = model.ar_generate(inputs.input_ids, max_new_tokens=512)
68
 
 
72
  ```
73
 
74
 
75
+ ## Chat with Our Model in Quadratic Self-Speculation Mode
76
 
77
  ```
78
  from transformers import AutoModel, AutoTokenizer, AutoConfig
 
92
  user_input = input("User: ").strip()
93
  history.append({"role": "user", "content": user_input})
94
 
95
+ prompt = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True, enable_thinking=False)
96
 
97
  inputs = tokenizer(prompt, return_tensors="pt")
98
  inputs = inputs.to("cuda")
 
104
  print(f"Model: {tokenized_out}")
105
  print(f"[Num Function Eval (NFE)={nfe}]")
106
  ```
107
+
108
+ ## Chat with Our Model in Linear Self-Speculation Mode
109
+
110
+ ```
111
+ from transformers import AutoModel, AutoTokenizer
112
+ import torch
113
+
114
+ repo_name = "nvidia/Nemotron-Diffusion-Exp-Ministral-3B-Instruct"
115
+
116
+ tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
117
+ model = AutoModel.from_pretrained(repo_name, trust_remote_code=True)
118
+ model = model.cuda().to(torch.bfloat16)
119
+
120
+ history = []
121
+
122
+ user_input = input("User: ").strip()
123
+ history.append({"role": "user", "content": user_input})
124
+
125
+ prompt = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True, enable_thinking=False)
126
+ prompt_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device='cuda')
127
+ out_ids, nfe = model.linear_spec_generate(prompt_ids, max_new_tokens=512, block_length=32, eos_token_id=tokenizer.eos_token_id)
128
+
129
+ tokenized_out = tokenizer.batch_decode(out_ids[:, prompt_ids.shape[1]:], skip_special_tokens=True)[0]
130
+ print(f"Model: {tokenized_out}")
131
+ print(f"[Num Function Eval (NFE)={nfe}]")
132
+ ```
133
+
134
+
135
+ ## Chat with Our Model in Linear Decoding Mode with Multi-Path Verification
136
+
137
+ ```
138
+ from transformers import AutoModel, AutoTokenizer
139
+ import torch
140
+
141
+ repo_name = "nvidia/Nemotron-Diffusion-Exp-Ministral-3B-Instruct"
142
+
143
+ tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
144
+ model = AutoModel.from_pretrained(repo_name, trust_remote_code=True)
145
+ model = model.cuda().to(torch.bfloat16)
146
+
147
+ history = []
148
+
149
+ user_input = input("User: ").strip()
150
+ history.append({"role": "user", "content": user_input})
151
+
152
+ prompt = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True, enable_thinking=False)
153
+ prompt_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device='cuda')
154
+ out_ids, nfe = model.linear_spec_generate_mp(prompt_ids, max_new_tokens=512, block_length=32, eos_token_id=tokenizer.eos_token_id)
155
+
156
+ tokenized_out = tokenizer.batch_decode(out_ids[:, prompt_ids.shape[1]:], skip_special_tokens=True)[0]
157
+ print(f"Model: {tokenized_out}")
158
+ print(f"[Num Function Eval (NFE)={nfe}]")
159
+ ```
160
+