File size: 5,839 Bytes
bdf6773 6b53f65 0e24490 6b53f65 0e24490 6b53f65 0e24490 6b53f65 c26ba8d bb623b4 3b49286 bb623b4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 | ---
{}
---
Deployment:
```yaml
build_commands: []
external_package_dirs: []
model_metadata: {}
model_name: fp8-baseten/example-Meta-Llama-3-70B-InstructForSequenceClassification
python_version: py39
requirements: []
resources:
accelerator: H100:1
cpu: "1"
memory: 64Gi
use_gpu: true
secrets:
hf_access_token: set token in baseten workspace
system_packages: []
trt_llm:
build:
base_model: encoder
# automatically inferred from config["max_position_embeddings"]
max_seq_len: 42
# max_batch_size per dynamic batch, recommended to stay at 32
max_batch_size: 32
# max num tokens per dynamic batch; strongly recommended to keep this value unchanged
max_num_tokens: 16384
checkpoint_repository:
source: HF
repo: "baseten/example-Meta-Llama-3-70B-InstructForSequenceClassification"
revision: "main" # hf revision hash
# `fp8` or `no_quant` (=fp16) are allowed.
quantization_type: fp8
num_builder_gpus: 4
```
Usage:
```python
import os

import requests
from transformers import AutoTokenizer

# NOTE(review): this tokenizer repo differs from the deployed checkpoint
# (baseten/example-Meta-Llama-3-70B-InstructForSequenceClassification) —
# confirm the intended chat template before copying this example.
tokenizer = AutoTokenizer.from_pretrained("Skywork/Skywork-Reward-Llama-3.1-8B-v0.2")

prompt = "Jane has 12 apples. She gives 4 apples to her friend Mark, then buys 1 more apple, and finally splits all her apples equally among herself and her 2 siblings. How many apples does each person get?"
# Correct solution -> high score 0.999, or raw logit around inv_sig(0.999) ~ 13.
good_answer = "1. Jane starts with 12 apples and gives 4 to Mark. 12 - 4 = 8. Jane now has 8 apples.\n2. Jane buys 1 more apple. 8 + 1 = 9. Jane now has 9 apples.\n3. Jane splits the 9 apples equally among herself and her 2 siblings (3 people in total). 9 ÷ 3 = 3 apples each. Each person gets 3 apples."
# Flawed solution -> low score ~0.001, or raw logit around inv_sig(0.001) ~ -9.
bad_answer = "1. Jane starts with 12 apples and gives 4 to Mark. 12 - 4 = 8. Jane now has 8 apples.\n2. Jane buys 1 more apple. 8 + 1 = 9. Jane now has 9 apples.\n3. Jane splits the 9 apples equally among her 2 siblings (2 people in total). 9 ÷ 2 = 4.5 apples each. Each person gets 4 apples."

# Predict API request body: {
#   "inputs": "What is Deep Learning?",  # str, may be formatted with chat template.
#   "raw_scores": false,                 # with or without sigmoid activation
#   "truncate": false,
#   "truncation_direction": "right"
# }
for answer in (good_answer, bad_answer):
    # Requests are dynamically batched in the backend, so these calls
    # can safely be issued in parallel.
    messages = [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": answer},
    ]
    rendered = tokenizer.apply_chat_template(messages, tokenize=False)
    payload = {"inputs": rendered, "raw_scores": True}
    result = requests.post(
        "https://model-xxxxxx.api.baseten.co/environments/production/sync/predict",
        headers={"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"},
        json=payload,
    )
    print(result.json())
# Expected output:
# [{'score': 13.714337, 'label': 'LABEL_0'}]
# [{'score': -9.353895, 'label': 'LABEL_0'}]
```
Reproduce this model:
```python
#!/usr/bin/env python
import torch
from transformers import (
AutoConfig,
AutoTokenizer,
AutoModelForCausalLM,
LlamaForSequenceClassification,
)
# install torch, transformers, accelerate
def main():
    """Build a LlamaForSequenceClassification checkpoint from an instruct LM.

    Loads the original causal LM, copies the first ``num_labels`` rows of its
    LM head into the classification head (``.score``) of a fresh
    LlamaForSequenceClassification, and pushes model + tokenizer to the Hub.
    """
    # Define the input and output repository names.
    input_model_id = "meta-llama/Meta-Llama-3-70B-Instruct"
    # Repo name without the org prefix, e.g. "Meta-Llama-3-70B-Instruct".
    split_2 = input_model_id.split("/")[1]
    output_model_id = f"baseten/example-{split_2}ForSequenceClassification"
    # Load the original configuration.
    # (If needed, add trust_remote_code=True for custom implementations.)
    config = AutoConfig.from_pretrained(input_model_id)
    # Update the config for a sequence classification task with `num_labels` labels.
    num_labels = 30
    config.num_labels = num_labels
    config.id2label = {i: f"token activation {i}" for i in range(num_labels)}
    config.label2id = {f"token activation {i}": i for i in range(num_labels)}
    # Download the tokenizer from the original model.
    tokenizer = AutoTokenizer.from_pretrained(input_model_id)
    # Load the original causal LM model; only its `lm_head` is used below.
    lm_model = AutoModelForCausalLM.from_pretrained(input_model_id, config=config, device_map="auto", low_cpu_mem_usage=True)
    config.architectures = ["LlamaForSequenceClassification"]
    # Drop the transformer body to free memory before the second load;
    # lm_model.lm_head stays alive for the weight copy further down.
    del lm_model.model
    print("loaded lm model")
    # Initialize the sequence classification model.
    # NOTE: We are using the built-in LlamaForSequenceClassification,
    # which uses a `.score` attribute as the output head.
    seq_cls_model = LlamaForSequenceClassification.from_pretrained(input_model_id, config=config, device_map="auto", low_cpu_mem_usage=True)
    # --- Initialize the Classification Head ---
    # Re-use the first `num_labels` rows (rows 0 to num_labels-1) from the
    # original LM head to initialize the new classification head.
    with torch.no_grad():
        # lm_model.lm_head.weight has shape [vocab_size, hidden_size];
        # the slice forms a [num_labels, hidden_size] weight matrix.
        seq_cls_model.score.weight.copy_(lm_model.lm_head.weight.data[:num_labels, :])
        # The guard handles LM heads without a bias term (then nothing is copied).
        if lm_model.lm_head.bias is not None:
            seq_cls_model.score.bias.copy_(lm_model.lm_head.bias.data[:num_labels])
    # Optionally, save the new model locally.
    # save_directory = f"./{output_model_id.replace('/','_')}"
    # seq_cls_model.save_pretrained(save_directory)
    # tokenizer.save_pretrained(save_directory)
    # Push the new model and tokenizer to the Hub.
    # (Ensure you are authenticated with Hugging Face Hub via `huggingface-cli login`.)
    tokenizer.push_to_hub(output_model_id)
    seq_cls_model.push_to_hub(output_model_id)
    print(f"New model pushed to the Hub: {output_model_id}")


if __name__ == "__main__":
    main()
``` |