
Deployment:
```yaml
build_commands: []
external_package_dirs: []
model_metadata: {}
model_name: fp8-baseten/example-Meta-Llama-3-70B-InstructForSequenceClassification
python_version: py39
requirements: []
resources:
  accelerator: H100:1
  cpu: "1"
  memory: 64Gi
  use_gpu: true
secrets:
  hf_access_token: set token in baseten workspace
system_packages: []
trt_llm:
  build:
    base_model: encoder
    # automatically inferred from config[max_position_embeddings]
    max_seq_len: 42
    # maximum requests per dynamic batch; recommended to stay at 32
    max_batch_size: 32
    # maximum number of tokens per dynamic batch; strongly recommended to keep this value
    max_num_tokens: 16384
    checkpoint_repository:
      source: HF
      repo: "baseten/example-Meta-Llama-3-70B-InstructForSequenceClassification"
      revision: "main" # hf revision hash
    # `fp8` or `no_quant` (=fp16) are allowed.
    quantization_type: fp8
    num_builder_gpus: 4
```
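`max_seq_len` is inferred from the checkpoint's `max_position_embeddings`, and over-long requests will typically fail unless `truncate: true` is sent with the request. As a rough pre-flight check you can count the tokens of a formatted conversation on the client side. The snippet below is only a sketch: the 8192-token limit and the use of the checkpoint repository's tokenizer are assumptions.

```python
# Sketch: check whether a formatted conversation fits into max_seq_len.
# Assumes the checkpoint repo is accessible with your HF token and that the
# effective limit is Llama-3's 8192 positions (the server infers the real value).
from transformers import AutoTokenizer

MAX_SEQ_LEN = 8192  # assumption; taken from Llama-3's max_position_embeddings

tokenizer = AutoTokenizer.from_pretrained(
    "baseten/example-Meta-Llama-3-70B-InstructForSequenceClassification"
)

conv = [
    {"role": "user", "content": "What is Deep Learning?"},
    {"role": "assistant", "content": "Deep learning is ..."},
]
formatted = tokenizer.apply_chat_template(conv, tokenize=False)
n_tokens = len(tokenizer(formatted)["input_ids"])

if n_tokens > MAX_SEQ_LEN:
    print(f"{n_tokens} tokens exceed max_seq_len={MAX_SEQ_LEN}; send truncate=true or shorten the input.")
else:
    print(f"{n_tokens} tokens fit within max_seq_len={MAX_SEQ_LEN}.")
```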

Usage:
```python
import requests
import os
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Skywork/Skywork-Reward-Llama-3.1-8B-v0.2")

prompt = "Jane has 12 apples. She gives 4 apples to her friend Mark, then buys 1 more apple, and finally splits all her apples equally among herself and her 2 siblings. How many apples does each person get?"
# Positive example: gets a high score of ~0.999, i.e. a raw (pre-sigmoid) score around inv_sigmoid(0.999) ≈ 13
response1 = "1. Jane starts with 12 apples and gives 4 to Mark. 12 - 4 = 8. Jane now has 8 apples.\n2. Jane buys 1 more apple. 8 + 1 = 9. Jane now has 9 apples.\n3. Jane splits the 9 apples equally among herself and her 2 siblings (3 people in total). 9 ÷ 3 = 3 apples each. Each person gets 3 apples."
# Negative example: gets a low score of ~0.001, i.e. a raw score around inv_sigmoid(0.001) ≈ -9
response2 = "1. Jane starts with 12 apples and gives 4 to Mark. 12 - 4 = 8. Jane now has 8 apples.\n2. Jane buys 1 more apple. 8 + 1 = 9. Jane now has 9 apples.\n3. Jane splits the 9 apples equally among her 2 siblings (2 people in total). 9 ÷ 2 = 4.5 apples each. Each person gets 4 apples."

# predict api: {
#   "inputs": "What is Deep Learning?", # str, may be formatted with chat template.
#   "raw_scores": false, # with or without sigmoid activation
#   "truncate": false,
#   "truncation_direction": "right"
# }

for assistant_response in [response1, response2]:
    # Feel free to parallelize this; requests are batched in the backend (see the sketch after this example).
    
    conv = [{"role": "user", "content": prompt}, {"role": "assistant", "content": assistant_response}]
    conv_formatted = tokenizer.apply_chat_template(conv, tokenize=False)
    input_json = dict(inputs=conv_formatted, raw_scores=True)
    resp = requests.post(
        "https://model-xxxxxx.api.baseten.co/environments/production/sync/predict",
        headers={"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"},
        json=input_json,
    )

    print(resp.json())
    # prints
    # [{'score': 13.714337, 'label': 'LABEL_0'}]
    # [{'score': -9.353895, 'label': 'LABEL_0'}]
```
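Because the server merges concurrent requests into dynamic batches (up to `max_batch_size`), the loop above can be parallelized from the client. The sketch below does that with a thread pool and also converts raw scores back to probabilities with a sigmoid; it reuses `prompt`, `response1`, and `response2` from the example above, and the `score_one` helper and worker count are illustrative choices, not part of the API.

```python
# Sketch: parallel scoring plus sigmoid conversion of raw scores.
# `score_one` and the worker count are illustrative, not part of the API.
import math
import os
from concurrent.futures import ThreadPoolExecutor

import requests
from transformers import AutoTokenizer

URL = "https://model-xxxxxx.api.baseten.co/environments/production/sync/predict"
HEADERS = {"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"}
tokenizer = AutoTokenizer.from_pretrained("Skywork/Skywork-Reward-Llama-3.1-8B-v0.2")


def sigmoid(x: float) -> float:
    return 1.0 / (1.0 + math.exp(-x))


def score_one(conv: list) -> dict:
    formatted = tokenizer.apply_chat_template(conv, tokenize=False)
    resp = requests.post(URL, headers=HEADERS, json={"inputs": formatted, "raw_scores": True})
    resp.raise_for_status()
    raw = resp.json()[0]["score"]
    return {"raw": raw, "probability": sigmoid(raw)}


conversations = [
    [{"role": "user", "content": prompt}, {"role": "assistant", "content": response1}],
    [{"role": "user", "content": prompt}, {"role": "assistant", "content": response2}],
]

# Concurrent requests are merged into dynamic batches on the server side.
with ThreadPoolExecutor(max_workers=8) as pool:
    for result in pool.map(score_one, conversations):
        print(result)  # e.g. {'raw': 13.71..., 'probability': 0.999...}
```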

Reproduce this model:
```python
#!/usr/bin/env python
import torch
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    LlamaForSequenceClassification,
)
# install torch, transformers, accelerate

def main():
    # Define the input and output repository names.
    input_model_id = "meta-llama/Meta-Llama-3-70B-Instruct"
    split_2 = input_model_id.split("/")[1]
    output_model_id = f"baseten/example-{split_2}ForSequenceClassification"  

    # Load the original configuration.
    # (If needed, add trust_remote_code=True for custom implementations.)
    config = AutoConfig.from_pretrained(input_model_id)
    
    # Update the config for a sequence classification task (here: 30 labels).
    num_labels = 30
    config.num_labels = num_labels
    config.id2label = {i: f"token activation {i}" for i in range(num_labels)}
    config.label2id = {f"token activation {i}": i for i in range(num_labels)}

    # Download the tokenizer from the original model.
    tokenizer = AutoTokenizer.from_pretrained(input_model_id)

    # Load the original causal LM model (we only need its lm_head weights).
    lm_model = AutoModelForCausalLM.from_pretrained(input_model_id, config=config, device_map="auto", low_cpu_mem_usage=True)
    config.architectures = ["LlamaForSequenceClassification"]
    # Drop the transformer backbone to free memory; only lm_head is needed below.
    del lm_model.model
    print("loaded lm model")
    # Initialize the sequence classification model.
    # NOTE: We are using the built-in LlamaForSequenceClassification,
    # which uses a `.score` attribute as the output head.
    seq_cls_model = LlamaForSequenceClassification.from_pretrained(input_model_id, config=config, device_map="auto", low_cpu_mem_usage=True)

    # --- Initialize the Classification Head ---
    # Re-use the first `num_labels` rows of the original LM head
    # (i.e. rows 0 to num_labels - 1) to initialize the new classification head.
    with torch.no_grad():
        # lm_model.lm_head.weight has shape [vocab_size, hidden_size];
        # we take the first `num_labels` rows to form a [num_labels, hidden_size] weight matrix.
        seq_cls_model.score.weight.copy_(lm_model.lm_head.weight.data[:num_labels, :])
        if lm_model.lm_head.bias is not None:
            seq_cls_model.score.bias.copy_(lm_model.lm_head.bias.data[:num_labels])

    # Optionally, save the new model locally.
    # save_directory = f"./{output_model_id.replace('/','_')}"
    # seq_cls_model.save_pretrained(save_directory)
    # tokenizer.save_pretrained(save_directory)

    # Push the new model and tokenizer to the Hub.
    # (Ensure you are authenticated with Hugging Face Hub via `huggingface-cli login`.)
    tokenizer.push_to_hub(output_model_id)
    seq_cls_model.push_to_hub(output_model_id)
    

    print(f"New model pushed to the Hub: {output_model_id}")

if __name__ == "__main__":
    main()
```
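Before deploying, you can sanity-check the converted checkpoint by loading it back as a sequence-classification model and running a single forward pass; the logits should have shape `[1, num_labels]`. This is only a sketch: it loads the full 70B weights (several GPUs' worth of memory), and the conversation content is arbitrary.

```python
# Sketch: verify the converted checkpoint locally.
# Loading the full 70B model needs several GPUs; swap in a smaller
# Llama checkpoint for a quick smoke test if needed.
import torch
from transformers import AutoTokenizer, LlamaForSequenceClassification

model_id = "baseten/example-Meta-Llama-3-70B-InstructForSequenceClassification"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = LlamaForSequenceClassification.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16
)
model.eval()

conv = [
    {"role": "user", "content": "What is 2 + 2?"},
    {"role": "assistant", "content": "2 + 2 = 4."},
]
inputs = tokenizer(
    tokenizer.apply_chat_template(conv, tokenize=False), return_tensors="pt"
).to(model.device)

with torch.no_grad():
    logits = model(**inputs).logits  # shape: [1, num_labels]

print(logits.shape)
print(logits[0, :3])  # raw scores for the first few label slots
```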