gpt2 · bigsmall · compression · lossless

wpferrell committed (verified)
Commit 7214371 · Parent: 4190744

Initial upload: GPT-2 compressed with BigSmall (lossless, 75.53% ratio)
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model.bs filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,50 @@
---
license: mit
tags:
- bigsmall
- compression
- lossless
- gpt2
---

# GPT-2 (BigSmall compressed)

This is GPT-2 compressed with [BigSmall](https://github.com/wpferrell/Bigsmall), a lossless neural-network weight compression tool.

The weights are **bit-for-bit identical** to the original after decompression: no quality degradation, no accuracy loss.

## Usage

```bash
pip install bigsmall
```

```python
import bigsmall
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load compressed weights
state_dict = bigsmall.from_pretrained("wpferrell/gpt2-bigsmall")

# Load into model
model = GPT2LMHeadModel.from_pretrained("gpt2", state_dict=state_dict)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Run inference - identical to original GPT-2
inputs = tokenizer("Hello, I'm a language model", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0]))
```

## Compression stats
- Original size: ~548 MB
- Compressed size: ~414 MB
- Size ratio (compressed / original): 75.53%, lossless
- Weight format: FP32
- Round-trip verified: 160/160 tensors md5-identical (see the verification sketch below)

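The round-trip figure can be re-checked locally. Below is a minimal sketch, not part of the BigSmall API: it assumes `bigsmall.from_pretrained` returns an ordinary PyTorch `state_dict` (as in the usage example above) and hashes each decompressed tensor against the corresponding tensor of the reference `gpt2` checkpoint. Keys present on only one side (for example attention-mask buffers that newer `transformers` versions no longer persist) are simply skipped.

```python
import hashlib

import bigsmall
from transformers import GPT2Model


def tensor_md5(t):
    # Hash the raw bytes so the comparison is bit-for-bit, not approximate.
    return hashlib.md5(t.detach().cpu().contiguous().numpy().tobytes()).hexdigest()


# Reference weights from the original checkpoint, decompressed weights from this repo.
reference = GPT2Model.from_pretrained("gpt2").state_dict()
decompressed = bigsmall.from_pretrained("wpferrell/gpt2-bigsmall")

shared = sorted(decompressed.keys() & reference.keys())
matches = sum(tensor_md5(decompressed[k]) == tensor_md5(reference[k]) for k in shared)
print(f"{matches}/{len(shared)} shared tensors md5-identical")
```
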
## About BigSmall
BigSmall compresses model weights losslessly. Unlike quantization, the decompressed weights are bit-for-bit identical to the originals. It supports BF16, FP16, FP32, FP64, and INT8 formats across LLMs and diffusion models.

- PyPI: `pip install bigsmall`
- GitHub: https://github.com/wpferrell/Bigsmall
bigsmall.index.json ADDED
@@ -0,0 +1,179 @@
{
  "metadata": {
    "bigsmall_version": "1.0.0",
    "container_version": 1,
    "format": "fp32",
    "mode": "balanced",
    "model_type": "llm",
    "total_size": 413973499,
    "total_raw_size": 548090880,
    "ratio_pct": 75.53008344163655,
    "shard_count": 1,
    "tensor_count": 160,
    "shards": [
      "model.bs"
    ]
  },
  "weight_map": {
    "h.0.attn.bias": "model.bs",
    "h.0.attn.c_attn.bias": "model.bs",
    "h.0.attn.c_attn.weight": "model.bs",
    "h.0.attn.c_proj.bias": "model.bs",
    "h.0.attn.c_proj.weight": "model.bs",
    "h.0.ln_1.bias": "model.bs",
    "h.0.ln_1.weight": "model.bs",
    "h.0.ln_2.bias": "model.bs",
    "h.0.ln_2.weight": "model.bs",
    "h.0.mlp.c_fc.bias": "model.bs",
    "h.0.mlp.c_fc.weight": "model.bs",
    "h.0.mlp.c_proj.bias": "model.bs",
    "h.0.mlp.c_proj.weight": "model.bs",
    "h.1.attn.bias": "model.bs",
    "h.1.attn.c_attn.bias": "model.bs",
    "h.1.attn.c_attn.weight": "model.bs",
    "h.1.attn.c_proj.bias": "model.bs",
    "h.1.attn.c_proj.weight": "model.bs",
    "h.1.ln_1.bias": "model.bs",
    "h.1.ln_1.weight": "model.bs",
    "h.1.ln_2.bias": "model.bs",
    "h.1.ln_2.weight": "model.bs",
    "h.1.mlp.c_fc.bias": "model.bs",
    "h.1.mlp.c_fc.weight": "model.bs",
    "h.1.mlp.c_proj.bias": "model.bs",
    "h.1.mlp.c_proj.weight": "model.bs",
    "h.10.attn.bias": "model.bs",
    "h.10.attn.c_attn.bias": "model.bs",
    "h.10.attn.c_attn.weight": "model.bs",
    "h.10.attn.c_proj.bias": "model.bs",
    "h.10.attn.c_proj.weight": "model.bs",
    "h.10.ln_1.bias": "model.bs",
    "h.10.ln_1.weight": "model.bs",
    "h.10.ln_2.bias": "model.bs",
    "h.10.ln_2.weight": "model.bs",
    "h.10.mlp.c_fc.bias": "model.bs",
    "h.10.mlp.c_fc.weight": "model.bs",
    "h.10.mlp.c_proj.bias": "model.bs",
    "h.10.mlp.c_proj.weight": "model.bs",
    "h.11.attn.bias": "model.bs",
    "h.11.attn.c_attn.bias": "model.bs",
    "h.11.attn.c_attn.weight": "model.bs",
    "h.11.attn.c_proj.bias": "model.bs",
    "h.11.attn.c_proj.weight": "model.bs",
    "h.11.ln_1.bias": "model.bs",
    "h.11.ln_1.weight": "model.bs",
    "h.11.ln_2.bias": "model.bs",
    "h.11.ln_2.weight": "model.bs",
    "h.11.mlp.c_fc.bias": "model.bs",
    "h.11.mlp.c_fc.weight": "model.bs",
    "h.11.mlp.c_proj.bias": "model.bs",
    "h.11.mlp.c_proj.weight": "model.bs",
    "h.2.attn.bias": "model.bs",
    "h.2.attn.c_attn.bias": "model.bs",
    "h.2.attn.c_attn.weight": "model.bs",
    "h.2.attn.c_proj.bias": "model.bs",
    "h.2.attn.c_proj.weight": "model.bs",
    "h.2.ln_1.bias": "model.bs",
    "h.2.ln_1.weight": "model.bs",
    "h.2.ln_2.bias": "model.bs",
    "h.2.ln_2.weight": "model.bs",
    "h.2.mlp.c_fc.bias": "model.bs",
    "h.2.mlp.c_fc.weight": "model.bs",
    "h.2.mlp.c_proj.bias": "model.bs",
    "h.2.mlp.c_proj.weight": "model.bs",
    "h.3.attn.bias": "model.bs",
    "h.3.attn.c_attn.bias": "model.bs",
    "h.3.attn.c_attn.weight": "model.bs",
    "h.3.attn.c_proj.bias": "model.bs",
    "h.3.attn.c_proj.weight": "model.bs",
    "h.3.ln_1.bias": "model.bs",
    "h.3.ln_1.weight": "model.bs",
    "h.3.ln_2.bias": "model.bs",
    "h.3.ln_2.weight": "model.bs",
    "h.3.mlp.c_fc.bias": "model.bs",
    "h.3.mlp.c_fc.weight": "model.bs",
    "h.3.mlp.c_proj.bias": "model.bs",
    "h.3.mlp.c_proj.weight": "model.bs",
    "h.4.attn.bias": "model.bs",
    "h.4.attn.c_attn.bias": "model.bs",
    "h.4.attn.c_attn.weight": "model.bs",
    "h.4.attn.c_proj.bias": "model.bs",
    "h.4.attn.c_proj.weight": "model.bs",
    "h.4.ln_1.bias": "model.bs",
    "h.4.ln_1.weight": "model.bs",
    "h.4.ln_2.bias": "model.bs",
    "h.4.ln_2.weight": "model.bs",
    "h.4.mlp.c_fc.bias": "model.bs",
    "h.4.mlp.c_fc.weight": "model.bs",
    "h.4.mlp.c_proj.bias": "model.bs",
    "h.4.mlp.c_proj.weight": "model.bs",
    "h.5.attn.bias": "model.bs",
    "h.5.attn.c_attn.bias": "model.bs",
    "h.5.attn.c_attn.weight": "model.bs",
    "h.5.attn.c_proj.bias": "model.bs",
    "h.5.attn.c_proj.weight": "model.bs",
    "h.5.ln_1.bias": "model.bs",
    "h.5.ln_1.weight": "model.bs",
    "h.5.ln_2.bias": "model.bs",
    "h.5.ln_2.weight": "model.bs",
    "h.5.mlp.c_fc.bias": "model.bs",
    "h.5.mlp.c_fc.weight": "model.bs",
    "h.5.mlp.c_proj.bias": "model.bs",
    "h.5.mlp.c_proj.weight": "model.bs",
    "h.6.attn.bias": "model.bs",
    "h.6.attn.c_attn.bias": "model.bs",
    "h.6.attn.c_attn.weight": "model.bs",
    "h.6.attn.c_proj.bias": "model.bs",
    "h.6.attn.c_proj.weight": "model.bs",
    "h.6.ln_1.bias": "model.bs",
    "h.6.ln_1.weight": "model.bs",
    "h.6.ln_2.bias": "model.bs",
    "h.6.ln_2.weight": "model.bs",
    "h.6.mlp.c_fc.bias": "model.bs",
    "h.6.mlp.c_fc.weight": "model.bs",
    "h.6.mlp.c_proj.bias": "model.bs",
    "h.6.mlp.c_proj.weight": "model.bs",
    "h.7.attn.bias": "model.bs",
    "h.7.attn.c_attn.bias": "model.bs",
    "h.7.attn.c_attn.weight": "model.bs",
    "h.7.attn.c_proj.bias": "model.bs",
    "h.7.attn.c_proj.weight": "model.bs",
    "h.7.ln_1.bias": "model.bs",
    "h.7.ln_1.weight": "model.bs",
    "h.7.ln_2.bias": "model.bs",
    "h.7.ln_2.weight": "model.bs",
    "h.7.mlp.c_fc.bias": "model.bs",
    "h.7.mlp.c_fc.weight": "model.bs",
    "h.7.mlp.c_proj.bias": "model.bs",
    "h.7.mlp.c_proj.weight": "model.bs",
    "h.8.attn.bias": "model.bs",
    "h.8.attn.c_attn.bias": "model.bs",
    "h.8.attn.c_attn.weight": "model.bs",
    "h.8.attn.c_proj.bias": "model.bs",
    "h.8.attn.c_proj.weight": "model.bs",
    "h.8.ln_1.bias": "model.bs",
    "h.8.ln_1.weight": "model.bs",
    "h.8.ln_2.bias": "model.bs",
    "h.8.ln_2.weight": "model.bs",
    "h.8.mlp.c_fc.bias": "model.bs",
    "h.8.mlp.c_fc.weight": "model.bs",
    "h.8.mlp.c_proj.bias": "model.bs",
    "h.8.mlp.c_proj.weight": "model.bs",
    "h.9.attn.bias": "model.bs",
    "h.9.attn.c_attn.bias": "model.bs",
    "h.9.attn.c_attn.weight": "model.bs",
    "h.9.attn.c_proj.bias": "model.bs",
    "h.9.attn.c_proj.weight": "model.bs",
    "h.9.ln_1.bias": "model.bs",
    "h.9.ln_1.weight": "model.bs",
    "h.9.ln_2.bias": "model.bs",
    "h.9.ln_2.weight": "model.bs",
    "h.9.mlp.c_fc.bias": "model.bs",
    "h.9.mlp.c_fc.weight": "model.bs",
    "h.9.mlp.c_proj.bias": "model.bs",
    "h.9.mlp.c_proj.weight": "model.bs",
    "ln_f.bias": "model.bs",
    "ln_f.weight": "model.bs",
    "wpe.weight": "model.bs",
    "wte.weight": "model.bs"
  }
}
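bigsmall.index.json plays the same role as a `safetensors` or `pytorch_model.bin` index file: `metadata` describes the container, and `weight_map` maps every tensor name to the shard that stores it (here a single shard, `model.bs`). A minimal sketch of consuming it with only the standard library; the local directory name is an assumption:

```python
import json
from collections import Counter
from pathlib import Path

# Assumes a local clone/download of this repo sits in ./gpt2-bigsmall.
repo = Path("gpt2-bigsmall")
index = json.loads((repo / "bigsmall.index.json").read_text())

meta = index["metadata"]
print(
    f"{meta['tensor_count']} tensors in {meta['shard_count']} shard(s): "
    f"{meta['total_size'] / 1e6:.1f} MB compressed, "
    f"{meta['total_raw_size'] / 1e6:.1f} MB raw ({meta['ratio_pct']:.2f}%)"
)

# Which shard holds a given tensor, and how many tensors each shard holds.
print(index["weight_map"]["h.0.attn.c_attn.weight"])  # -> model.bs
print(Counter(index["weight_map"].values()))           # -> Counter({'model.bs': 160})
```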
config.json ADDED
@@ -0,0 +1,31 @@
{
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "vocab_size": 50257
}
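config.json is the standard GPT-2 small configuration: 12 layers, 12 heads, 768-dimensional embeddings, a 1024-token context window, and a 50257-token vocabulary. A minimal sketch of turning it into an empty model skeleton without downloading any weights; the local file path is an assumption:

```python
from transformers import GPT2Config, GPT2LMHeadModel

# Build the architecture described by config.json (assumed to be a local copy of the file above).
config = GPT2Config.from_json_file("config.json")
model = GPT2LMHeadModel(config)  # randomly initialised skeleton, no pretrained weights

print(config.n_layer, config.n_head, config.n_embd, config.n_positions, config.vocab_size)
# -> 12 12 768 1024 50257
print(f"{sum(p.numel() for p in model.parameters()) / 1e6:.0f}M parameters")  # roughly 124M
```

In the README's usage example, `GPT2LMHeadModel.from_pretrained("gpt2", state_dict=...)` plays the same role and additionally takes care of mapping the checkpoint's tensor names onto this architecture.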
generation_config.json ADDED
@@ -0,0 +1,6 @@
{
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.0.dev0",
  "_from_model_config": true
}
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.bs ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d1a8e938d66a1b5e046ca50d5334684c2a86aa9948f5f32a1bbe4743df5c89dd
size 413973499
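Git itself stores only the three-line LFS pointer above; the actual ~414 MB shard lives in LFS storage. A minimal sketch of fetching the real file with `huggingface_hub` and checking it against the size and sha256 recorded in the pointer:

```python
import hashlib
import os

from huggingface_hub import hf_hub_download

# Resolves the LFS pointer and downloads the actual shard (not the 3-line pointer file).
path = hf_hub_download(repo_id="wpferrell/gpt2-bigsmall", filename="model.bs")

sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha256.update(chunk)

# Expected values taken from the pointer file above.
print(os.path.getsize(path) == 413973499)
print(sha256.hexdigest() == "d1a8e938d66a1b5e046ca50d5334684c2a86aa9948f5f32a1bbe4743df5c89dd")
```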
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
{"model_max_length": 1024}
vocab.json ADDED
The diff for this file is too large to render. See raw diff