Kevin Knoedler committed on
Commit
59b2bdd
·
0 Parent(s):

Initial commit

Browse files
Files changed (7) hide show
  1. .gitattributes +35 -0
  2. LICENSE +201 -0
  3. README.md +57 -0
  4. index.html +608 -0
  5. onnx/meta.json +1 -0
  6. onnx/vae_stream.onnx +3 -0
  7. server.py +38 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Vox Upscaler
3
+ emoji: 🔊
4
+ colorFrom: red
5
+ colorTo: yellow
6
+ sdk: static
7
+ pinned: false
8
+ license: apache-2.0
9
+ short_description: 16kHz → 48kHz audio upscaling in your browser (WebGPU/CPU)
10
+ models:
11
+ - openbmb/VoxCPM2
12
+ custom_headers:
13
+ cross-origin-embedder-policy: require-corp
14
+ cross-origin-opener-policy: same-origin
15
+ cross-origin-resource-policy: cross-origin
16
+ ---
17
+
18
+ # Vox Upscaler
19
+
20
+ Browser-based 16kHz → 48kHz audio upscaling using the VoxCPM2 streaming VAE. Runs with WebGPU when available, falls back to CPU (WASM).
21
+
22
+ ## Usage
23
+
24
+ Drop in an audio file, click **Upscale to 48 kHz**, download the result.
25
+
26
+ ## How it works
27
+
28
+ The VoxCPM2 VAE encodes audio to a latent space and decodes at 48kHz. The ONNX model processes audio in streaming chunks with explicit state passing — no autoregressive loop, just a single forward pass per chunk.
29
+
30
+ - **WebGPU**: fp32 model, 5s chunks
31
+ - **CPU (WASM)**: fp32 model, 1s chunks, multi-threaded
32
+
33
+ ## Files
34
+
35
+ ```
36
+ vox-upscaler-web/
37
+ ├── index.html # Web UI (self-contained)
38
+ ├── server.py # Dev server with COOP/COEP headers
39
+ ├── LICENSE # License
40
+ └── onnx/
41
+ ├── vae_stream.onnx # Streaming VAE (fp32, ~376 MB)
42
+ └── meta.json # State sizes for runtime
43
+ ```
44
+
45
+ ## Running locally
46
+
47
+ ```bash
48
+ python server.py 8080
49
+ # open http://localhost:8080/
50
+ ```
51
+
52
+ The COOP/COEP headers are required for SharedArrayBuffer (WASM multi-threading).
53
+
54
+ ## Credits
55
+
56
+ - **VAE model**: [VoxCPM2](https://huggingface.co/openbmb/VoxCPM2/blob/main/audiovae.pth) by [OpenBMB](https://huggingface.co/openbmb) — Apache-2.0
57
+ - **WebGPU port**: [KevinAHM](https://huggingface.co/KevinAHM)
index.html ADDED
@@ -0,0 +1,608 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Vox Upscaler</title>
7
+ <link rel="preconnect" href="https://fonts.googleapis.com">
8
+ <link href="https://fonts.googleapis.com/css2?family=DM+Mono:wght@400;500&family=Instrument+Sans:wght@400;600;700&display=swap" rel="stylesheet">
9
+ <style>
10
+ *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
11
+
12
+ :root {
13
+ --bg: #0a0a0f;
14
+ --surface: #13131a;
15
+ --border: #1e1e2a;
16
+ --text: #e8e8ed;
17
+ --text-dim: #6b6b7b;
18
+ --accent: #ff6b35;
19
+ --accent-glow: rgba(255, 107, 53, 0.15);
20
+ --green: #34d399;
21
+ --yellow: #fbbf24;
22
+ --red: #f87171;
23
+ --font-body: 'Instrument Sans', sans-serif;
24
+ --font-mono: 'DM Mono', monospace;
25
+ }
26
+
27
+ body {
28
+ background: var(--bg);
29
+ color: var(--text);
30
+ font-family: var(--font-body);
31
+ min-height: 100vh;
32
+ display: flex;
33
+ align-items: center;
34
+ justify-content: center;
35
+ }
36
+
37
+ .container {
38
+ width: 100%;
39
+ max-width: 520px;
40
+ padding: 2rem;
41
+ }
42
+
43
+ h1 {
44
+ font-size: 1.1rem;
45
+ font-weight: 700;
46
+ letter-spacing: 0.08em;
47
+ text-transform: uppercase;
48
+ margin-bottom: 2rem;
49
+ display: flex;
50
+ align-items: center;
51
+ gap: 0.6rem;
52
+ }
53
+
54
+ h1 .dot {
55
+ width: 8px; height: 8px;
56
+ border-radius: 50%;
57
+ background: var(--accent);
58
+ box-shadow: 0 0 12px var(--accent);
59
+ }
60
+
61
+ .drop-zone {
62
+ border: 2px dashed var(--border);
63
+ border-radius: 12px;
64
+ padding: 3rem 2rem;
65
+ text-align: center;
66
+ cursor: pointer;
67
+ transition: border-color 0.2s, background 0.2s;
68
+ position: relative;
69
+ }
70
+
71
+ .drop-zone:hover, .drop-zone.dragover {
72
+ border-color: var(--accent);
73
+ background: var(--accent-glow);
74
+ }
75
+
76
+ .drop-zone.has-file {
77
+ border-style: solid;
78
+ border-color: var(--border);
79
+ padding: 1.2rem 1.5rem;
80
+ text-align: left;
81
+ }
82
+
83
+ .drop-zone label {
84
+ font-size: 0.85rem;
85
+ color: var(--text-dim);
86
+ display: block;
87
+ cursor: pointer;
88
+ }
89
+
90
+ .drop-zone .filename {
91
+ font-family: var(--font-mono);
92
+ font-size: 0.9rem;
93
+ margin-top: 0.3rem;
94
+ color: var(--text);
95
+ }
96
+
97
+ .drop-zone input { display: none; }
98
+
99
+ .status-bar {
100
+ display: flex;
101
+ gap: 1.2rem;
102
+ margin-top: 1.2rem;
103
+ font-family: var(--font-mono);
104
+ font-size: 0.75rem;
105
+ color: var(--text-dim);
106
+ }
107
+
108
+ .status-bar .chip {
109
+ display: flex;
110
+ align-items: center;
111
+ gap: 0.4rem;
112
+ background: var(--surface);
113
+ border: 1px solid var(--border);
114
+ border-radius: 6px;
115
+ padding: 0.35rem 0.7rem;
116
+ }
117
+
118
+ .chip .indicator {
119
+ width: 6px; height: 6px;
120
+ border-radius: 50%;
121
+ background: var(--text-dim);
122
+ }
123
+
124
+ .chip .indicator.gpu { background: var(--green); box-shadow: 0 0 6px var(--green); }
125
+ .chip .indicator.cpu { background: var(--yellow); box-shadow: 0 0 6px var(--yellow); }
126
+
127
+ button#process {
128
+ width: 100%;
129
+ margin-top: 1.5rem;
130
+ padding: 0.9rem;
131
+ border: none;
132
+ border-radius: 10px;
133
+ background: var(--accent);
134
+ color: #fff;
135
+ font-family: var(--font-body);
136
+ font-size: 0.9rem;
137
+ font-weight: 600;
138
+ cursor: pointer;
139
+ transition: opacity 0.2s, transform 0.1s;
140
+ }
141
+
142
+ button#process:hover { opacity: 0.9; }
143
+ button#process:active { transform: scale(0.98); }
144
+ button#process:disabled { opacity: 0.4; cursor: not-allowed; transform: none; }
145
+
146
+ .progress-wrap {
147
+ margin-top: 1.5rem;
148
+ display: none;
149
+ }
150
+
151
+ .progress-wrap.active { display: block; }
152
+
153
+ .progress-bar-bg {
154
+ width: 100%;
155
+ height: 4px;
156
+ background: var(--surface);
157
+ border-radius: 2px;
158
+ overflow: hidden;
159
+ }
160
+
161
+ .progress-bar {
162
+ height: 100%;
163
+ width: 0%;
164
+ background: var(--accent);
165
+ border-radius: 2px;
166
+ transition: width 0.15s;
167
+ }
168
+
169
+ .progress-info {
170
+ display: flex;
171
+ justify-content: space-between;
172
+ margin-top: 0.6rem;
173
+ font-family: var(--font-mono);
174
+ font-size: 0.75rem;
175
+ color: var(--text-dim);
176
+ }
177
+
178
+ .ab-section {
179
+ margin-top: 1.5rem;
180
+ display: flex;
181
+ flex-direction: column;
182
+ gap: 1rem;
183
+ }
184
+
185
+ .ab-player {
186
+ background: var(--surface);
187
+ border: 1px solid var(--border);
188
+ border-radius: 10px;
189
+ padding: 1rem;
190
+ }
191
+
192
+ .ab-label {
193
+ font-family: var(--font-mono);
194
+ font-size: 0.8rem;
195
+ font-weight: 500;
196
+ margin-bottom: 0.5rem;
197
+ color: var(--text);
198
+ }
199
+
200
+ .ab-label .ab-sr {
201
+ color: var(--text-dim);
202
+ font-weight: 400;
203
+ }
204
+
205
+ .ab-player audio {
206
+ width: 100%;
207
+ border-radius: 6px;
208
+ }
209
+
210
+ .ab-player a {
211
+ display: inline-block;
212
+ margin-top: 0.6rem;
213
+ font-family: var(--font-mono);
214
+ font-size: 0.8rem;
215
+ color: var(--accent);
216
+ text-decoration: none;
217
+ }
218
+
219
+ .ab-player a:hover { text-decoration: underline; }
220
+
221
+ .model-loading {
222
+ margin-top: 1rem;
223
+ font-family: var(--font-mono);
224
+ font-size: 0.75rem;
225
+ color: var(--text-dim);
226
+ display: none;
227
+ }
228
+
229
+ .model-loading.active { display: block; }
230
+
231
+ footer {
232
+ margin-top: 3rem;
233
+ padding-top: 1.2rem;
234
+ border-top: 1px solid var(--border);
235
+ font-family: var(--font-mono);
236
+ font-size: 0.7rem;
237
+ color: var(--text-dim);
238
+ display: flex;
239
+ flex-direction: column;
240
+ gap: 0.3rem;
241
+ }
242
+
243
+ footer a {
244
+ color: var(--text-dim);
245
+ text-decoration: none;
246
+ border-bottom: 1px dotted var(--text-dim);
247
+ }
248
+
249
+ footer a:hover { color: var(--text); border-color: var(--text); }
250
+
251
+ @keyframes pulse { 0%,100% { opacity: 1; } 50% { opacity: 0.4; } }
252
+ .pulsing { animation: pulse 1.5s ease-in-out infinite; }
253
+ </style>
254
+ </head>
255
+ <body>
256
+ <div class="container">
257
+ <h1><span class="dot"></span>Vox Upscaler</h1>
258
+
259
+ <div class="drop-zone" id="dropZone">
260
+ <label>Drop an audio file or click to browse</label>
261
+ <div class="filename" id="fileName" style="display:none"></div>
262
+ <input type="file" id="fileInput" accept="audio/*">
263
+ </div>
264
+
265
+ <div class="status-bar">
266
+ <div class="chip"><span class="indicator" id="backendDot"></span><span id="backendLabel">detecting…</span></div>
267
+ <div class="chip" id="rtfChip" style="display:none">RTFx: <span id="rtfValue">—</span></div>
268
+ <div class="chip" id="modelChip"><span id="modelStatus">model not loaded</span></div>
269
+ </div>
270
+
271
+ <div class="model-loading" id="modelLoading"></div>
272
+
273
+ <button id="process" disabled>Upscale to 48 kHz</button>
274
+
275
+ <div class="progress-wrap" id="progressWrap">
276
+ <div class="progress-bar-bg"><div class="progress-bar" id="progressBar"></div></div>
277
+ <div class="progress-info">
278
+ <span id="progressLabel">Processing…</span>
279
+ <span id="progressPct">0%</span>
280
+ </div>
281
+ </div>
282
+
283
+ <div class="ab-section" id="abSection" style="display:none">
284
+ <div class="ab-player">
285
+ <div class="ab-label">Input<span class="ab-sr" id="inputSrLabel"></span></div>
286
+ <audio controls id="inputPlayer"></audio>
287
+ </div>
288
+ <div class="ab-player" id="outputPanel" style="display:none">
289
+ <div class="ab-label">Output<span class="ab-sr"> — 48 kHz</span></div>
290
+ <audio controls id="audioPlayer"></audio>
291
+ <a id="downloadLink" download>Download WAV</a>
292
+ </div>
293
+ </div>
294
+
295
+ <footer>
296
+ <span>VAE model: <a href="https://huggingface.co/openbmb/VoxCPM2/blob/main/audiovae.pth" target="_blank">VoxCPM2</a> by <a href="https://huggingface.co/openbmb" target="_blank">OpenBMB</a> · Apache-2.0</span>
297
+ <span>WebGPU port by <a href="https://huggingface.co/KevinAHM" target="_blank">KevinAHM</a></span>
298
+ </footer>
299
+ </div>
300
+
301
+ <script>
302
+ const HOP = 640;
303
+ const TARGET_SR = 48000;
304
+ const INPUT_SR = 16000;
305
+ const META_URL = 'onnx/meta.json';
306
+ const MODEL_URL = 'onnx/vae_stream.onnx';
307
+
308
+ let session = null;
309
+ let meta = null;
310
+ let backend = null; // 'webgpu' or 'cpu'
311
+ let fileBuffer = null;
312
+ let fileName = '';
313
+
314
+ const dropZone = document.getElementById('dropZone');
315
+ const fileInput = document.getElementById('fileInput');
316
+ const fileNameEl = document.getElementById('fileName');
317
+ const processBtn = document.getElementById('process');
318
+ const progressWrap = document.getElementById('progressWrap');
319
+ const progressBar = document.getElementById('progressBar');
320
+ const progressLabel = document.getElementById('progressLabel');
321
+ const progressPct = document.getElementById('progressPct');
322
+ const audioPlayer = document.getElementById('audioPlayer');
323
+ const downloadLink = document.getElementById('downloadLink');
324
+ const backendDot = document.getElementById('backendDot');
325
+ const backendLabel = document.getElementById('backendLabel');
326
+ const rtfChip = document.getElementById('rtfChip');
327
+ const rtfValue = document.getElementById('rtfValue');
328
+ const modelStatus = document.getElementById('modelStatus');
329
+ const modelLoading = document.getElementById('modelLoading');
330
+
331
+ // -- File handling --
332
+ dropZone.addEventListener('click', () => fileInput.click());
333
+ dropZone.addEventListener('dragover', e => { e.preventDefault(); dropZone.classList.add('dragover'); });
334
+ dropZone.addEventListener('dragleave', () => dropZone.classList.remove('dragover'));
335
+ dropZone.addEventListener('drop', e => { e.preventDefault(); dropZone.classList.remove('dragover'); handleFile(e.dataTransfer.files[0]); });
336
+ fileInput.addEventListener('change', () => { if (fileInput.files[0]) handleFile(fileInput.files[0]); });
337
+
338
/**
 * Register a user-selected audio file.
 *
 * Updates the drop zone, loads the file into the input preview player, hides
 * any previous output, and reads the file bytes into `fileBuffer` so the
 * process button can be enabled.
 *
 * @param {File|undefined} file - File from the picker or a drop event. A drop
 *   that carries no files (e.g. dragged text) passes `undefined`; it is ignored.
 */
function handleFile(file) {
  // `dataTransfer.files[0]` is undefined when the drop contains no files.
  if (!file) return;

  // Remember the newest selection so a slower, earlier read cannot win.
  handleFile.latest = file;

  fileName = file.name;
  fileNameEl.textContent = file.name;
  fileNameEl.style.display = 'block';
  dropZone.classList.add('has-file');
  dropZone.querySelector('label').textContent = 'Selected file';

  // Forget the previous file's bytes so the button cannot process stale
  // audio while the new file is still being read.
  fileBuffer = null;
  updateBtn();

  // Show input player with the original file, releasing the old blob URL.
  const inputPlayer = document.getElementById('inputPlayer');
  const previousUrl = inputPlayer.src;
  inputPlayer.src = URL.createObjectURL(file);
  if (previousUrl && previousUrl.startsWith('blob:')) {
    URL.revokeObjectURL(previousUrl);
  }
  document.getElementById('abSection').style.display = 'flex';
  document.getElementById('outputPanel').style.display = 'none';

  file.arrayBuffer().then(buf => {
    if (handleFile.latest !== file) return; // superseded by a newer selection

    fileBuffer = buf;

    // Read the sample rate from the header when the file starts with 'RIFF'.
    let srText = 'Original';
    if (buf.byteLength > 28) {
      const view = new DataView(buf);
      const riff = String.fromCharCode(
        view.getUint8(0), view.getUint8(1), view.getUint8(2), view.getUint8(3)
      );
      if (riff === 'RIFF') {
        // WAV: bytes 24-27 = sample rate (little-endian uint32)
        const sr = view.getUint32(24, true);
        srText = (sr / 1000) + ' kHz';
      }
    }
    document.getElementById('inputSrLabel').textContent = ' — ' + srText;
    updateBtn();
  }).catch(err => {
    // Leave the button disabled (fileBuffer is still null) and surface the cause.
    console.error('[VoxUpscaler] failed to read file:', err);
  });
}
367
+
368
/**
 * Enable the process button only when both a file buffer and an inference
 * session are available.
 */
function updateBtn() {
  const ready = Boolean(fileBuffer && session);
  processBtn.disabled = !ready;
}
369
+
370
+ // -- Detect backend & load model --
371
/**
 * Choose an execution backend and load the model.
 *
 * Prefers WebGPU when an adapter can be obtained, otherwise uses the WASM
 * (CPU) provider. Fetches the metadata JSON and the ONNX model, creates the
 * inference session, and retries on WASM when loading fails under WebGPU.
 * Any unrecoverable failure is shown in the status area and leaves the
 * process button disabled.
 */
async function init() {
  const numThreads = navigator.hardwareConcurrency || 4;
  const wasmProviders = [{ name: 'wasm', options: { numThreads } }];

  const useCpuBackend = () => {
    backend = 'cpu';
    backendDot.className = 'indicator cpu';
    backendLabel.textContent = 'CPU (WASM)';
  };

  const showFailure = err => {
    console.error('[VoxUpscaler] model load failed:', err);
    const detail = err && err.message ? err.message : String(err);
    // textContent rather than innerHTML: the message is not trusted markup.
    modelLoading.textContent = 'Failed to load model: ' + detail;
    modelStatus.textContent = 'error';
  };

  // fetch() resolves on HTTP errors too; turn them into real failures so an
  // error page is never handed to the JSON parser or to ONNX Runtime.
  const fetchOk = async url => {
    const resp = await fetch(url);
    if (!resp.ok) throw new Error(`HTTP ${resp.status} while fetching ${url}`);
    return resp;
  };

  // Detect WebGPU and patch device creation to raise storage buffer limits
  if (navigator.gpu) {
    try {
      const adapter = await navigator.gpu.requestAdapter();
      if (adapter) {
        // Patch requestDevice to raise limits ORT doesn't request itself
        const origRequestDevice = GPUAdapter.prototype.requestDevice;
        const adapterLimits = adapter.limits;
        GPUAdapter.prototype.requestDevice = function(desc) {
          desc = desc || {};
          desc.requiredLimits = desc.requiredLimits || {};
          const rl = desc.requiredLimits;
          rl.maxStorageBuffersPerShaderStage = adapterLimits.maxStorageBuffersPerShaderStage;
          rl.maxBufferSize = adapterLimits.maxBufferSize;
          rl.maxStorageBufferBindingSize = adapterLimits.maxStorageBufferBindingSize;
          console.log(`[VoxUpscaler] patched requestDevice:`, JSON.stringify(rl));
          return origRequestDevice.call(this, desc);
        };
        backend = 'webgpu';
        backendDot.className = 'indicator gpu';
        backendLabel.textContent = 'WebGPU';
      }
    } catch (e) {
      // Detection is best-effort: fall through to CPU, but keep the reason.
      console.warn('[VoxUpscaler] WebGPU detection failed, using CPU:', e);
    }
  }
  if (!backend) useCpuBackend();

  // Load meta
  modelLoading.classList.add('active');
  modelLoading.innerHTML = '<span class="pulsing">Loading model metadata…</span>';
  try {
    const resp = await fetchOk(META_URL);
    meta = await resp.json();
  } catch (e) {
    showFailure(e);
    return;
  }

  // Load ONNX model
  modelLoading.innerHTML = '<span class="pulsing">Loading ONNX model (fp32, ~376 MB)…</span>';
  modelStatus.textContent = 'loading…';

  const opts = {
    executionProviders: backend === 'webgpu' ? ['webgpu'] : wasmProviders,
  };

  try {
    // Fetch as ArrayBuffer to avoid ORT Web external data issues
    const modelResp = await fetchOk(MODEL_URL);
    const modelBuf = await modelResp.arrayBuffer();
    session = await ort.InferenceSession.create(modelBuf, opts);
    modelStatus.textContent = 'ready';
    modelLoading.innerHTML = '✓ Model loaded';
    modelLoading.classList.remove('active');
    setTimeout(() => { modelLoading.style.display = 'none'; }, 1500);
  } catch (e) {
    if (backend !== 'webgpu') {
      showFailure(e);
      return;
    }
    // Fallback to wasm if webgpu fails
    console.warn('[VoxUpscaler] WebGPU session failed, retrying on CPU:', e);
    useCpuBackend();
    modelLoading.innerHTML = '<span class="pulsing">WebGPU failed, falling back to CPU (fp32)…</span>';
    try {
      session = await ort.InferenceSession.create(MODEL_URL, {
        executionProviders: wasmProviders,
      });
    } catch (fallbackErr) {
      showFailure(fallbackErr);
      return;
    }
    modelStatus.textContent = 'ready';
    modelLoading.innerHTML = '✓ Model loaded (CPU fallback)';
  }
  updateBtn();
}
447
+
448
+ // -- Decode audio to mono 16kHz Float32 --
449
/**
 * Decode an encoded audio file into mono samples at INPUT_SR.
 *
 * Multi-channel input is downmixed to mono instead of keeping only the first
 * channel, so audio that lives mainly in another channel is not lost.
 *
 * @param {ArrayBuffer} arrayBuffer - Encoded audio file bytes.
 * @returns {Promise<Float32Array>} Mono samples at INPUT_SR.
 */
async function decodeToMono16k(arrayBuffer) {
  // Decode a copy so the caller's buffer stays usable for repeat runs.
  const decodeCtx = new OfflineAudioContext(1, 1, INPUT_SR);
  const decoded = await decodeCtx.decodeAudioData(arrayBuffer.slice(0));

  // Resample to 16kHz (a no-op ratio of 1 when the decoder already did so).
  const ratio = INPUT_SR / decoded.sampleRate;
  const outLen = Math.round(decoded.length * ratio);
  const renderCtx = new OfflineAudioContext(1, outLen, INPUT_SR);

  // Play the full decoded buffer into the single-channel destination; the
  // Web Audio graph performs the downmix to mono and any resampling.
  const src = renderCtx.createBufferSource();
  src.buffer = decoded;
  src.connect(renderCtx.destination);
  src.start();

  const rendered = await renderCtx.startRendering();
  return rendered.getChannelData(0);
}
468
+
469
+ // -- Process --
470
// Run the loaded model over the selected file chunk by chunk, then expose the
// result for playback and download. Any failure is reported in the progress
// label and the button is always re-enabled so the user can retry.
processBtn.addEventListener('click', async () => {
  if (!fileBuffer || !session) return;

  processBtn.disabled = true;
  document.getElementById('outputPanel').style.display = 'none';
  progressWrap.classList.add('active');
  progressBar.style.width = '0%';
  progressPct.textContent = '0%';
  progressLabel.textContent = 'Decoding input…';

  try {
    const audio16k = await decodeToMono16k(fileBuffer);
    const totalSamples = audio16k.length;
    const audioDuration = totalSamples / INPUT_SR;

    // Chunk sizing: 1 s per inference call on CPU, 5 s on WebGPU,
    // rounded down to a whole number of hops (at least one).
    const chunkMs = backend === 'webgpu' ? 5000 : 1000;
    const chunkHops = Math.max(1, Math.floor(chunkMs / 1000 * INPUT_SR / HOP));
    const chunkSamples = chunkHops * HOP;

    // Zero-pad the tail so the total length is a multiple of HOP.
    const pad = (HOP - totalSamples % HOP) % HOP;
    let padded;
    if (pad > 0) {
      padded = new Float32Array(totalSamples + pad);
      padded.set(audio16k);
    } else {
      padded = audio16k;
    }
    const totalPadded = padded.length;

    // Streaming state starts at zero; each chunk's state_out is fed back as
    // the next chunk's state_in.
    let state = new Float32Array(meta.total_state_size);
    const outputs = [];
    const numChunks = Math.ceil(totalPadded / chunkSamples);
    let chunkIdx = 0;

    progressLabel.textContent = 'Processing…';
    const t0 = performance.now();

    // NOTE(review): TARGET_SR is passed as `sr_bin_idx` and is also used as
    // the output sample rate below — confirm the model expects this value.
    const srIdx = new Int32Array([TARGET_SR]);

    for (let pos = 0; pos < totalPadded; pos += chunkSamples) {
      const end = Math.min(pos + chunkSamples, totalPadded);
      const chunk = padded.slice(pos, end);

      // Shape: [1, 1, samples]
      const audioTensor = new ort.Tensor('float32', chunk, [1, 1, chunk.length]);
      const srTensor = new ort.Tensor('int32', srIdx, [1]);
      const stateTensor = new ort.Tensor('float32', state, [meta.total_state_size]);

      const result = await session.run({
        audio: audioTensor,
        sr_bin_idx: srTensor,
        state_in: stateTensor,
      });

      // Copy the results out of the runtime-owned tensors before the next run.
      outputs.push(new Float32Array(result.audio_out.data));
      state = new Float32Array(result.state_out.data);

      chunkIdx++;
      const pct = Math.round(chunkIdx / numChunks * 100);
      progressBar.style.width = pct + '%';
      progressPct.textContent = pct + '%';

      // Running real-time factor: seconds of input processed per wall second.
      const elapsed = (performance.now() - t0) / 1000;
      const processedDur = end / INPUT_SR;
      const rtf = processedDur / elapsed;
      rtfChip.style.display = 'flex';
      rtfValue.textContent = rtf.toFixed(3) + 'x';
    }

    const totalElapsed = (performance.now() - t0) / 1000;
    const finalRtf = audioDuration / totalElapsed;
    rtfValue.textContent = finalRtf.toFixed(3) + 'x';
    progressLabel.textContent = `Done in ${totalElapsed.toFixed(1)}s`;
    progressPct.textContent = '100%';
    progressBar.style.width = '100%';

    // Concatenate the per-chunk outputs.
    const totalOut = outputs.reduce((s, a) => s + a.length, 0);
    const fullOutput = new Float32Array(totalOut);
    let off = 0;
    for (const o of outputs) { fullOutput.set(o, off); off += o.length; }

    // Trim to the length implied by the unpadded input duration, dropping
    // whatever was generated from the zero padding.
    const expectedLen = Math.round(audioDuration * TARGET_SR);
    const trimmed = fullOutput.slice(0, expectedLen);

    // Encode WAV
    const wav = encodeWav(trimmed, TARGET_SR);
    const blob = new Blob([wav], { type: 'audio/wav' });
    const url = URL.createObjectURL(blob);

    // Remember the previous run's object URL so it can be released once the
    // player and download link point at the new one.
    const previousUrl = downloadLink.href;

    audioPlayer.src = url;
    const outName = fileName.replace(/\.[^.]+$/, '') + '_48k.wav';
    downloadLink.href = url;
    downloadLink.download = outName;
    downloadLink.textContent = 'Download ' + outName;
    document.getElementById('outputPanel').style.display = 'block';

    if (previousUrl && previousUrl.startsWith('blob:')) {
      URL.revokeObjectURL(previousUrl);
    }
  } catch (err) {
    // Surface decode/inference failures instead of leaving the UI stuck.
    console.error(err);
    progressLabel.textContent =
      'Processing failed: ' + (err && err.message ? err.message : String(err));
  } finally {
    processBtn.disabled = false;
  }
});
570
+
571
/**
 * Serialize float samples as a mono, 16-bit PCM, little-endian WAV file.
 *
 * Samples are clamped to [-1, 1]; negative values are scaled by 0x8000 and
 * non-negative values by 0x7FFF before being stored as signed 16-bit integers.
 *
 * @param {Float32Array|number[]} samples Audio samples.
 * @param {number} sr Sample rate in Hz written into the header.
 * @returns {ArrayBuffer} 44-byte RIFF/WAVE header followed by the PCM data.
 */
function encodeWav(samples, sr) {
  const HEADER_BYTES = 44;
  const BYTES_PER_SAMPLE = 2;
  const count = samples.length;
  const dataBytes = count * BYTES_PER_SAMPLE;

  const wavBuffer = new ArrayBuffer(HEADER_BYTES + dataBytes);
  const dv = new DataView(wavBuffer);

  // Write a four-character ASCII chunk tag at the given byte offset.
  const putTag = (offset, tag) => {
    for (let k = 0; k < tag.length; k++) {
      dv.setUint8(offset + k, tag.charCodeAt(k));
    }
  };

  // RIFF container header.
  putTag(0, 'RIFF');
  dv.setUint32(4, 36 + dataBytes, true);        // file size minus the first 8 bytes
  putTag(8, 'WAVE');

  // "fmt " chunk.
  putTag(12, 'fmt ');
  dv.setUint32(16, 16, true);                   // fmt chunk size
  dv.setUint16(20, 1, true);                    // audio format: 1 = PCM
  dv.setUint16(22, 1, true);                    // channel count
  dv.setUint32(24, sr, true);                   // sample rate
  dv.setUint32(28, sr * BYTES_PER_SAMPLE, true); // byte rate
  dv.setUint16(32, BYTES_PER_SAMPLE, true);     // block align
  dv.setUint16(34, 16, true);                   // bits per sample

  // "data" chunk.
  putTag(36, 'data');
  dv.setUint32(40, dataBytes, true);

  for (let n = 0, at = HEADER_BYTES; n < count; n++, at += BYTES_PER_SAMPLE) {
    const clamped = Math.min(1, Math.max(-1, samples[n]));
    const scaled = clamped < 0 ? clamped * 0x8000 : clamped * 0x7FFF;
    dv.setInt16(at, scaled, true);
  }

  return wavBuffer;
}
595
+
596
+ // Load ORT and init
597
// Inject ONNX Runtime Web from the CDN. Once the script has loaded, configure
// the WASM backend (thread count and the location of the .wasm binaries) and
// start the app via init().
const script = Object.assign(document.createElement('script'), {
  src: 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.22.0/dist/ort.min.js',
  crossOrigin: 'anonymous',
  onload() {
    const wasmEnv = ort.env.wasm;
    wasmEnv.numThreads = navigator.hardwareConcurrency || 4;
    wasmEnv.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.22.0/dist/';
    init();
  },
});
document.head.appendChild(script);
606
+ </script>
607
+ </body>
608
+ </html>
onnx/meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"enc_state_size": 167686, "dec_state_size": 170816, "total_state_size": 338502}
onnx/vae_stream.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bf7509e655009f0740da3a80efe37c9cded555261c8893732446fe0223349b9
3
+ size 376433378
server.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Simple HTTP server with Cross-Origin Isolation headers for SharedArrayBuffer.
4
+ This enables multi-threading in ONNX Runtime Web for much better performance.
5
+ """
6
+
7
+ import http.server
8
+ import socketserver
9
+ import sys
10
+
11
# Listening port: the first command-line argument when one is given,
# otherwise 8080.
PORT = 8080 if len(sys.argv) <= 1 else int(sys.argv[1])
12
+
13
+
14
class CORSRequestHandler(http.server.SimpleHTTPRequestHandler):
    """Static-file handler that adds cross-origin headers to every response.

    The COOP/COEP pair is what makes the served page cross-origin isolated,
    which is required for SharedArrayBuffer (multi-threaded WASM).
    """

    # Extra headers appended to every response, emitted in this order.
    _EXTRA_HEADERS = (
        # Required for SharedArrayBuffer (enables multi-threading in WASM).
        ('Cross-Origin-Opener-Policy', 'same-origin'),
        ('Cross-Origin-Embedder-Policy', 'require-corp'),
        # CORS: let any origin read the responses served here.
        ('Access-Control-Allow-Origin', '*'),
    )

    def end_headers(self):
        """Append the extra headers, then finish the header block."""
        for header_name, header_value in self._EXTRA_HEADERS:
            self.send_header(header_name, header_value)
        super().end_headers()

    def do_OPTIONS(self):
        """Answer OPTIONS requests with an empty 200 carrying the same headers."""
        self.send_response(200)
        self.end_headers()
26
+
27
+
28
if __name__ == '__main__':
    class ReusableTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
        """TCP server that handles each connection in its own thread.

        A single-threaded server serves one connection at a time, so an idle
        browser connection or the large model download would stall every
        other request (page, metadata, WASM files).
        """

        # Allow an immediate restart on the same port.
        allow_reuse_address = True
        # Do not wait for in-flight transfers when shutting down.
        daemon_threads = True

    # Bind on all interfaces and serve the current working directory.
    with ReusableTCPServer(("", PORT), CORSRequestHandler) as httpd:
        print(f"Serving at http://localhost:{PORT}")
        print("Cross-Origin Isolation headers enabled for multi-threading")
        print("Press Ctrl+C to stop")
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            # Ctrl+C is the documented way to stop; exit without a traceback.
            print("\nShutting down...")