mpoli committed on
Commit
4693bac
·
verified ·
1 Parent(s): 4c3121f

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. README.md +67 -0
  2. config.json +69 -0
  3. final.pt +3 -0
  4. full_checkpoint.pt +3 -0
README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-nc-sa-4.0
3
+ language:
4
+ - bg
5
+ - cs
6
+ - da
7
+ - el
8
+ - es
9
+ - et
10
+ - fi
11
+ - hr
12
+ - hu
13
+ - it
14
+ - lt
15
+ - lv
16
+ - mt
17
+ - nl
18
+ - pl
19
+ - pt
20
+ - ro
21
+ - sk
22
+ - sl
23
+ - sv
24
+ ---
25
+
26
+ # SpidR VP-20
27
+
28
+ SpidR VP-20 is a SpidR model pretrained on a 6k-hour subset of VoxPopuli covering 20 languages
29
+ (all EU languages except English, French, and German)
30
+ for the [DiscoPhon benchmark](https://benchmarks.cognitive-ml.fr/discophon).
31
+ It was pretrained using the [`spidr`](https://github.com/facebookresearch/spidr) library.
32
+
33
+ You can load it with:
34
+
35
+ ```python
36
+ from spidr.models import SpidR
37
+ from torch.hub import load_state_dict_from_url
38
+
39
+ state_dict = load_state_dict_from_url("https://huggingface.co/coml/spidr-vp20/resolve/main/final.pt")
40
+ model = SpidR().eval()
41
+ model.load_state_dict(state_dict)
42
+ ```
43
+
44
+ ## Files
45
+
46
+ - `config.json`: Model configuration.
47
+ - `final.pt`: Model checkpoint.
48
+ - `full_checkpoint.pt`: Full checkpoint, with model, optimizer, etc.
49
+
50
+ ## Citing
51
+
52
+ Please cite the DiscoPhon paper:
53
+
54
+ ```bibtex
55
+ @misc{poli2026discophon,
56
+ title={{DiscoPhon}: Benchmarking the Unsupervised Discovery of Phoneme Inventories With Discrete Speech Units},
57
+ author={Maxime Poli and Manel Khentout and Angelo Ortiz Tandazo and Ewan Dunbar and Emmanuel Chemla and Emmanuel Dupoux},
58
+ year={2026},
59
+ eprint={2603.18612},
60
+ archivePrefix={arXiv},
61
+ primaryClass={cs.CL},
62
+ url={https://arxiv.org/abs/2603.18612},
63
+ }
64
+ ```
65
+
66
+ along with [SpidR](https://openreview.net/forum?id=E7XAFBpfZs).
67
+
config.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "extractor_mode": "layer_norm",
4
+ "extractor_conv_bias": false,
5
+ "extractor_conv_layer_config": [
6
+ [
7
+ 512,
8
+ 10,
9
+ 5
10
+ ],
11
+ [
12
+ 512,
13
+ 3,
14
+ 2
15
+ ],
16
+ [
17
+ 512,
18
+ 3,
19
+ 2
20
+ ],
21
+ [
22
+ 512,
23
+ 3,
24
+ 2
25
+ ],
26
+ [
27
+ 512,
28
+ 3,
29
+ 2
30
+ ],
31
+ [
32
+ 512,
33
+ 2,
34
+ 2
35
+ ],
36
+ [
37
+ 512,
38
+ 2,
39
+ 2
40
+ ]
41
+ ],
42
+ "encoder_embed_dim": 768,
43
+ "encoder_projection_dropout": 0,
44
+ "encoder_pos_conv_kernel": 95,
45
+ "encoder_pos_conv_groups": 16,
46
+ "encoder_pos_conv_depth": 5,
47
+ "encoder_num_layers": 12,
48
+ "encoder_num_heads": 12,
49
+ "encoder_attention_dropout": 0.1,
50
+ "encoder_ff_interm_features": 3072,
51
+ "encoder_ff_interm_dropout": 0.0,
52
+ "encoder_dropout": 0.1,
53
+ "encoder_layer_norm_first": false,
54
+ "encoder_layer_drop": 0.0,
55
+ "encoder_qkv_bias": false,
56
+ "codebook_size": 256,
57
+ "codebook_decay": 0.9,
58
+ "num_codebooks": 8,
59
+ "ema_start_decay": 0.999,
60
+ "ema_final_decay": 0.9999,
61
+ "ema_final_step": 30000,
62
+ "ema_exclude_layers": [
63
+ "pos_conv_embed"
64
+ ],
65
+ "freeze_step": 200000,
66
+ "ema_timescale": 20000,
67
+ "ema_threshold": 1e-07
68
+ }
69
+ }
final.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d228d96def90c94a3e12e04bb6245354be960003948c905f0bc1b5c8d40ac59
3
+ size 739393927
full_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:661e0e193efd008ff26bc3eb55a7866a3269724e91fc29fb69fe989813a01e87
3
+ size 1460369619