| ---
|
| base_model:
|
| - Qwen/Qwen3-0.6B
|
| language:
|
| - aae
|
| - aal
|
| - aao
|
| - ab
|
| - abb
|
| - abn
|
| - abr
|
| - abs
|
| - abv
|
| - acm
|
| - acw
|
| - acx
|
| - adf
|
| - adx
|
| - ady
|
| - aeb
|
| - aec
|
| - af
|
| - afb
|
| - afo
|
| - ahl
|
| - ahs
|
| - ajg
|
| - aju
|
| - ala
|
| - aln
|
| - alo
|
| - am
|
| - amu
|
| - an
|
| - anc
|
| - ank
|
| - anp
|
| - anw
|
| - aom
|
| - apc
|
| - apd
|
| - arb
|
| - arq
|
| - ars
|
| - ary
|
| - arz
|
| - as
|
| - ast
|
| - avl
|
| - awo
|
| - ayl
|
| - ayp
|
| - az
|
| - ba
|
| - bag
|
| - bas
|
| - bax
|
| - bba
|
| - bbj
|
| - bbl
|
| - bbu
|
| - bce
|
| - bci
|
| - bcs
|
| - bcy
|
| - bda
|
| - bde
|
| - bdm
|
| - be
|
| - beb
|
| - bew
|
| - bfd
|
| - bft
|
| - bg
|
| - bgp
|
| - bhb
|
| - bhh
|
| - bho
|
| - bhp
|
| - bhr
|
| - bjj
|
| - bjk
|
| - bjn
|
| - bjt
|
| - bkh
|
| - bkm
|
| - bky
|
| - bmm
|
| - bmq
|
| - bn
|
| - bnm
|
| - bnn
|
| - bns
|
| - bo
|
| - bou
|
| - bqg
|
| - br
|
| - bra
|
| - brh
|
| - bri
|
| - brx
|
| - bs
|
| - bsh
|
| - bsj
|
| - bsk
|
| - btm
|
| - btv
|
| - bug
|
| - bum
|
| - buo
|
| - bux
|
| - bwr
|
| - bxf
|
| - byc
|
| - bys
|
| - byv
|
| - byx
|
| - bzc
|
| - bzw
|
| - ca
|
| - ccg
|
| - ceb
|
| - cen
|
| - cfa
|
| - cgg
|
| - chq
|
| - cjk
|
| - ckb
|
| - ckl
|
| - ckr
|
| - cky
|
| - cnh
|
| - cpy
|
| - cs
|
| - cte
|
| - ctl
|
| - cut
|
| - cux
|
| - cv
|
| - cy
|
| - da
|
| - dag
|
| - dar
|
| - dav
|
| - dbd
|
| - dcc
|
| - de
|
| - deg
|
| - dgh
|
| - dgo
|
| - dje
|
| - dmk
|
| - dml
|
| - dru
|
| - dty
|
| - dua
|
| - dv
|
| - dyu
|
| - dzg
|
| - ebr
|
| - ebu
|
| - ego
|
| - eiv
|
| - eko
|
| - ekr
|
| - el
|
| - elm
|
| - en
|
| - eo
|
| - es
|
| - esu
|
| - et
|
| - eto
|
| - ets
|
| - etu
|
| - eu
|
| - ewo
|
| - ext
|
| - eyo
|
| - fa
|
| - fan
|
| - fat
|
| - ff
|
| - ffm
|
| - fi
|
| - fia
|
| - fil
|
| - fip
|
| - fkk
|
| - fmp
|
| - fr
|
| - fub
|
| - fuc
|
| - fue
|
| - fuf
|
| - fuh
|
| - fui
|
| - fuq
|
| - fuv
|
| - fy
|
| - ga
|
| - gbm
|
| - gbr
|
| - gby
|
| - gcc
|
| - gdf
|
| - gej
|
| - ges
|
| - ggg
|
| - gid
|
| - gig
|
| - giz
|
| - gjk
|
| - gju
|
| - gl
|
| - glw
|
| - gn
|
| - gol
|
| - gom
|
| - gsl
|
| - gu
|
| - gui
|
| - gur
|
| - guz
|
| - gv
|
| - gwc
|
| - gwe
|
| - gwt
|
| - gya
|
| - gyz
|
| - ha
|
| - hah
|
| - hao
|
| - haw
|
| - haz
|
| - hbb
|
| - he
|
| - hem
|
| - hi
|
| - hia
|
| - hkk
|
| - hla
|
| - hno
|
| - hoj
|
| - hr
|
| - hsb
|
| - ht
|
| - hu
|
| - hue
|
| - hul
|
| - hux
|
| - hwo
|
| - hy
|
| - hz
|
| - ia
|
| - ibb
|
| - id
|
| - ida
|
| - idu
|
| - ig
|
| - ijc
|
| - ijn
|
| - ik
|
| - ikw
|
| - is
|
| - ish
|
| - iso
|
| - it
|
| - its
|
| - itw
|
| - itz
|
| - ja
|
| - jal
|
| - jax
|
| - jgo
|
| - jmx
|
| - jns
|
| - jqr
|
| - juk
|
| - juo
|
| - jv
|
| - ka
|
| - kab
|
| - kai
|
| - kaj
|
| - kam
|
| - kbd
|
| - kbl
|
| - kbt
|
| - kcq
|
| - kdh
|
| - kea
|
| - keu
|
| - kfe
|
| - kfk
|
| - kfp
|
| - khg
|
| - khw
|
| - kj
|
| - kjc
|
| - kjk
|
| - kk
|
| - kln
|
| - kls
|
| - km
|
| - kmr
|
| - kmy
|
| - kn
|
| - kna
|
| - knn
|
| - ko
|
| - kol
|
| - koo
|
| - kpo
|
| - kqo
|
| - ks
|
| - ksd
|
| - ksf
|
| - kto
|
| - kuh
|
| - kvx
|
| - kw
|
| - kwm
|
| - kxp
|
| - ky
|
| - kyx
|
| - lag
|
| - lb
|
| - lcm
|
| - ldb
|
| - lg
|
| - lij
|
| - lir
|
| - lkb
|
| - lla
|
| - ln
|
| - lnu
|
| - lo
|
| - loa
|
| - lrk
|
| - lss
|
| - lt
|
| - ltg
|
| - lto
|
| - lua
|
| - luo
|
| - lus
|
| - lv
|
| - lwg
|
| - mab
|
| - maf
|
| - mai
|
| - mau
|
| - max
|
| - mbo
|
| - mcf
|
| - mcn
|
| - mcx
|
| - mdd
|
| - mde
|
| - mdf
|
| - mek
|
| - mer
|
| - meu
|
| - mfm
|
| - mfn
|
| - mfo
|
| - mfv
|
| - mgg
|
| - mgi
|
| - mhk
|
| - mhr
|
| - mi
|
| - mig
|
| - miu
|
| - mk
|
| - mkf
|
| - mki
|
| - ml
|
| - mlq
|
| - mn
|
| - mne
|
| - mni
|
| - mqy
|
| - mr
|
| - mrj
|
| - mrr
|
| - mrt
|
| - ms
|
| - mse
|
| - msh
|
| - msw
|
| - mt
|
| - mtr
|
| - mtu
|
| - mtx
|
| - mua
|
| - mug
|
| - mui
|
| - mve
|
| - mvy
|
| - mxs
|
| - mxu
|
| - mxy
|
| - my
|
| - myv
|
| - mzl
|
| - nal
|
| - nan
|
| - nap
|
| - nb
|
| - nbh
|
| - ncf
|
| - nco
|
| - ncx
|
| - ndi
|
| - ng
|
| - ngi
|
| - nhg
|
| - nhi
|
| - nhn
|
| - nhq
|
| - nja
|
| - nl
|
| - nla
|
| - nlv
|
| - nmg
|
| - nmz
|
| - nn
|
| - nnh
|
| - 'no'
|
| - noe
|
| - npi
|
| - nso
|
| - ny
|
| - nyu
|
| - oc
|
| - odk
|
| - odu
|
| - ogo
|
| - om
|
| - orc
|
| - oru
|
| - ory
|
| - os
|
| - pa
|
| - pbs
|
| - pbt
|
| - pbu
|
| - pcm
|
| - pex
|
| - phl
|
| - phr
|
| - pip
|
| - piy
|
| - pko
|
| - pl
|
| - plk
|
| - plt
|
| - pmq
|
| - pms
|
| - pmy
|
| - pnb
|
| - poc
|
| - poe
|
| - pow
|
| - prq
|
| - ps
|
| - pst
|
| - pt
|
| - pua
|
| - pwn
|
| - qug
|
| - qum
|
| - qup
|
| - qur
|
| - qus
|
| - quv
|
| - qux
|
| - quy
|
| - qva
|
| - qvi
|
| - qvj
|
| - qvl
|
| - qwa
|
| - qws
|
| - qxa
|
| - qxp
|
| - qxt
|
| - qxu
|
| - qxw
|
| - rag
|
| - rm
|
| - ro
|
| - rob
|
| - rof
|
| - roo
|
| - rth
|
| - ru
|
| - rup
|
| - rw
|
| - sa
|
| - sah
|
| - sat
|
| - sau
|
| - say
|
| - sbn
|
| - sc
|
| - scl
|
| - scn
|
| - sd
|
| - sei
|
| - shu
|
| - si
|
| - sip
|
| - siw
|
| - sjr
|
| - sk
|
| - skg
|
| - skr
|
| - sl
|
| - sn
|
| - snc
|
| - snk
|
| - so
|
| - sol
|
| - sps
|
| - sq
|
| - sr
|
| - src
|
| - sro
|
| - ssi
|
| - ste
|
| - sua
|
| - sv
|
| - sva
|
| - sw
|
| - szy
|
| - ta
|
| - tan
|
| - tar
|
| - tay
|
| - tbf
|
| - tcf
|
| - tcy
|
| - tdn
|
| - tdx
|
| - te
|
| - tg
|
| - tgc
|
| - th
|
| - the
|
| - thq
|
| - thr
|
| - thv
|
| - ti
|
| - tig
|
| - tio
|
| - tk
|
| - tkg
|
| - tkt
|
| - tli
|
| - tlp
|
| - tn
|
| - tok
|
| - tpl
|
| - tpz
|
| - tqp
|
| - tr
|
| - trp
|
| - trq
|
| - trv
|
| - trw
|
| - tt
|
| - ttj
|
| - ttr
|
| - ttu
|
| - tui
|
| - tul
|
| - tuq
|
| - tuv
|
| - tuy
|
| - tvo
|
| - tvu
|
| - tw
|
| - twu
|
| - txs
|
| - txy
|
| - udl
|
| - ug
|
| - uk
|
| - uki
|
| - umb
|
| - ur
|
| - ush
|
| - uz
|
| - uzn
|
| - vai
|
| - var
|
| - ver
|
| - vi
|
| - vmc
|
| - vmj
|
| - vmm
|
| - vmp
|
| - vmz
|
| - vot
|
| - vro
|
| - wbl
|
| - wci
|
| - weo
|
| - wes
|
| - wja
|
| - wji
|
| - wo
|
| - wof
|
| - xh
|
| - xhe
|
| - xka
|
| - xmf
|
| - xmv
|
| - xmw
|
| - xpe
|
| - xti
|
| - xtu
|
| - yaq
|
| - yav
|
| - yay
|
| - ydd
|
| - ydg
|
| - yer
|
| - 'yes'
|
| - yi
|
| - yo
|
| - yue
|
| - zga
|
| - zgh
|
| - zh
|
| - zoc
|
| - zoh
|
| - zor
|
| - zpv
|
| - zpy
|
| - ztg
|
| - ztn
|
| - ztp
|
| - zts
|
| - ztu
|
| - zu
|
| - zza
|
| license: apache-2.0
|
| pipeline_tag: text-to-speech
|
| tags:
|
| - zero-shot
|
| - multilingual
|
| - voice-cloning
|
| - voice-design
|
| ---
|
|
|
| # OmniVoice 🌍
|
|
|
| <p align="center">
|
| <img width="200" height="200" alt="OmniVoice" src="https://zhu-han.github.io/omnivoice/pics/omnivoice.jpg" />
|
| </p>
|
|
|
| <p align="center">
|
| <a href="https://huggingface.co/k2-fsa/OmniVoice"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-FFD21E" alt="Hugging Face Model"></a>
|
|
|
| <a href="https://huggingface.co/spaces/k2-fsa/OmniVoice"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Space-blue" alt="Hugging Face Space"></a>
|
|
|
| <a href="https://huggingface.co/papers/2604.00688"><img src="https://img.shields.io/badge/arXiv-Paper-B31B1B.svg" alt="arXiv Paper"></a>
|
|
|
| <a href="https://github.com/k2-fsa/OmniVoice"><img src="https://img.shields.io/badge/GitHub-Code-181717?logo=GitHub" alt="GitHub Code"></a>
|
|
|
| <a href="https://zhu-han.github.io/omnivoice"><img src="https://img.shields.io/badge/GitHub.io-Demo_Page-blue?logo=GitHub&style=flat-square" alt="Demo Page"></a>
|
| </p>
|
|
|
|
|
| OmniVoice is a massively multilingual zero-shot text-to-speech (TTS) model supporting over 600 languages. Built on a novel diffusion language model-style architecture, it delivers high-quality speech with superior inference speed and supports both voice cloning and voice design.
|
|
|
| - **Paper:** [OmniVoice: Towards Omnilingual Zero-Shot Text-to-Speech with Diffusion Language Models](https://huggingface.co/papers/2604.00688)
|
| - **Repository:** [GitHub](https://github.com/k2-fsa/OmniVoice)
|
| - **Demo:** [Hugging Face Space](https://huggingface.co/spaces/k2-fsa/OmniVoice)
|
|
|
| ## Key Features
|
|
|
| - **600+ Languages Supported**: The broadest language coverage among zero-shot TTS models.
|
| - **Voice Cloning**: State-of-the-art voice cloning quality from a short reference audio clip.
|
| - **Voice Design**: Control voices via assigned speaker attributes (gender, age, pitch, dialect/accent, whisper, etc.).
|
| - **Fine-grained Control**: Non-verbal symbols (e.g., `[laughter]`) and pronunciation correction via pinyin or phonemes.
|
| - **Fast Inference**: Real-time factor (RTF) as low as 0.025 (40x faster than real time).
|
| - **Diffusion Language Model-style Architecture**: A clean, streamlined, and scalable design that delivers both quality and speed.
|
|
|
| ## Sample Usage
|
|
|
| To get started, install the `omnivoice` library:
|
|
|
| > We recommend using a fresh virtual environment (e.g., `conda`, `venv`, etc.) to avoid conflicts.
|
|
|
| **Step 1**: Install PyTorch
|
|
|
| <details>
|
| <summary>NVIDIA GPU</summary>
|
|
|
| ```bash
|
| # Install the PyTorch build that matches your CUDA version, e.g.:
|
| pip install torch==2.8.0+cu128 torchaudio==2.8.0+cu128 --extra-index-url https://download.pytorch.org/whl/cu128
|
| ```
|
| > See the [official PyTorch site](https://pytorch.org/get-started/locally/) for instructions on installing other versions.
|
|
|
| </details>
|
|
|
| <details>
|
| <summary>Apple Silicon</summary>
|
|
|
| ```bash
|
| pip install torch==2.8.0 torchaudio==2.8.0
|
| ```
|
|
|
| </details>
|
|
|
| **Step 2**: Install OmniVoice
|
|
|
| ```bash
|
| pip install omnivoice
|
| ```
|
|
|
| ### Python API
|
|
|
| You can use OmniVoice for zero-shot voice cloning as follows:
|
|
|
| ```python
|
| from omnivoice import OmniVoice
|
| import torch
|
| import torchaudio
|
|
|
| # Load the model
|
| model = OmniVoice.from_pretrained(
|
| "k2-fsa/OmniVoice",
|
| device_map="cuda:0",
|
| dtype=torch.float16
|
| )
|
|
|
| # Generate audio
|
| audio = model.generate(
|
| text="Hello, this is a test of zero-shot voice cloning.",
|
| ref_audio="ref.wav",
|
| ref_text="Transcription of the reference audio.",
|
| ) # audio is a list of `torch.Tensor` with shape (1, T) at 24 kHz.
|
|
|
| torchaudio.save("out.wav", audio[0], 24000)
|
| ```
|
|
|
| For more generation modes (e.g., voice design), additional features (e.g., non-verbal symbols, pronunciation correction), and comprehensive usage instructions, see our [GitHub Repository](https://github.com/k2-fsa/OmniVoice).
|
|
|
| ## Citation
|
|
|
| ```bibtex
|
| @article{zhu2026omnivoice,
|
| title={OmniVoice: Towards Omnilingual Zero-Shot Text-to-Speech with Diffusion Language Models},
|
| author={Zhu, Han and Ye, Lingxuan and Kang, Wei and Yao, Zengwei and Guo, Liyong and Kuang, Fangjun and Han, Zhifeng and Zhuang, Weiji and Lin, Long and Povey, Daniel},
|
| journal={arXiv preprint arXiv:2604.00688},
|
| year={2026}
|
| }
|
| ``` |