[Bug] Can't load the feature extractor for the 'speech_tokenizer' subfolder (missing preprocessor_config.json)
There is a bug in the model's internal from_pretrained logic when loading the feature extractor.
OS: Ubuntu
OSError: Can't load feature extractor for '/[...]/.cache/models--Qwen--Qwen3-TTS-12Hz-1.7B-CustomVoice/snapshots/b611c9f8f2ad5c741ed9c7a0a6a3750e43e0dfd7/speech_tokenizer'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name.
Otherwise, make sure '[...]/.cache/models--Qwen--Qwen3-TTS-12Hz-1.7B-CustomVoice/snapshots/b611c9f8f2ad5c741ed9c7a0a6a3750e43e0dfd7/speech_tokenizer' is the correct path to a directory containing a preprocessor_config.json file
The default setup now works for me with this script:
# Workaround: download the full model snapshot, patch the expected
# 'speech_tokenizer' subfolder layout, then run custom-voice inference.
import os

import soundfile as sf
import torch  # bug fix: torch.bfloat16 is used below but torch was never imported
from huggingface_hub import snapshot_download
from qwen_tts import Qwen3TTSModel

# 1. Ensure all files are downloaded to a single path
model_id = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
model_path = snapshot_download(repo_id=model_id)

# 2. Fix the directory structure issue
# The code expects files inside 'speech_tokenizer' subfolder,
# so we create a symbolic link to the root if it doesn't exist.
tokenizer_path = os.path.join(model_path, "speech_tokenizer")
if not os.path.exists(tokenizer_path):
    os.makedirs(tokenizer_path, exist_ok=True)

# Link root config files into the subfolder the model is looking at
for config_file in ["preprocessor_config.json", "config.json"]:
    src = os.path.join(model_path, config_file)
    dst = os.path.join(tokenizer_path, config_file)
    if os.path.exists(src) and not os.path.exists(dst):
        os.symlink(src, dst)

# 3. Load the model using the local fixed path
model = Qwen3TTSModel.from_pretrained(
    model_path,
    device_map="cuda:0",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    attn_implementation="sdpa",
)

# 4. Inference
wavs, sr = model.generate_custom_voice(
    text="其实我真的有发现,我是一个特别善于观察别人情绪的人。",
    language="Chinese",
    speaker="Vivian",
    instruction="用特别愤怒的语气说",
)
sf.write("output_fixed.wav", wavs[0], sr)
@jsemrau already shared fixed code, and thanks to him it works again.
I still want to share my solution:
download the models, fix the filesystem layout, and store the local paths in model_paths.json to load later.
# download_models.py
from huggingface_hub import snapshot_download
import os, json


def download_and_fix(repo_id):
    """Download a model snapshot and patch the speech_tokenizer layout.

    The loader expects the config files inside a 'speech_tokenizer'
    subfolder, so the root-level configs are symlinked into it.
    Returns the absolute local snapshot path.
    """
    local_dir = snapshot_download(repo_id=repo_id)
    # Fix the speech_tokenizer subfolder bug
    sub_dir = os.path.join(local_dir, "speech_tokenizer")
    os.makedirs(sub_dir, exist_ok=True)
    for name in ("preprocessor_config.json", "config.json"):
        source, target = os.path.join(local_dir, name), os.path.join(sub_dir, name)
        if os.path.exists(source) and not os.path.exists(target):
            os.symlink(source, target)
    print(f"Ready: {repo_id} → {local_dir}")
    return local_dir


paths = {
    "design": download_and_fix("Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign"),
    "clone": download_and_fix("Qwen/Qwen3-TTS-12Hz-1.7B-Base"),
}

# Save paths so main.py can read them
with open("/app/model_paths.json", "w") as f:
    json.dump(paths, f)
print("Done!", paths)
Load the local model paths from the fixed cache locations:
# Models (loaded once at startup)
# NOTE(review): this snippet appears to be excerpted from inside a startup
# function — the `global` statement below only has an effect inside a
# function body, and the `with` block's body lost its indentation in the
# paste. Confirm against the full main.py before reusing verbatim.
design_model = None
clone_model = None
global design_model, clone_model, startup_time
# Record when loading began (ISO-8601 string, local time).
startup_time = datetime.now().isoformat()
# Read the absolute local snapshot paths written by download_models.py.
with open("/app/model_paths.json") as f:
paths = json.load(f)
print("Loading VoiceDesign model...")
design_model = Qwen3TTSModel.from_pretrained(
paths["design"], # absolute local path, not "Qwen/..."
device_map="cuda:0",
dtype=torch.bfloat16,
)
print("Loading Clone model...")
clone_model = Qwen3TTSModel.from_pretrained(
paths["clone"], # absolute local path
device_map="cuda:0",
dtype=torch.bfloat16,
)
print("Models loaded!")
And as an extra, here are the Dockerfile and docker-compose.yml:
# GPU runtime image for the Qwen3-TTS FastAPI service.
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04

# Set environment variables
# NOTE: TRANSFORMERS_CACHE is deprecated in recent transformers releases;
# HF_HOME already covers it, but it is kept for older library versions.
ENV PYTHONUNBUFFERED=1 \
    DEBIAN_FRONTEND=noninteractive \
    CUDA_HOME=/usr/local/cuda \
    HF_HOME=/root/.cache/huggingface \
    TRANSFORMERS_CACHE=/root/.cache/huggingface

# Install Python 3.12 and system dependencies
RUN apt-get update && apt-get install -y \
    software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update && apt-get install -y \
    python3.12 \
    python3.12-dev \
    python3.12-venv \
    git \
    wget \
    curl \
    libsndfile1 \
    ffmpeg \
    sox \
    && rm -rf /var/lib/apt/lists/*

# Set Python 3.12 as default
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 \
    && update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1

# Install pip for Python 3.12
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12

# Install setuptools
RUN pip install setuptools

# Set working directory
WORKDIR /app

# Install PyTorch for CUDA 12.1 FIRST, before other packages
RUN pip install --no-cache-dir \
    torch \
    torchaudio \
    --index-url https://download.pytorch.org/whl/cu121

# Install Python dependencies
# Bug fix: quote "uvicorn[standard]" — unquoted square brackets are a shell
# glob pattern and can break the build if a matching file exists.
RUN pip install --no-cache-dir \
    fastapi \
    "uvicorn[standard]" \
    pydantic \
    soundfile \
    transformers \
    accelerate \
    huggingface_hub \
    qwen-tts

# Copy application code
COPY . /app

# Expose port
EXPOSE 8000

# Run the application: download/patch the models, then start the API server
CMD ["sh", "-c", "python download_models.py && python main.py"]
# docker-compose.yml — GPU-enabled TTS API service.
services:  # bug fix: was misspelled "servivces", which Compose rejects
  python-tts-api:
    build:
      context: ./python-tts-api
      dockerfile: Dockerfile
    volumes:
      - ./python-tts-api/voices:/app/voices
      # Persist the Hugging Face cache across container rebuilds
      - ./python-tts-api/hf_cache:/root/.cache/huggingface
    deploy:
      replicas: 1
      resources:
        limits:
          cpus: '4'
          memory: 8G
        reservations:
          devices:
            # Request all available NVIDIA GPUs for this service
            - driver: nvidia
              count: all
              capabilities:
                - gpu
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 60s
      timeout: 10s
      retries: 3
    restart: unless-stopped
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"