[Bug] OSError: can't load feature extractor — 'speech_tokenizer' is expected to be a directory containing a preprocessor_config.json file

#1
by jsemrau - opened

There is a bug in the internal from_pretrained logic when loading the feature extractor.
OS:Ubuntu

OSError: Can't load feature extractor for '/[...]/.cache/models--Qwen--Qwen3-TTS-12Hz-1.7B-CustomVoice/snapshots/b611c9f8f2ad5c741ed9c7a0a6a3750e43e0dfd7/speech_tokenizer'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name.

Otherwise, make sure '[...]/.cache/models--Qwen--Qwen3-TTS-12Hz-1.7B-CustomVoice/snapshots/b611c9f8f2ad5c741ed9c7a0a6a3750e43e0dfd7/speech_tokenizer' is the correct path to a directory containing a preprocessor_config.json file

The default example works for me with this modified script:

import os

import soundfile as sf
import torch  # FIX: was missing, but torch.bfloat16 is used below
from huggingface_hub import snapshot_download
from qwen_tts import Qwen3TTSModel

# 1. Ensure all files are downloaded to a single local snapshot path
model_id = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
model_path = snapshot_download(repo_id=model_id)

# 2. Fix the directory structure issue:
# the loader looks for config files inside a 'speech_tokenizer' subfolder,
# so symlink the root config files there if the folder does not exist yet.
tokenizer_path = os.path.join(model_path, "speech_tokenizer")
if not os.path.exists(tokenizer_path):
    os.makedirs(tokenizer_path, exist_ok=True)
    # Link root config files into the subfolder the model is looking at
    for config_file in ("preprocessor_config.json", "config.json"):
        src = os.path.join(model_path, config_file)
        dst = os.path.join(tokenizer_path, config_file)
        if os.path.exists(src) and not os.path.exists(dst):
            os.symlink(src, dst)

# 3. Load the model from the fixed local path (not the "Qwen/..." repo id)
model = Qwen3TTSModel.from_pretrained(
    model_path,
    device_map="cuda:0",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    attn_implementation="sdpa",
)

# 4. Inference
wavs, sr = model.generate_custom_voice(
    text="其实我真的有发现,我是一个特别善于观察别人情绪的人。",
    language="Chinese",
    speaker="Vivian",
    instruction="用特别愤怒的语气说"
)
sf.write("output_fixed.wav", wavs[0], sr)

@jsemrau already shared a fixed script, and thanks to him it works again.

I still want to share my solution.

Download the models, fix the filesystem layout, and store the local paths in model_paths.json to load later:

# download_models.py
from huggingface_hub import snapshot_download
import os, json

def download_and_fix(repo_id):
    """Download *repo_id* and patch its snapshot layout.

    The model loader expects the preprocessor/config files inside a
    ``speech_tokenizer`` subfolder; this symlinks the root-level config
    files into that subfolder so from_pretrained can find them.

    Returns the absolute local snapshot path.
    """
    model_path = snapshot_download(repo_id=repo_id)

    # Fix the speech_tokenizer subfolder bug
    tokenizer_path = os.path.join(model_path, "speech_tokenizer")
    os.makedirs(tokenizer_path, exist_ok=True)
    for config_file in ("preprocessor_config.json", "config.json"):
        src = os.path.join(model_path, config_file)
        dst = os.path.join(tokenizer_path, config_file)
        if os.path.exists(src) and not os.path.exists(dst):
            os.symlink(src, dst)

    # FIX: original f-string ran repo id and path together with no separator
    print(f"Ready: {repo_id} -> {model_path}")
    return model_path

# Download both models and remember where each snapshot landed.
paths = {
    "design": download_and_fix("Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign"),
    "clone": download_and_fix("Qwen/Qwen3-TTS-12Hz-1.7B-Base"),
}

# Persist the paths so main.py can read them at startup.
with open("/app/model_paths.json", "w") as paths_file:
    json.dump(paths, paths_file)

print("Done!", paths)

Load the local model paths from the fixed cache paths:

# Models (loaded once at startup)
design_model = None
clone_model = None

global design_model, clone_model, startup_time
startup_time = datetime.now().isoformat()

with open("/app/model_paths.json") as f:
    paths = json.load(f)

print("Loading VoiceDesign model...")
design_model = Qwen3TTSModel.from_pretrained(
    paths["design"],   # absolute local path, not "Qwen/..."
    device_map="cuda:0",
    dtype=torch.bfloat16,
)

print("Loading Clone model...")
clone_model = Qwen3TTSModel.from_pretrained(
    paths["clone"],    # absolute local path
    device_map="cuda:0",
    dtype=torch.bfloat16,
)
print("Models loaded!")

As a bonus for the container crowd: the Dockerfile and docker-compose.yml.

# CUDA 12.1 runtime base image (runtime-only: no nvcc; matches the cu121 wheels below)
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04

# Set environment variables
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
# releases in favor of HF_HOME — both are set here for compatibility.
ENV PYTHONUNBUFFERED=1 \
    DEBIAN_FRONTEND=noninteractive \
    CUDA_HOME=/usr/local/cuda \
    HF_HOME=/root/.cache/huggingface \
    TRANSFORMERS_CACHE=/root/.cache/huggingface

# Install Python 3.12 (via the deadsnakes PPA) and system dependencies
# (libsndfile1/ffmpeg/sox are audio libs needed by soundfile / TTS tooling)
RUN apt-get update && apt-get install -y \
    software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update && apt-get install -y \
    python3.12 \
    python3.12-dev \
    python3.12-venv \
    git \
    wget \
    curl \
    libsndfile1 \
    ffmpeg \
    sox \
    && rm -rf /var/lib/apt/lists/*

# Set Python 3.12 as default
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 \
    && update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1

# Install pip for Python 3.12
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12

# Install setuptools
RUN pip install setuptools

# Set working directory
WORKDIR /app

# Install PyTorch for CUDA 12.1 FIRST, before other packages,
# so the cu121 wheels are resolved from the PyTorch index and the
# later plain-PyPI installs don't pull in a CPU-only torch.
RUN pip install --no-cache-dir \
    torch \
    torchaudio \
    --index-url https://download.pytorch.org/whl/cu121

# Install Python dependencies
RUN pip install --no-cache-dir \
    fastapi \
    uvicorn[standard] \
    pydantic \
    soundfile \
    transformers \
    accelerate \
    huggingface_hub \
    qwen-tts


# Copy application code
COPY . /app

# Expose port
EXPOSE 8000

# Run the application: fetch/fix the model snapshots, then start the API.
CMD ["sh", "-c", "python download_models.py && python main.py"]
# FIX: top-level key was misspelled "servivces", which makes Compose
# reject the file ("additional properties not allowed").
services:
  python-tts-api:
    build:
      context: ./python-tts-api
      dockerfile: Dockerfile
    volumes:
      # Persist generated voices and the HF model cache across restarts
      - ./python-tts-api/voices:/app/voices
      - ./python-tts-api/hf_cache:/root/.cache/huggingface
    deploy:
      replicas: 1
      resources:
        limits:
          cpus: '4'
          memory: 8G
        reservations:
          devices:
            # Expose all host NVIDIA GPUs to the container
            - driver: nvidia
              count: all
              capabilities:
                - gpu
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 60s
      timeout: 10s
      retries: 3
    restart: unless-stopped
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"

Sign up or log in to comment