commit
Browse files- .gitignore +14 -0
- app.py +23 -7
- realtime_server.py +123 -32
- speech_io.py +51 -27
.gitignore
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.py[cod]
|
| 3 |
+
*$py.class
|
| 4 |
+
.DS_Store
|
| 5 |
+
*.ipynb_checkpoints/
|
| 6 |
+
env/
|
| 7 |
+
.venv/
|
| 8 |
+
venv/
|
| 9 |
+
node_modules/
|
| 10 |
+
.trae/
|
| 11 |
+
*.wav
|
| 12 |
+
*.mp3
|
| 13 |
+
*.flac
|
| 14 |
+
*.ogg
|
app.py
CHANGED
|
@@ -13,6 +13,8 @@ from llm import load_llm
|
|
| 13 |
from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
|
| 14 |
|
| 15 |
from speech_io import transcribe_audio, synthesize_speech
|
|
|
|
|
|
|
| 16 |
|
| 17 |
# =====================================================
|
| 18 |
# INITIALISIERUNG (global)
|
|
@@ -97,7 +99,7 @@ def chatbot_text(user_message, history):
|
|
| 97 |
# VOICE CHATBOT
|
| 98 |
# =====================================================
|
| 99 |
|
| 100 |
-
def chatbot_voice(audio_path, history):
|
| 101 |
# 1. Speech → Text
|
| 102 |
text = transcribe_audio(audio_path)
|
| 103 |
if not text:
|
|
@@ -118,7 +120,7 @@ def chatbot_voice(audio_path, history):
|
|
| 118 |
history = history + [{"role": "assistant", "content": bot_msg}]
|
| 119 |
|
| 120 |
# 3. Text → Speech
|
| 121 |
-
audio = synthesize_speech(bot_msg)
|
| 122 |
|
| 123 |
return history, audio, ""
|
| 124 |
|
|
@@ -126,13 +128,13 @@ def chatbot_voice(audio_path, history):
|
|
| 126 |
# LAST ANSWER → TTS
|
| 127 |
# =====================================================
|
| 128 |
|
| 129 |
-
def read_last_answer(history):
    """Synthesize speech for the most recent assistant message.

    Args:
        history: Chat history as a sequence of dicts carrying "role" and
            "content" keys; may be empty or None.

    Returns:
        Whatever ``synthesize_speech`` returns for the newest assistant
        message, or None when the history is empty or holds no assistant
        message.
    """
    if not history:
        return None

    # Scan from the newest entry backwards; the first assistant hit wins.
    for entry in reversed(history):
        if entry["role"] != "assistant":
            continue
        return synthesize_speech(entry["content"])

    return None
|
| 138 |
|
|
@@ -175,21 +177,36 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
|
|
| 175 |
gr.Markdown("### 🎙️ Spracheingabe")
|
| 176 |
voice_in = gr.Audio(sources=["microphone"], type="filepath")
|
| 177 |
voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
|
| 179 |
voice_btn = gr.Button("Sprechen & senden")
|
| 180 |
voice_btn.click(
|
| 181 |
chatbot_voice,
|
| 182 |
-
[voice_in, chatbot],
|
| 183 |
[chatbot, voice_out, msg]
|
| 184 |
)
|
| 185 |
|
| 186 |
read_btn = gr.Button("🔁 Antwort erneut vorlesen")
|
| 187 |
read_btn.click(
|
| 188 |
read_last_answer,
|
| 189 |
-
[chatbot],
|
| 190 |
[voice_out]
|
| 191 |
)
|
| 192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
clear_btn = gr.Button("Chat zurücksetzen")
|
| 194 |
clear_btn.click(lambda: [], None, chatbot)
|
| 195 |
|
|
@@ -208,4 +225,3 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
|
|
| 208 |
|
| 209 |
if __name__ == "__main__":
|
| 210 |
demo.launch()
|
| 211 |
-
|
|
|
|
| 13 |
from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
|
| 14 |
|
| 15 |
from speech_io import transcribe_audio, synthesize_speech
|
| 16 |
+
import sys
|
| 17 |
+
sys.dont_write_bytecode = True
|
| 18 |
|
| 19 |
# =====================================================
|
| 20 |
# INITIALISIERUNG (global)
|
|
|
|
| 99 |
# VOICE CHATBOT
|
| 100 |
# =====================================================
|
| 101 |
|
| 102 |
+
def chatbot_voice(audio_path, history, tts_model_id):
|
| 103 |
# 1. Speech → Text
|
| 104 |
text = transcribe_audio(audio_path)
|
| 105 |
if not text:
|
|
|
|
| 120 |
history = history + [{"role": "assistant", "content": bot_msg}]
|
| 121 |
|
| 122 |
# 3. Text → Speech
|
| 123 |
+
audio = synthesize_speech(bot_msg, tts_model_id)
|
| 124 |
|
| 125 |
return history, audio, ""
|
| 126 |
|
|
|
|
| 128 |
# LAST ANSWER → TTS
|
| 129 |
# =====================================================
|
| 130 |
|
| 131 |
+
def read_last_answer(history, tts_model_id=None):
    """Read the latest assistant reply aloud again.

    Args:
        history: Chat history as a sequence of dicts carrying "role" and
            "content" keys; may be empty or None.
        tts_model_id: Optional TTS model identifier, forwarded unchanged to
            ``synthesize_speech``.

    Returns:
        The result of ``synthesize_speech`` for the newest assistant message,
        or None when the history is empty or contains no assistant message.
    """
    if not history:
        return None

    # Pick the newest assistant entry, if there is one.
    latest = next(
        (entry for entry in reversed(history) if entry["role"] == "assistant"),
        None,
    )
    if latest is None:
        return None

    return synthesize_speech(latest["content"], tts_model_id)
|
| 140 |
|
|
|
|
| 177 |
gr.Markdown("### 🎙️ Spracheingabe")
|
| 178 |
voice_in = gr.Audio(sources=["microphone"], type="filepath")
|
| 179 |
voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
|
| 180 |
+
tts_lang = gr.Dropdown(
|
| 181 |
+
label="TTS Sprache",
|
| 182 |
+
choices=[
|
| 183 |
+
"facebook/mms-tts-deu",
|
| 184 |
+
"facebook/mms-tts-vie",
|
| 185 |
+
"facebook/mms-tts-eng",
|
| 186 |
+
],
|
| 187 |
+
value="facebook/mms-tts-deu",
|
| 188 |
+
)
|
| 189 |
|
| 190 |
voice_btn = gr.Button("Sprechen & senden")
|
| 191 |
voice_btn.click(
|
| 192 |
chatbot_voice,
|
| 193 |
+
[voice_in, chatbot, tts_lang],
|
| 194 |
[chatbot, voice_out, msg]
|
| 195 |
)
|
| 196 |
|
| 197 |
read_btn = gr.Button("🔁 Antwort erneut vorlesen")
|
| 198 |
read_btn.click(
|
| 199 |
read_last_answer,
|
| 200 |
+
[chatbot, tts_lang],
|
| 201 |
[voice_out]
|
| 202 |
)
|
| 203 |
|
| 204 |
+
gr.Markdown("### ⚡ Voice (Realtime) – thử nghiệm")
|
| 205 |
+
gr.Markdown("Sử dụng OpenAI Realtime API cho hội thoại nói. Mở trang test bên dưới.")
|
| 206 |
+
gr.HTML("""
|
| 207 |
+
<iframe src="http://localhost:8000/" style="width:100%;height:300px;border:1px solid #ccc"></iframe>
|
| 208 |
+
""")
|
| 209 |
+
|
| 210 |
clear_btn = gr.Button("Chat zurücksetzen")
|
| 211 |
clear_btn.click(lambda: [], None, chatbot)
|
| 212 |
|
|
|
|
| 225 |
|
| 226 |
if __name__ == "__main__":
|
| 227 |
demo.launch()
|
|
|
realtime_server.py
CHANGED
|
@@ -11,6 +11,8 @@ from typing import Optional
|
|
| 11 |
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
| 12 |
from fastapi.responses import HTMLResponse
|
| 13 |
import websockets
|
|
|
|
|
|
|
| 14 |
|
| 15 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 16 |
OPENAI_REALTIME_MODEL = os.getenv("OPENAI_REALTIME_MODEL", "gpt-4o-realtime-preview")
|
|
@@ -29,16 +31,64 @@ html = """
|
|
| 29 |
<button id="startBtn">Start Recording</button>
|
| 30 |
<button id="stopBtn" disabled>Stop Recording</button>
|
| 31 |
<div id="status">Status: Ready</div>
|
|
|
|
| 32 |
<div id="transcript"></div>
|
|
|
|
| 33 |
|
| 34 |
<script>
|
| 35 |
let mediaRecorder;
|
| 36 |
let audioChunks = [];
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
document.getElementById('startBtn').onclick = async () => {
|
|
|
|
| 39 |
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
| 40 |
mediaRecorder = new MediaRecorder(stream);
|
| 41 |
|
|
|
|
|
|
|
|
|
|
| 42 |
mediaRecorder.ondataavailable = (event) => {
|
| 43 |
audioChunks.push(event.data);
|
| 44 |
};
|
|
@@ -46,23 +96,13 @@ html = """
|
|
| 46 |
mediaRecorder.onstop = async () => {
|
| 47 |
const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
|
| 48 |
audioChunks = [];
|
| 49 |
-
|
| 50 |
-
// Convert to base64
|
| 51 |
const reader = new FileReader();
|
| 52 |
reader.readAsDataURL(audioBlob);
|
| 53 |
reader.onloadend = () => {
|
| 54 |
const base64data = reader.result.split(',')[1];
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
headers: { 'Content-Type': 'application/json' },
|
| 59 |
-
body: JSON.stringify({ audio: base64data })
|
| 60 |
-
})
|
| 61 |
-
.then(response => response.json())
|
| 62 |
-
.then(data => {
|
| 63 |
-
document.getElementById('transcript').innerHTML =
|
| 64 |
-
`<strong>Transkription:</strong> ${data.transcript}`;
|
| 65 |
-
});
|
| 66 |
};
|
| 67 |
};
|
| 68 |
|
|
@@ -76,7 +116,6 @@ html = """
|
|
| 76 |
mediaRecorder.stop();
|
| 77 |
document.getElementById('startBtn').disabled = false;
|
| 78 |
document.getElementById('stopBtn').disabled = true;
|
| 79 |
-
document.getElementById('status').textContent = 'Status: Processing...';
|
| 80 |
};
|
| 81 |
</script>
|
| 82 |
</body>
|
|
@@ -136,26 +175,78 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
| 136 |
extra_headers=headers
|
| 137 |
) as openai_ws:
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
try:
|
| 142 |
-
|
| 143 |
-
data = await websocket.receive_text()
|
| 144 |
-
await openai_ws.send(data)
|
| 145 |
except WebSocketDisconnect:
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
async def forward_to_client():
|
| 149 |
try:
|
| 150 |
-
|
| 151 |
-
await websocket.send_text(message)
|
| 152 |
except:
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
|
| 160 |
except Exception as e:
|
| 161 |
print(f"WebSocket error: {e}")
|
|
@@ -164,4 +255,4 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
| 164 |
|
| 165 |
if __name__ == "__main__":
|
| 166 |
import uvicorn
|
| 167 |
-
uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|
| 11 |
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
| 12 |
from fastapi.responses import HTMLResponse
|
| 13 |
import websockets
|
| 14 |
+
import sys
|
| 15 |
+
sys.dont_write_bytecode = True
|
| 16 |
|
| 17 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 18 |
OPENAI_REALTIME_MODEL = os.getenv("OPENAI_REALTIME_MODEL", "gpt-4o-realtime-preview")
|
|
|
|
| 31 |
<button id="startBtn">Start Recording</button>
|
| 32 |
<button id="stopBtn" disabled>Stop Recording</button>
|
| 33 |
<div id="status">Status: Ready</div>
|
| 34 |
+
<div><label>Instructions: <input id="instructions" placeholder="Optional prompt" /></label></div>
|
| 35 |
<div id="transcript"></div>
|
| 36 |
+
<audio id="player" controls></audio>
|
| 37 |
|
| 38 |
<script>
|
| 39 |
let mediaRecorder;
|
| 40 |
let audioChunks = [];
|
| 41 |
+
let ws;
|
| 42 |
+
|
| 43 |
+
// Return the shared WebSocket to the /ws endpoint, creating it on demand.
// A socket that is already open OR still connecting is reused, so repeated
// clicks on "Start Recording" never open duplicate connections.
function ensureWS() {
    if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) {
        return ws;
    }

    const scheme = location.protocol === 'https:' ? 'wss://' : 'ws://';
    ws = new WebSocket(scheme + location.host + '/ws');

    ws.onopen = () => {
        document.getElementById('status').textContent = 'Status: WS connected';
    };

    ws.onmessage = (event) => {
        let msg;
        try {
            msg = JSON.parse(event.data);
        } catch (err) {
            // Non-JSON frames are not part of the protocol; log instead of hiding them.
            console.warn('Ignoring non-JSON WebSocket message', err);
            return;
        }
        if (!msg) return;

        if (msg.type === 'transcript_delta') {
            const el = document.getElementById('transcript');
            // Keep the label and the running text in separate nodes: appending to
            // textContent of the whole element would repeat the "Transcript:" label
            // on every delta, and writing server text via innerHTML would let it
            // inject markup.
            let textEl = document.getElementById('transcriptText');
            if (!textEl) {
                el.innerHTML = '<strong>Transcript:</strong> <span id="transcriptText"></span>';
                textEl = document.getElementById('transcriptText');
            }
            textEl.textContent += msg.text || '';
        } else if (msg.type === 'response_completed') {
            if (msg.audio) {
                const player = document.getElementById('player');
                // Release the previous object URL so old audio blobs can be freed.
                if (player.src && player.src.startsWith('blob:')) {
                    URL.revokeObjectURL(player.src);
                }
                player.src = URL.createObjectURL(base64ToWavBlob(msg.audio));
                // play() returns a promise that rejects when autoplay is blocked.
                const started = player.play();
                if (started && typeof started.catch === 'function') {
                    started.catch((err) => console.warn('Audio playback failed', err));
                }
            }
            document.getElementById('status').textContent = 'Status: Completed';
        }
    };

    ws.onclose = () => {
        document.getElementById('status').textContent = 'Status: WS closed';
    };

    return ws;
}
|
| 73 |
+
|
| 74 |
+
// Decode a base64 string and wrap the resulting bytes in a Blob typed 'audio/wav'.
function base64ToWavBlob(base64) {
    // atob yields a binary string: one character per decoded byte.
    const binary = atob(base64);
    const bytes = Uint8Array.from(binary, (ch) => ch.charCodeAt(0));
    return new Blob([bytes], { type: 'audio/wav' });
}
|
| 83 |
+
|
| 84 |
document.getElementById('startBtn').onclick = async () => {
|
| 85 |
+
ensureWS();
|
| 86 |
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
| 87 |
mediaRecorder = new MediaRecorder(stream);
|
| 88 |
|
| 89 |
+
audioChunks = [];
|
| 90 |
+
document.getElementById('transcript').textContent = '';
|
| 91 |
+
|
| 92 |
mediaRecorder.ondataavailable = (event) => {
|
| 93 |
audioChunks.push(event.data);
|
| 94 |
};
|
|
|
|
| 96 |
mediaRecorder.onstop = async () => {
|
| 97 |
const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
|
| 98 |
audioChunks = [];
|
|
|
|
|
|
|
| 99 |
const reader = new FileReader();
|
| 100 |
reader.readAsDataURL(audioBlob);
|
| 101 |
reader.onloadend = () => {
|
| 102 |
const base64data = reader.result.split(',')[1];
|
| 103 |
+
const instructions = document.getElementById('instructions').value || '';
|
| 104 |
+
ws.send(JSON.stringify({ type: 'utterance', audio: base64data, instructions }));
|
| 105 |
+
document.getElementById('status').textContent = 'Status: Sending to OpenAI...';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
};
|
| 107 |
};
|
| 108 |
|
|
|
|
| 116 |
mediaRecorder.stop();
|
| 117 |
document.getElementById('startBtn').disabled = false;
|
| 118 |
document.getElementById('stopBtn').disabled = true;
|
|
|
|
| 119 |
};
|
| 120 |
</script>
|
| 121 |
</body>
|
|
|
|
| 175 |
extra_headers=headers
|
| 176 |
) as openai_ws:
|
| 177 |
|
| 178 |
+
async def process_utterance(b64_wav: str, instructions: Optional[str] = None):
    """Send one recorded utterance to the Realtime API and relay the reply.

    The audio is appended to the input buffer, committed, and a response with
    audio and text is requested. Server events are then read until the
    response finishes or the API reports an error. Deltas are forwarded to
    the browser as ``audio_delta`` / ``transcript_delta`` messages, followed
    by one ``response_completed`` message with the full transcript and audio.

    Args:
        b64_wav: Base64-encoded audio received from the browser.
        instructions: Optional per-response instructions; omitted from the
            request when empty.
    """
    import base64  # local import so the block does not rely on file-level imports

    # The append event carries the audio as a plain base64 string.
    # NOTE(review): the API expects audio in the session's input format
    # (PCM16 by default), while the browser records with MediaRecorder —
    # confirm the payload is converted before it reaches this point.
    await openai_ws.send(json.dumps({
        "type": "input_audio_buffer.append",
        "audio": b64_wav,
    }))
    await openai_ws.send(json.dumps({
        "type": "input_audio_buffer.commit"
    }))

    response_cfg = {"modalities": ["audio", "text"]}
    if instructions:
        response_cfg["instructions"] = instructions
    await openai_ws.send(json.dumps({
        "type": "response.create",
        "response": response_cfg,
    }))

    # Event names differ between API revisions; accept all known spellings,
    # including the ones the previous implementation listened for.
    audio_events = ("response.audio.delta", "response.output_audio.delta")
    transcript_events = (
        "response.audio_transcript.delta",
        "response.output_audio_transcript.delta",
        "response.text.delta",
        "response.output_text.delta",
        "response.transcript.delta",
    )
    done_events = ("response.done", "response.completed")

    audio_chunks = []
    transcript_parts = []
    while True:
        raw = await openai_ws.recv()
        try:
            event = json.loads(raw)
        except ValueError:
            # Covers malformed JSON and undecodable bytes; skip the frame.
            continue
        if not isinstance(event, dict):
            continue

        etype = event.get("type")
        if etype in audio_events:
            data = event.get("delta") or event.get("data")
            if data:
                audio_chunks.append(data)
                await websocket.send_text(json.dumps({
                    "type": "audio_delta",
                    "data": data
                }))
        elif etype in transcript_events:
            delta = event.get("delta", "")
            transcript_parts.append(delta)
            await websocket.send_text(json.dumps({
                "type": "transcript_delta",
                "text": delta
            }))
        elif etype in done_events:
            # Each chunk is an independent base64 string; decode and re-encode
            # so padding inside a chunk cannot corrupt the combined payload.
            pcm = b"".join(base64.b64decode(chunk) for chunk in audio_chunks)
            await websocket.send_text(json.dumps({
                "type": "response_completed",
                "transcript": "".join(transcript_parts),
                "audio": base64.b64encode(pcm).decode("ascii")
            }))
            break
        elif etype == "error":
            # Without this branch a rejected request would leave the loop
            # waiting forever for a completion event that never arrives.
            await websocket.send_text(json.dumps({
                "type": "error",
                "error": event.get("error")
            }))
            break
|
| 230 |
+
|
| 231 |
+
# Main loop: receive client messages
|
| 232 |
+
while True:
|
| 233 |
try:
|
| 234 |
+
text = await websocket.receive_text()
|
|
|
|
|
|
|
| 235 |
except WebSocketDisconnect:
|
| 236 |
+
break
|
|
|
|
|
|
|
| 237 |
try:
|
| 238 |
+
msg = json.loads(text)
|
|
|
|
| 239 |
except:
|
| 240 |
+
continue
|
| 241 |
+
|
| 242 |
+
mtype = msg.get("type")
|
| 243 |
+
if mtype == "utterance":
|
| 244 |
+
b64_wav = msg.get("audio", "")
|
| 245 |
+
instructions = msg.get("instructions", "")
|
| 246 |
+
if b64_wav:
|
| 247 |
+
await process_utterance(b64_wav, instructions)
|
| 248 |
+
elif mtype == "ping":
|
| 249 |
+
await websocket.send_text(json.dumps({"type": "pong"}))
|
| 250 |
|
| 251 |
except Exception as e:
|
| 252 |
print(f"WebSocket error: {e}")
|
|
|
|
| 255 |
|
| 256 |
if __name__ == "__main__":
|
| 257 |
import uvicorn
|
| 258 |
+
uvicorn.run(app, host="0.0.0.0", port=8000)
|
speech_io.py
CHANGED
|
@@ -8,18 +8,20 @@ Sprachbasierte Ein-/Ausgabe:
|
|
| 8 |
Dieses File ist 100% stabil für HuggingFace Spaces.
|
| 9 |
"""
|
| 10 |
|
| 11 |
-
from typing import Optional, Tuple
|
| 12 |
import numpy as np
|
| 13 |
import soundfile as sf
|
| 14 |
from scipy.signal import butter, filtfilt
|
| 15 |
from transformers import pipeline
|
|
|
|
|
|
|
| 16 |
|
| 17 |
# Modelle
|
| 18 |
ASR_MODEL_ID = "openai/whisper-small"
|
| 19 |
TTS_MODEL_ID = "facebook/mms-tts-deu"
|
| 20 |
|
| 21 |
_asr = None
|
| 22 |
-
|
| 23 |
|
| 24 |
# ========================================================
|
| 25 |
# STT PIPELINE
|
|
@@ -42,15 +44,15 @@ def get_asr_pipeline():
|
|
| 42 |
# TTS PIPELINE
|
| 43 |
# ========================================================
|
| 44 |
|
| 45 |
-
def get_tts_pipeline():
|
| 46 |
-
|
| 47 |
-
if
|
| 48 |
-
print(f">>> Lade TTS Modell: {
|
| 49 |
-
|
| 50 |
task="text-to-speech",
|
| 51 |
-
model=
|
| 52 |
)
|
| 53 |
-
return
|
| 54 |
|
| 55 |
# ========================================================
|
| 56 |
# AUDIO FILTER – Noise Reduction + Highpass
|
|
@@ -76,6 +78,42 @@ def apply_fade(audio, sr, duration_ms=10):
|
|
| 76 |
|
| 77 |
return audio
|
| 78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
# ========================================================
|
| 80 |
# SPEECH-TO-TEXT (STT)
|
| 81 |
# ========================================================
|
|
@@ -88,24 +126,11 @@ def transcribe_audio(audio_path: str) -> str:
|
|
| 88 |
if audio_path is None:
|
| 89 |
return ""
|
| 90 |
|
| 91 |
-
# WAV einlesen (soundfile garantiert PCM korrekt)
|
| 92 |
data, sr = sf.read(audio_path)
|
| 93 |
-
|
| 94 |
-
# immer Mono
|
| 95 |
-
if len(data.shape) > 1:
|
| 96 |
-
data = data[:, 0]
|
| 97 |
-
|
| 98 |
-
# Whisper >30s vermeiden
|
| 99 |
-
MAX_SAMPLES = sr * 30
|
| 100 |
-
if len(data) > MAX_SAMPLES:
|
| 101 |
-
data = data[:MAX_SAMPLES]
|
| 102 |
-
|
| 103 |
asr = get_asr_pipeline()
|
| 104 |
-
|
| 105 |
print(">>> Transkribiere Audio...")
|
| 106 |
-
result = asr(
|
| 107 |
-
{"array": data, "sampling_rate": sr},
|
| 108 |
-
)
|
| 109 |
|
| 110 |
text = result.get("text", "").strip()
|
| 111 |
print("ASR:", text)
|
|
@@ -115,11 +140,11 @@ def transcribe_audio(audio_path: str) -> str:
|
|
| 115 |
# TEXT-TO-SPEECH (TTS)
|
| 116 |
# ========================================================
|
| 117 |
|
| 118 |
-
def synthesize_speech(text: str):
|
| 119 |
if not text or not text.strip():
|
| 120 |
return None
|
| 121 |
|
| 122 |
-
tts = get_tts_pipeline()
|
| 123 |
out = tts(text)
|
| 124 |
|
| 125 |
# rohes Audio from MMS (float32 [-1, 1])
|
|
@@ -155,4 +180,3 @@ def synthesize_speech(text: str):
|
|
| 155 |
|
| 156 |
# Rückgabe: (sr, np.int16 array)
|
| 157 |
return (sr, audio_int16)
|
| 158 |
-
|
|
|
|
| 8 |
Dieses File ist 100% stabil für HuggingFace Spaces.
|
| 9 |
"""
|
| 10 |
|
| 11 |
+
from typing import Any, Dict, Optional, Tuple
|
| 12 |
import numpy as np
|
| 13 |
import soundfile as sf
|
| 14 |
from scipy.signal import butter, filtfilt
|
| 15 |
from transformers import pipeline
|
| 16 |
+
import librosa
|
| 17 |
+
import webrtcvad
|
| 18 |
|
| 19 |
# Modelle
|
| 20 |
ASR_MODEL_ID = "openai/whisper-small"
|
| 21 |
TTS_MODEL_ID = "facebook/mms-tts-deu"
|
| 22 |
|
| 23 |
_asr = None
|
| 24 |
+
# Loaded TTS pipelines keyed by model id, so each model is built only once.
# Annotated with typing.Any; the previous `any` was the builtin function.
_tts_cache: Dict[str, Any] = {}
|
| 25 |
|
| 26 |
# ========================================================
|
| 27 |
# STT PIPELINE
|
|
|
|
| 44 |
# TTS PIPELINE
|
| 45 |
# ========================================================
|
| 46 |
|
| 47 |
+
def get_tts_pipeline(model_id: Optional[str] = None):
    """Return the text-to-speech pipeline for a model, loading it on first use.

    Args:
        model_id: Model identifier to load; falls back to ``TTS_MODEL_ID``
            when None or empty.

    Returns:
        The pipeline object stored in ``_tts_cache`` for the resolved id.
    """
    resolved = model_id if model_id else TTS_MODEL_ID

    # Serve repeat requests from the cache.
    if resolved in _tts_cache:
        return _tts_cache[resolved]

    print(f">>> Lade TTS Modell: {resolved}")
    tts = pipeline(task="text-to-speech", model=resolved)
    _tts_cache[resolved] = tts
    return tts
|
| 56 |
|
| 57 |
# ========================================================
|
| 58 |
# AUDIO FILTER – Noise Reduction + Highpass
|
|
|
|
| 78 |
|
| 79 |
return audio
|
| 80 |
|
| 81 |
+
def _vad_trim(audio16: np.ndarray, sr: int) -> np.ndarray:
    """Trim leading and trailing non-speech from 16-bit PCM audio.

    The signal is cut into 30 ms frames and each frame is classified with
    WebRTC VAD (aggressiveness 2). The result spans the first through the
    last voiced frame. Trimming is best-effort: the input is returned
    unchanged when it cannot be analysed or no speech is found.

    Args:
        audio16: Mono samples; the visible caller passes an int16 array.
        sr: Sample rate in Hz.

    Returns:
        The trimmed slice of ``audio16``, or ``audio16`` itself when the
        sample rate is unsupported, the clip is shorter than one frame, or
        no frame is classified as speech.
    """
    frame_ms = 30
    frame_len = int(sr * frame_ms / 1000)

    # Run the cheap guards before touching the VAD. webrtcvad only accepts
    # 8/16/32/48 kHz input and raises on anything else.
    if sr not in (8000, 16000, 32000, 48000) or frame_len <= 0:
        return audio16
    if len(audio16) < frame_len:
        return audio16

    vad = webrtcvad.Vad(2)
    # A trailing partial frame is never classified, as before.
    voiced = [
        vad.is_speech(audio16[pos:pos + frame_len].tobytes(), sample_rate=sr)
        for pos in range(0, len(audio16) - frame_len + 1, frame_len)
    ]

    first = next((idx for idx, flag in enumerate(voiced) if flag), None)
    if first is None:
        return audio16
    last = next(
        len(voiced) - 1 - idx for idx, flag in enumerate(reversed(voiced)) if flag
    )

    start = first * frame_len
    end = min((last + 1) * frame_len, len(audio16))
    return audio16[start:end]
|
| 103 |
+
|
| 104 |
+
def preprocess_audio_for_stt(raw: np.ndarray, sr: int) -> Tuple[np.ndarray, int]:
    """Prepare recorded audio for the speech-to-text model.

    Steps: keep the first channel of multi-channel input, resample to 16 kHz,
    peak-normalise, convert to int16, trim non-speech with ``_vad_trim``, cap
    the length at 30 seconds, and convert back to float32.

    Args:
        raw: Audio samples; arrays with more than one dimension are reduced
            to column 0.
        sr: Sample rate of ``raw`` in Hz.

    Returns:
        Tuple of the processed float32 samples and the sample rate 16000.
    """
    target_sr = 16000
    max_seconds = 30

    first_channel = raw[:, 0] if raw.ndim > 1 else raw
    wave = librosa.to_mono(first_channel.astype(np.float32))
    wave = librosa.resample(wave, orig_sr=sr, target_sr=target_sr)

    # Peak-normalise; the epsilon avoids division by zero on silent input.
    peak = np.max(np.abs(wave))
    wave = wave / (peak + 1e-9)

    # The VAD step works on 16-bit PCM.
    pcm = np.clip(wave * 32767, -32768, 32767).astype(np.int16)
    pcm = _vad_trim(pcm, target_sr)

    # Keep at most the first 30 seconds.
    pcm = pcm[: target_sr * max_seconds]
    return pcm.astype(np.float32) / 32767.0, target_sr
|
| 116 |
+
|
| 117 |
# ========================================================
|
| 118 |
# SPEECH-TO-TEXT (STT)
|
| 119 |
# ========================================================
|
|
|
|
| 126 |
if audio_path is None:
|
| 127 |
return ""
|
| 128 |
|
|
|
|
| 129 |
data, sr = sf.read(audio_path)
|
| 130 |
+
data, sr = preprocess_audio_for_stt(data, sr)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
asr = get_asr_pipeline()
|
|
|
|
| 132 |
print(">>> Transkribiere Audio...")
|
| 133 |
+
result = asr({"array": data, "sampling_rate": sr})
|
|
|
|
|
|
|
| 134 |
|
| 135 |
text = result.get("text", "").strip()
|
| 136 |
print("ASR:", text)
|
|
|
|
| 140 |
# TEXT-TO-SPEECH (TTS)
|
| 141 |
# ========================================================
|
| 142 |
|
| 143 |
+
def synthesize_speech(text: str, tts_model_id: Optional[str] = None):
|
| 144 |
if not text or not text.strip():
|
| 145 |
return None
|
| 146 |
|
| 147 |
+
tts = get_tts_pipeline(tts_model_id)
|
| 148 |
out = tts(text)
|
| 149 |
|
| 150 |
# rohes Audio from MMS (float32 [-1, 1])
|
|
|
|
| 180 |
|
| 181 |
# Rückgabe: (sr, np.int16 array)
|
| 182 |
return (sr, audio_int16)
|
|
|