"""
realtime_server.py - Optional WebSocket server for real-time audio streaming
Run standalone: uvicorn realtime_server:app --host 0.0.0.0 --port 8000
"""


import os
import asyncio
import json
import base64
from typing import Optional
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse
import websockets
import sys
sys.dont_write_bytecode = True  # don't emit .pyc files when run as a script


# Both values come from the environment. The key defaults to "" so the
# server can start unconfigured; OpenAI calls will then fail at call time.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENAI_REALTIME_MODEL = os.getenv("OPENAI_REALTIME_MODEL", "gpt-4o-realtime-preview")


app = FastAPI()
|
|
| |
| html = """ |
| <!DOCTYPE html> |
| <html> |
| <head> |
| <title>Realtime Audio Test</title> |
| </head> |
| <body> |
| <h1>Realtime Audio Test</h1> |
| <button id="startBtn">Start Recording</button> |
| <button id="stopBtn" disabled>Stop Recording</button> |
| <div id="status">Status: Ready</div> |
| <div><label>Instructions: <input id="instructions" placeholder="Optional prompt" /></label></div> |
| <div id="transcript"></div> |
| <audio id="player" controls></audio> |
| |
| <script> |
| let mediaRecorder; |
| let audioChunks = []; |
| let ws; |
| |
| function ensureWS() { |
| if (ws && ws.readyState === WebSocket.OPEN) return ws; |
| const basePath = (location.pathname.endsWith('/') ? location.pathname.slice(0,-1) : location.pathname); |
| ws = new WebSocket((location.protocol === 'https:' ? 'wss://' : 'ws://') + location.host + basePath + '/ws'); |
| ws.onopen = () => { |
| document.getElementById('status').textContent = 'Status: WS connected'; |
| }; |
| ws.onmessage = (event) => { |
| try { |
| const msg = JSON.parse(event.data); |
| if (msg.type === 'transcript_delta') { |
| const el = document.getElementById('transcript'); |
| el.innerHTML = `<strong>Transcript:</strong> ${el.textContent}${msg.text}`; |
| } else if (msg.type === 'response_completed') { |
| if (msg.audio) { |
| const b64 = msg.audio; |
| const audioBlob = base64ToWavBlob(b64); |
| const url = URL.createObjectURL(audioBlob); |
| const player = document.getElementById('player'); |
| player.src = url; |
| player.play(); |
| } |
| document.getElementById('status').textContent = 'Status: Completed'; |
| } |
| } catch {} |
| }; |
| ws.onclose = () => { |
| document.getElementById('status').textContent = 'Status: WS closed'; |
| }; |
| return ws; |
| } |
| |
| function base64ToWavBlob(base64) { |
| const byteCharacters = atob(base64); |
| const byteNumbers = new Array(byteCharacters.length); |
| for (let i = 0; i < byteCharacters.length; i++) { |
| byteNumbers[i] = byteCharacters.charCodeAt(i); |
| } |
| const byteArray = new Uint8Array(byteNumbers); |
| return new Blob([byteArray], { type: 'audio/wav' }); |
| } |
| |
| document.getElementById('startBtn').onclick = async () => { |
| ensureWS(); |
| const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); |
| mediaRecorder = new MediaRecorder(stream); |
| |
| audioChunks = []; |
| document.getElementById('transcript').textContent = ''; |
| |
| mediaRecorder.ondataavailable = (event) => { |
| audioChunks.push(event.data); |
| }; |
| |
| mediaRecorder.onstop = async () => { |
| const audioBlob = new Blob(audioChunks, { type: 'audio/wav' }); |
| audioChunks = []; |
| const reader = new FileReader(); |
| reader.readAsDataURL(audioBlob); |
| reader.onloadend = () => { |
| const base64data = reader.result.split(',')[1]; |
| const instructions = document.getElementById('instructions').value || ''; |
| ws.send(JSON.stringify({ type: 'utterance', audio: base64data, instructions })); |
| document.getElementById('status').textContent = 'Status: Sending to OpenAI...'; |
| }; |
| }; |
| |
| mediaRecorder.start(); |
| document.getElementById('startBtn').disabled = true; |
| document.getElementById('stopBtn').disabled = false; |
| document.getElementById('status').textContent = 'Status: Recording...'; |
| }; |
| |
| document.getElementById('stopBtn').onclick = () => { |
| mediaRecorder.stop(); |
| document.getElementById('startBtn').disabled = false; |
| document.getElementById('stopBtn').disabled = true; |
| }; |
| </script> |
| </body> |
| </html> |
| """ |
|
|
@app.get("/")
async def get():
    """Serve the embedded browser test page for manual audio testing."""
    return HTMLResponse(content=html)
|
|
@app.post("/process-audio")
async def process_audio(request: dict):
    """Transcribe base64-encoded WAV audio with OpenAI Whisper.

    Expects ``request["audio"]`` to hold base64-encoded WAV bytes.
    Returns ``{"success": True, "transcript": <text>}`` on success, or
    ``{"success": False, "error": <message>}`` on any failure — the
    endpoint deliberately never raises (best-effort API contract).
    """
    temp_path = None
    try:
        audio_data = base64.b64decode(request.get("audio", ""))

        # Whisper's API takes a file object, so spill the bytes to disk.
        import tempfile
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            f.write(audio_data)
            temp_path = f.name

        # Imported lazily so the server can start without the openai package.
        from openai import OpenAI
        client = OpenAI(api_key=OPENAI_API_KEY)

        with open(temp_path, "rb") as audio_file:
            transcript = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                language="de",  # NOTE(review): hard-coded German — confirm intended
            )

        return {"success": True, "transcript": transcript.text}

    except Exception as e:
        return {"success": False, "error": str(e)}
    finally:
        # BUG FIX: the temp file previously leaked whenever any step after
        # its creation failed; clean it up on every path. (The inner
        # `import os` was redundant — os is imported at module level.)
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass
|
|
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """Bridge a browser WebSocket to the OpenAI Realtime API.

    Client -> server JSON messages:
      - {"type": "utterance", "audio": <base64 wav>, "instructions": <str>}
      - {"type": "ping"} -> answered with {"type": "pong"}
    Server -> client: streaming "audio_delta" / "transcript_delta" events,
    then one "response_completed" carrying the full transcript and the
    concatenated base64 audio chunks.
    """
    await websocket.accept()

    try:
        headers = {
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "OpenAI-Beta": "realtime=v1",
        }

        # NOTE(review): newer `websockets` releases renamed `extra_headers`
        # to `additional_headers` — confirm against the pinned version.
        async with websockets.connect(
            f"wss://api.openai.com/v1/realtime?model={OPENAI_REALTIME_MODEL}",
            extra_headers=headers
        ) as openai_ws:

            async def process_utterance(b64_wav: str, instructions: Optional[str] = None):
                """Send one utterance upstream and relay events until done."""
                # NOTE(review): the Realtime API documents `audio` as a plain
                # base64 string; verify this nested {"data", "format"} object
                # (and the event names matched below) against the current spec.
                await openai_ws.send(json.dumps({
                    "type": "input_audio_buffer.append",
                    "audio": {"data": b64_wav, "format": "wav"}
                }))

                await openai_ws.send(json.dumps({
                    "type": "input_audio_buffer.commit"
                }))

                await openai_ws.send(json.dumps({
                    "type": "response.create",
                    "response": {
                        "modalities": ["audio", "text"],
                        "instructions": instructions or ""
                    }
                }))

                audio_chunks = []
                transcript = ""

                # Relay upstream events to the browser until the response ends.
                while True:
                    msg = await openai_ws.recv()
                    try:
                        event = json.loads(msg)
                    except json.JSONDecodeError:
                        # BUG FIX: was a bare `except:` that also swallowed
                        # CancelledError; only skip malformed JSON frames.
                        continue

                    etype = event.get("type")
                    if etype == "response.audio.delta":
                        data = event.get("delta") or event.get("data")
                        if data:
                            audio_chunks.append(data)
                            await websocket.send_text(json.dumps({
                                "type": "audio_delta",
                                "data": data
                            }))
                    elif etype == "response.transcript.delta":
                        delta = event.get("delta", "")
                        transcript += delta
                        await websocket.send_text(json.dumps({
                            "type": "transcript_delta",
                            "text": delta
                        }))
                    elif etype == "response.completed":
                        await websocket.send_text(json.dumps({
                            "type": "response_completed",
                            "transcript": transcript,
                            "audio": "".join(audio_chunks)
                        }))
                        break

            # Main client loop: one upstream round-trip per "utterance".
            while True:
                try:
                    text = await websocket.receive_text()
                except WebSocketDisconnect:
                    break
                try:
                    msg = json.loads(text)
                except json.JSONDecodeError:
                    # BUG FIX: narrowed from a bare `except:`.
                    continue

                mtype = msg.get("type")
                if mtype == "utterance":
                    b64_wav = msg.get("audio", "")
                    instructions = msg.get("instructions", "")
                    if b64_wav:
                        await process_utterance(b64_wav, instructions)
                elif mtype == "ping":
                    await websocket.send_text(json.dumps({"type": "pong"}))

    except Exception as e:
        print(f"WebSocket error: {e}")
    finally:
        # BUG FIX: closing an already-disconnected socket raises, and the
        # old unguarded close() could mask the original error above.
        try:
            await websocket.close()
        except Exception:
            pass
|
|
if __name__ == "__main__":
    # Dev entry point; in production run `uvicorn realtime_server:app` directly.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
|