reduce buffer duration and adjust transcription parameters for lower latency
Browse files
- __pycache__/app.cpython-38.pyc +0 -0
- app.py +7 -6
__pycache__/app.cpython-38.pyc
CHANGED
|
Binary files a/__pycache__/app.cpython-38.pyc and b/__pycache__/app.cpython-38.pyc differ
|
|
|
app.py
CHANGED
|
@@ -43,7 +43,8 @@ def get_device_and_dtype():
|
|
| 43 |
# Globals / constants
|
| 44 |
# -------------------------
|
| 45 |
SAMPLE_RATE = 16000
|
| 46 |
-
|
|
|
|
| 47 |
|
| 48 |
# VAD (webrtcvad)
|
| 49 |
vad = webrtcvad.Vad(2) # aggressiveness 0-3 (tune as needed)
|
|
@@ -214,8 +215,8 @@ def stream_transcribe(audio, state):
|
|
| 214 |
buffer = buffer[-max_len:]
|
| 215 |
state["buffer"] = buffer
|
| 216 |
|
| 217 |
-
# If buffer too short, wait
|
| 218 |
-
if buffer.shape[0] < int(0.
|
| 219 |
return full_transcript, state
|
| 220 |
|
| 221 |
# --- VAD: find voiced segments and pick the latest one ---
|
|
@@ -231,7 +232,7 @@ def stream_transcribe(audio, state):
|
|
| 231 |
segment_audio = buffer[s:e]
|
| 232 |
|
| 233 |
# If segment too short skip
|
| 234 |
-
if len(segment_audio) < int(0.
|
| 235 |
return full_transcript, state
|
| 236 |
|
| 237 |
# Process ONLY the voiced segment
|
|
@@ -242,8 +243,8 @@ def stream_transcribe(audio, state):
|
|
| 242 |
with torch.no_grad():
|
| 243 |
predicted_ids = model.generate(
|
| 244 |
input_features,
|
| 245 |
-
max_new_tokens=
|
| 246 |
-
num_beams=
|
| 247 |
no_repeat_ngram_size=4, # block repeated loops
|
| 248 |
repetition_penalty=1.3, # punish repeated phrasing
|
| 249 |
length_penalty=0.7, # prefer shorter, factual outputs
|
|
|
|
| 43 |
# Globals / constants
|
| 44 |
# -------------------------
|
| 45 |
SAMPLE_RATE = 16000
|
| 46 |
+
# Keep a shorter rolling buffer to reduce model input length and latency.
|
| 47 |
+
BUFFER_DURATION = 4 # seconds
|
| 48 |
|
| 49 |
# VAD (webrtcvad)
|
| 50 |
vad = webrtcvad.Vad(2) # aggressiveness 0-3 (tune as needed)
|
|
|
|
| 215 |
buffer = buffer[-max_len:]
|
| 216 |
state["buffer"] = buffer
|
| 217 |
|
| 218 |
+
# If buffer too short, wait (very short context tends to be unstable)
|
| 219 |
+
if buffer.shape[0] < int(0.3 * SAMPLE_RATE):
|
| 220 |
return full_transcript, state
|
| 221 |
|
| 222 |
# --- VAD: find voiced segments and pick the latest one ---
|
|
|
|
| 232 |
segment_audio = buffer[s:e]
|
| 233 |
|
| 234 |
# If segment too short skip
|
| 235 |
+
if len(segment_audio) < int(0.15 * SAMPLE_RATE):
|
| 236 |
return full_transcript, state
|
| 237 |
|
| 238 |
# Process ONLY the voiced segment
|
|
|
|
| 243 |
with torch.no_grad():
|
| 244 |
predicted_ids = model.generate(
|
| 245 |
input_features,
|
| 246 |
+
max_new_tokens=64, # shorter prevents long continuations (lower latency)
|
| 247 |
+
num_beams=1, # single beam for faster decoding
|
| 248 |
no_repeat_ngram_size=4, # block repeated loops
|
| 249 |
repetition_penalty=1.3, # punish repeated phrasing
|
| 250 |
length_penalty=0.7, # prefer shorter, factual outputs
|