mic3333 committed on
Commit
8493d7f
·
1 Parent(s): 649d6e6

reduce buffer duration and adjust transcription parameters for lower latency

Browse files
Files changed (2) hide show
  1. __pycache__/app.cpython-38.pyc +0 -0
  2. app.py +7 -6
__pycache__/app.cpython-38.pyc CHANGED
Binary files a/__pycache__/app.cpython-38.pyc and b/__pycache__/app.cpython-38.pyc differ
 
app.py CHANGED
@@ -43,7 +43,8 @@ def get_device_and_dtype():
43
  # Globals / constants
44
  # -------------------------
45
  SAMPLE_RATE = 16000
46
- BUFFER_DURATION = 6 # seconds
 
47
 
48
  # VAD (webrtcvad)
49
  vad = webrtcvad.Vad(2) # aggressiveness 0-3 (tune as needed)
@@ -214,8 +215,8 @@ def stream_transcribe(audio, state):
214
  buffer = buffer[-max_len:]
215
  state["buffer"] = buffer
216
 
217
- # If buffer too short, wait
218
- if buffer.shape[0] < int(0.5 * SAMPLE_RATE):
219
  return full_transcript, state
220
 
221
  # --- VAD: find voiced segments and pick the latest one ---
@@ -231,7 +232,7 @@ def stream_transcribe(audio, state):
231
  segment_audio = buffer[s:e]
232
 
233
  # If segment too short skip
234
- if len(segment_audio) < int(0.25 * SAMPLE_RATE):
235
  return full_transcript, state
236
 
237
  # Process ONLY the voiced segment
@@ -242,8 +243,8 @@ def stream_transcribe(audio, state):
242
  with torch.no_grad():
243
  predicted_ids = model.generate(
244
  input_features,
245
- max_new_tokens=128, # shorter prevents hallucinated continuation
246
- num_beams=5, # stronger beam search for stability
247
  no_repeat_ngram_size=4, # block repeated loops
248
  repetition_penalty=1.3, # punish repeated phrasing
249
  length_penalty=0.7, # prefer shorter, factual outputs
 
43
  # Globals / constants
44
  # -------------------------
45
  SAMPLE_RATE = 16000
46
+ # Keep a shorter rolling buffer to reduce model input length and latency.
47
+ BUFFER_DURATION = 4 # seconds
48
 
49
  # VAD (webrtcvad)
50
  vad = webrtcvad.Vad(2) # aggressiveness 0-3 (tune as needed)
 
215
  buffer = buffer[-max_len:]
216
  state["buffer"] = buffer
217
 
218
+ # If buffer too short, wait (very short context tends to be unstable)
219
+ if buffer.shape[0] < int(0.3 * SAMPLE_RATE):
220
  return full_transcript, state
221
 
222
  # --- VAD: find voiced segments and pick the latest one ---
 
232
  segment_audio = buffer[s:e]
233
 
234
  # If segment too short skip
235
+ if len(segment_audio) < int(0.15 * SAMPLE_RATE):
236
  return full_transcript, state
237
 
238
  # Process ONLY the voiced segment
 
243
  with torch.no_grad():
244
  predicted_ids = model.generate(
245
  input_features,
246
+ max_new_tokens=64, # shorter prevents long continuations (lower latency)
247
+ num_beams=1, # single beam for faster decoding
248
  no_repeat_ngram_size=4, # block repeated loops
249
  repetition_penalty=1.3, # punish repeated phrasing
250
  length_penalty=0.7, # prefer shorter, factual outputs