boffire committed on
Commit
120f10a
·
verified ·
1 Parent(s): 5aac2d2

Update src/gradio_app.py

Browse files
Files changed (1) hide show
  1. src/gradio_app.py +157 -45
src/gradio_app.py CHANGED
@@ -6,6 +6,8 @@ import tempfile
6
  import re as regex
7
  import glob
8
  import random
 
 
9
 
10
  # --- Configuration ---
11
  MAX_SIZE_MB = "50"
@@ -18,12 +20,105 @@ DATASET_REPO = "boffire/kabyle-synth-voice"
18
  DATASET_AUDIO_BASE_URL = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/audio"
19
  DATASET_API_TREE_URL = f"https://huggingface.co/api/datasets/{DATASET_REPO}/tree/main/audio"
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # --- Translation Logic ---
22
  def translate_to_english(text):
23
  if not text or any(symbol in text for symbol in ["⚠️", "❌"]):
24
  return ""
 
 
25
  payload = {
26
- 'q': text,
27
  'source': 'kab',
28
  'target': 'en',
29
  'format': 'text',
@@ -50,7 +145,6 @@ def get_dataset_audio_files():
50
  resp = requests.get(DATASET_API_TREE_URL, timeout=15)
51
  resp.raise_for_status()
52
  items = resp.json()
53
- # Filter only .wav files and extract filenames
54
  files = [
55
  item["path"].replace("audio/", "")
56
  for item in items
@@ -72,7 +166,6 @@ def download_random_dataset_sample() -> str:
72
  tmp_dir = tempfile.gettempdir()
73
  local_path = os.path.join(tmp_dir, f"dataset_{filename}")
74
 
75
- # Download the file
76
  try:
77
  resp = requests.get(file_url, timeout=30, stream=True)
78
  resp.raise_for_status()
@@ -91,62 +184,67 @@ def format_transcript(text: str) -> str:
91
  text = text.strip()
92
  if not text:
93
  return text
94
- # Capitalize first letter
95
  text = text[0].upper() + text[1:]
96
- # Add trailing period if missing and last char is not already punctuation
97
  if text and text[-1] not in ".!?":
98
  text += "."
99
  return text
100
 
101
- def process_audio(audio_file):
102
- """Handles validation -> Transcription -> Translation."""
103
  if audio_file is None or (isinstance(audio_file, str) and audio_file.strip() == ""):
104
- return "⚠️ Please upload an audio file first.", ""
105
 
106
  if isinstance(audio_file, str):
107
  try:
108
  info = sf.info(audio_file)
109
  if info.duration > MAX_SECONDS:
110
- return f"❌ Audio too long ({info.duration:.1f}s). Max is {MAX_SECONDS}s.", ""
111
  except Exception as e:
112
- return f"❌ Error reading audio info: {str(e)}", ""
113
 
114
  try:
115
  from inference_file import inference
116
  transcript = inference(audio_file)
117
  transcript = format_transcript(transcript)
118
- translation = translate_to_english(transcript)
119
- return transcript, translation
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  except Exception as e:
121
- return f"❌ Error during processing: {str(e)}", ""
122
 
123
  def process_random_dataset():
124
  """Downloads a random sample from the dataset and runs ASR."""
125
  try:
126
  audio_path = download_random_dataset_sample()
127
  except Exception as e:
128
- return None, f"❌ Dataset Error: {str(e)}", ""
129
 
130
- transcript, translation = process_audio(audio_path)
131
-
132
- # Cleanup temp file
133
- try:
134
- if os.path.exists(audio_path):
135
- os.remove(audio_path)
136
- except Exception:
137
- pass
138
 
139
- return audio_path, transcript, translation
 
140
 
141
  # --- Build Gradio UI ---
142
  with gr.Blocks(title="🎙️ Mmeslay") as demo:
143
  gr.Markdown(
144
  """
145
  # 🎙️ Mmeslay by [G1ya777](https://github.com/G1ya777/Mmeslay)
146
- ### Kabyle ASR & Translation
147
- *Powered by Squeezeformer (ASR) and LibreTranslate (NMT)*
148
 
149
- Upload a Kabyle audio file, record directly, **or pick a random sample** from the Kabyle Synth Voice dataset to get a transcript and English translation.
150
  """
151
  )
152
 
@@ -159,20 +257,35 @@ with gr.Blocks(title="🎙️ Mmeslay") as demo:
159
  sources=["upload", "microphone"],
160
  format="mp3",
161
  )
162
- transcribe_btn = gr.Button("🚀 Transcribe & Translate", variant="primary", size="lg")
 
 
 
 
 
163
 
164
  with gr.Column(scale=2):
165
- text_output_1 = gr.Textbox(label="Transcription (Kabyle)", lines=5)
 
 
 
 
 
 
 
 
 
 
166
  translation_output_1 = gr.Textbox(
167
  label="LibreTranslate (English)",
168
- lines=5,
169
- placeholder="English LibreTranslate translation will appear here..."
170
  )
171
 
172
  transcribe_btn.click(
173
  fn=process_audio,
174
- inputs=audio_input,
175
- outputs=[text_output_1, translation_output_1],
176
  )
177
 
178
  gr.Examples(
@@ -197,34 +310,32 @@ with gr.Blocks(title="🎙️ Mmeslay") as demo:
197
 
198
  with gr.Column(scale=2):
199
  random_audio_player = gr.Audio(label="🎵 Selected Sample", interactive=False, autoplay=False)
200
- text_output_3 = gr.Textbox(label="Transcription (Kabyle)", lines=5)
 
 
201
  translation_output_3 = gr.Textbox(
202
  label="LibreTranslate (English)",
203
- lines=5,
204
  placeholder="English LibreTranslate translation will appear here..."
205
  )
206
 
207
  def process_random_with_status():
208
- # Update status
209
- yield "⏳ Fetching random sample...", None, "", ""
210
  try:
211
  audio_path = download_random_dataset_sample()
212
  except Exception as e:
213
- yield f"❌ Dataset Error: {str(e)}", None, "", ""
214
  return
215
 
216
- yield "⏳ Transcribing...", audio_path, "", ""
217
- transcript, translation = process_audio(audio_path)
218
 
219
- # Note: we keep audio_path in the output so the user can listen,
220
- # but we don't delete it here — Gradio needs the file to serve it.
221
- # It will be cleaned up by the OS temp cleanup eventually.
222
- yield "✅ Done!", audio_path, transcript, translation
223
 
224
  random_btn.click(
225
  fn=process_random_with_status,
226
  inputs=[],
227
- outputs=[dataset_status, random_audio_player, text_output_3, translation_output_3],
228
  )
229
 
230
  gr.Markdown(
@@ -232,6 +343,7 @@ with gr.Blocks(title="🎙️ Mmeslay") as demo:
232
  ---
233
  Developed by [G1ya777](https://github.com/G1ya777/Mmeslay).
234
  Examples from Tatoeba (CC BY licenses).
 
235
  """
236
  )
237
 
@@ -244,4 +356,4 @@ if __name__ == "__main__":
244
  server_port=port,
245
  max_file_size=f"{MAX_SIZE_MB}mb",
246
  theme=gr.themes.Soft(),
247
- )
 
6
  import re as regex
7
  import glob
8
  import random
9
+ import difflib
10
+ from spylls.hunspell import Dictionary
11
 
12
  # --- Configuration ---
13
  MAX_SIZE_MB = "50"
 
20
  DATASET_AUDIO_BASE_URL = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/audio"
21
  DATASET_API_TREE_URL = f"https://huggingface.co/api/datasets/{DATASET_REPO}/tree/main/audio"
22
 
23
# --- Hunspell Dictionary Configuration ---
DICT_DIR = os.path.join(os.path.dirname(__file__), "dicts")
AFF_PATH = os.path.join(DICT_DIR, "kab.aff")
DIC_PATH = os.path.join(DICT_DIR, "kab.dic")

# Cached dictionary instance; populated by the first get_hunspell() call.
_hunspell_dict = None


def get_hunspell():
    """Return the Kabyle Hunspell dictionary, loading it on first use.

    The loaded dictionary is cached in the module-level ``_hunspell_dict`` so
    the files are only read once.

    Returns:
        The ``spylls.hunspell.Dictionary`` built from ``kab.aff`` / ``kab.dic``.

    Raises:
        FileNotFoundError: if ``AFF_PATH`` or ``DIC_PATH`` does not exist.
    """
    global _hunspell_dict
    if _hunspell_dict is None:
        if not os.path.exists(AFF_PATH) or not os.path.exists(DIC_PATH):
            raise FileNotFoundError(
                f"Dictionnaire Hunspell kabyle non trouvé.\n"
                f"Attendu: {AFF_PATH} et {DIC_PATH}\n"
                f"Veuillez uploader les fichiers kab.aff et kab.dic dans le dossier 'dicts/' de votre Space."
            )
        # spylls' Dictionary.from_files takes ONE base path without extension
        # and appends ".aff" / ".dic" itself; passing both file paths raises
        # a TypeError.
        base_path = os.path.splitext(AFF_PATH)[0]
        _hunspell_dict = Dictionary.from_files(base_path)
    return _hunspell_dict
42
+
43
# Characters treated as punctuation attached to the edges of a token.
# The double quote must be escaped: writing it unescaped ends the literal
# early and leaves an unterminated string (SyntaxError).
_EDGE_PUNCTUATION = ".,!?;:\"'()[]{}«»—–-"


def correct_word(word: str, dic=None) -> str:
    """Correct a single token with Hunspell.

    The token is returned unchanged when it consists only of punctuation,
    when its core is found in the dictionary, when there is no suggestion,
    or when the best suggestion is too dissimilar (similarity < 0.5).

    Args:
        word: One whitespace-delimited token, possibly with leading/trailing
            punctuation attached.
        dic: Optional dictionary object exposing ``lookup(str)`` and
            ``suggest(str)``. Defaults to the shared dictionary returned by
            ``get_hunspell()``.

    Returns:
        The original token, or the best suggestion with the original edge
        punctuation and initial capitalisation restored.
    """
    if dic is None:
        dic = get_hunspell()

    # Separate the word core from punctuation attached at either edge.
    core = word.strip(_EDGE_PUNCTUATION)
    if not core:
        return word
    lowered = core.lower()

    # Known word: keep the original form untouched.
    if dic.lookup(lowered):
        return word

    # Score every suggestion once; max() keeps the first of equal scores.
    scored = [
        (difflib.SequenceMatcher(None, lowered, candidate).ratio(), candidate)
        for candidate in dic.suggest(lowered)
    ]
    if not scored:
        return word  # no suggestion: keep the ASR output as is

    similarity, best = max(scored, key=lambda pair: pair[0])
    # Reject suggestions that are too far from the original word.
    if similarity < 0.5 or not best:
        return word

    # Restore capitalisation based on the word core, not on word[0], which
    # may be an opening quote or bracket.
    if core[0].isupper():
        best = best[0].upper() + best[1:]

    # Re-attach the punctuation that surrounded the core.
    prefix_len = len(word) - len(word.lstrip(_EDGE_PUNCTUATION))
    prefix = word[:prefix_len]
    suffix = word[prefix_len + len(core):]
    return prefix + best + suffix
80
+
81
def spellcheck_transcript(text: str, auto_correct: bool = True) -> tuple[str, list[dict]]:
    """Spellcheck a transcript word by word with Hunspell.

    Args:
        text: The transcript. Empty text, or text containing one of the UI
            status markers ("⚠️", "❌"), is returned untouched.
        auto_correct: When True each token is passed through
            ``correct_word``. When False nothing is corrected; tokens whose
            core is not found in the dictionary are flagged by appending
            " [?]".

    Returns:
        A ``(checked_text, corrections)`` tuple. ``corrections`` holds one
        dict per changed token with the keys "position", "original" and
        "suggestion". Tokens are re-joined with single spaces.
    """
    if not text or any(symbol in text for symbol in ["⚠️", "❌"]):
        return text, []

    # The double quote is escaped; left unescaped it terminates the literal
    # early and the module fails to parse.
    edge_punctuation = ".,!?;:\"'()[]{}«»—–-"

    # Flag-only mode needs the dictionary directly; fetch it at most once,
    # and only when there is at least one token to check.
    dic = None

    corrected_words = []
    corrections = []

    # Simple tokenisation: split on whitespace, punctuation stays attached.
    for position, word in enumerate(text.split()):
        if auto_correct:
            corrected = correct_word(word)
        else:
            if dic is None:
                dic = get_hunspell()
            stripped = word.strip(edge_punctuation).lower()
            is_known = not stripped or dic.lookup(stripped)
            corrected = word if is_known else word + " [?]"

        corrected_words.append(corrected)

        if corrected != word:
            corrections.append({
                "position": position,
                "original": word,
                "suggestion": corrected,
            })

    return " ".join(corrected_words), corrections
113
+
114
  # --- Translation Logic ---
115
  def translate_to_english(text):
116
  if not text or any(symbol in text for symbol in ["⚠️", "❌"]):
117
  return ""
118
+ # Nettoyer le texte des marqueurs de correction avant traduction
119
+ clean_text = regex.sub(r"\s*\[\?\]", "", text)
120
  payload = {
121
+ 'q': clean_text,
122
  'source': 'kab',
123
  'target': 'en',
124
  'format': 'text',
 
145
  resp = requests.get(DATASET_API_TREE_URL, timeout=15)
146
  resp.raise_for_status()
147
  items = resp.json()
 
148
  files = [
149
  item["path"].replace("audio/", "")
150
  for item in items
 
166
  tmp_dir = tempfile.gettempdir()
167
  local_path = os.path.join(tmp_dir, f"dataset_{filename}")
168
 
 
169
  try:
170
  resp = requests.get(file_url, timeout=30, stream=True)
171
  resp.raise_for_status()
 
184
  text = text.strip()
185
  if not text:
186
  return text
 
187
  text = text[0].upper() + text[1:]
 
188
  if text and text[-1] not in ".!?":
189
  text += "."
190
  return text
191
 
192
def process_audio(audio_file, apply_spellcheck=True):
    """Run validation -> transcription -> spellcheck -> translation.

    Args:
        audio_file: Path of the audio file to process (or None / blank when
            nothing was provided).
        apply_spellcheck: When True the formatted transcript is passed
            through ``spellcheck_transcript`` before translation.

    Returns:
        A ``(transcript, spellchecked, translation)`` tuple of strings. On a
        validation or processing error the first element carries a
        "⚠️"/"❌" message and the other two are empty.
    """
    if audio_file is None or (isinstance(audio_file, str) and audio_file.strip() == ""):
        return "⚠️ Please upload an audio file first.", "", ""

    if isinstance(audio_file, str):
        try:
            info = sf.info(audio_file)
            if info.duration > MAX_SECONDS:
                return f"❌ Audio too long ({info.duration:.1f}s). Max is {MAX_SECONDS}s.", "", ""
        except Exception as e:
            return f"❌ Error reading audio info: {str(e)}", "", ""

    try:
        from inference_file import inference
        transcript = inference(audio_file)
        transcript = format_transcript(transcript)

        # --- Spellcheck ---
        spellchecked = transcript
        # Text actually sent to the translator. It must stay free of the
        # "⚠️" warning shown in the spellcheck box: translate_to_english
        # returns "" for any text containing that marker, so translating the
        # annotated text would silently drop the translation.
        text_to_translate = transcript
        if apply_spellcheck:
            try:
                spellchecked, _ = spellcheck_transcript(transcript, auto_correct=True)
                text_to_translate = spellchecked
            except FileNotFoundError:
                spellchecked = transcript + "\n\n⚠️ Dictionnaire Hunspell non trouvé — correction orthographique désactivée."
            except Exception as e:
                spellchecked = transcript + f"\n\n⚠️ Erreur Hunspell: {str(e)}"

        translation = translate_to_english(text_to_translate)

        return transcript, spellchecked, translation
    except Exception as e:
        return f"❌ Error during processing: {str(e)}", "", ""
226
 
227
  def process_random_dataset():
228
  """Downloads a random sample from the dataset and runs ASR."""
229
  try:
230
  audio_path = download_random_dataset_sample()
231
  except Exception as e:
232
+ return None, f"❌ Dataset Error: {str(e)}", "", ""
233
 
234
+ transcript, spellchecked, translation = process_audio(audio_path)
 
 
 
 
 
 
 
235
 
236
+ # Note: on ne supprime pas le fichier ici car Gradio en a besoin pour le lecteur audio
237
+ return audio_path, transcript, spellchecked, translation
238
 
239
  # --- Build Gradio UI ---
240
  with gr.Blocks(title="🎙️ Mmeslay") as demo:
241
  gr.Markdown(
242
  """
243
  # 🎙️ Mmeslay by [G1ya777](https://github.com/G1ya777/Mmeslay)
244
+ ### Kabyle ASR, Spellcheck & Translation
245
+ *Powered by Squeezeformer (ASR), Hunspell (Spellcheck) and LibreTranslate (NMT)*
246
 
247
+ Upload a Kabyle audio file, record directly, **or pick a random sample** from the Kabyle Synth Voice dataset to get a transcript, a spellchecked version, and an English translation.
248
  """
249
  )
250
 
 
257
  sources=["upload", "microphone"],
258
  format="mp3",
259
  )
260
+ apply_sc = gr.Checkbox(
261
+ label="✅ Activer la correction orthographique (Hunspell kabyle)",
262
+ value=True,
263
+ info="Corrige automatiquement les mots non reconnus par le dictionnaire kabyle"
264
+ )
265
+ transcribe_btn = gr.Button("🚀 Transcribe, Spellcheck & Translate", variant="primary", size="lg")
266
 
267
  with gr.Column(scale=2):
268
+ with gr.Row():
269
+ text_output_raw = gr.Textbox(
270
+ label="📝 Transcription brute (ASR)",
271
+ lines=3,
272
+ info="Sortie directe du modèle de reconnaissance vocale"
273
+ )
274
+ text_output_checked = gr.Textbox(
275
+ label="✅ Transcription corrigée (Hunspell)",
276
+ lines=3,
277
+ info="Transcription après correction orthographique automatique"
278
+ )
279
  translation_output_1 = gr.Textbox(
280
  label="LibreTranslate (English)",
281
+ lines=3,
282
+ placeholder="English translation will appear here..."
283
  )
284
 
285
  transcribe_btn.click(
286
  fn=process_audio,
287
+ inputs=[audio_input, apply_sc],
288
+ outputs=[text_output_raw, text_output_checked, translation_output_1],
289
  )
290
 
291
  gr.Examples(
 
310
 
311
  with gr.Column(scale=2):
312
  random_audio_player = gr.Audio(label="🎵 Selected Sample", interactive=False, autoplay=False)
313
+ with gr.Row():
314
+ text_output_3_raw = gr.Textbox(label="Transcription brute (Kabyle)", lines=3)
315
+ text_output_3_checked = gr.Textbox(label="Transcription corrigée (Hunspell)", lines=3)
316
  translation_output_3 = gr.Textbox(
317
  label="LibreTranslate (English)",
318
+ lines=3,
319
  placeholder="English LibreTranslate translation will appear here..."
320
  )
321
 
322
  def process_random_with_status():
323
+ yield "⏳ Fetching random sample...", None, "", "", ""
 
324
  try:
325
  audio_path = download_random_dataset_sample()
326
  except Exception as e:
327
+ yield f"❌ Dataset Error: {str(e)}", None, "", "", ""
328
  return
329
 
330
+ yield "⏳ Transcribing & spellchecking...", audio_path, "", "", ""
331
+ transcript, spellchecked, translation = process_audio(audio_path)
332
 
333
+ yield "✅ Done!", audio_path, transcript, spellchecked, translation
 
 
 
334
 
335
  random_btn.click(
336
  fn=process_random_with_status,
337
  inputs=[],
338
+ outputs=[dataset_status, random_audio_player, text_output_3_raw, text_output_3_checked, translation_output_3],
339
  )
340
 
341
  gr.Markdown(
 
343
  ---
344
  Developed by [G1ya777](https://github.com/G1ya777/Mmeslay).
345
  Examples from Tatoeba (CC BY licenses).
346
+ Spellcheck powered by Hunspell kabyle dictionary.
347
  """
348
  )
349
 
 
356
  server_port=port,
357
  max_file_size=f"{MAX_SIZE_MB}mb",
358
  theme=gr.themes.Soft(),
359
+ )