boffire committed on
Commit
f3276ae
·
verified ·
1 Parent(s): c0117a0

Update src/gradio_app.py

Browse files
Files changed (1) hide show
  1. src/gradio_app.py +49 -42
src/gradio_app.py CHANGED
@@ -1,5 +1,6 @@
1
  import gradio as gr
2
  import os
 
3
  import requests
4
  import soundfile as sf
5
  import tempfile
@@ -9,6 +10,11 @@ import random
9
  import difflib
10
  from spylls.hunspell import Dictionary
11
 
 
 
 
 
 
12
  # --- Configuration ---
13
  MAX_SIZE_MB = "50"
14
  MAX_SECONDS = 60
@@ -22,10 +28,10 @@ DATASET_API_TREE_URL = f"https://huggingface.co/api/datasets/{DATASET_REPO}/tree
22
 
23
  # --- Hunspell Dictionary Configuration ---
24
  DICT_DIR = os.path.join(os.path.dirname(__file__), "dicts")
25
- DICT_BASE_PATH = os.path.join(DICT_DIR, "kab") # spylls attend le chemin de base sans extension
26
 
27
- # Caractères de ponctuation à stripper (définis avec des escapes Unicode pour éviter les problèmes d'encodage)
28
- PUNCTUATION_CHARS = '.,!?;:"\'()[]{}«»—–-'
29
 
30
  _hunspell_dict = None
31
 
@@ -37,45 +43,51 @@ def get_hunspell():
37
  dic_path = DICT_BASE_PATH + ".dic"
38
  if not os.path.exists(aff_path) or not os.path.exists(dic_path):
39
  raise FileNotFoundError(
40
- f"Dictionnaire Hunspell kabyle non trouvé.\n"
41
  f"Attendu: {aff_path} et {dic_path}\n"
42
  f"Veuillez uploader les fichiers kab.aff et kab.dic dans le dossier 'dicts/' de votre Space."
43
  )
44
- # spylls: from_files() prend un seul argument (chemin de base sans extension)
45
  _hunspell_dict = Dictionary.from_files(DICT_BASE_PATH)
46
  return _hunspell_dict
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def correct_word(word: str) -> str:
49
  """Corrige un mot unique avec Hunspell. Retourne le mot original s'il est correct ou sans suggestion fiable."""
50
  dic = get_hunspell()
51
 
52
- # Nettoyage: séparer ponctuation
53
  stripped = word.strip(PUNCTUATION_CHARS).lower()
54
  if not stripped:
55
  return word
56
 
57
- # Vérifier si le mot est valide
58
- if dic.lookup(stripped):
59
- return word # mot correct, on garde la forme originale
60
 
61
- # Récupérer les suggestions
62
- suggestions = list(dic.suggest(stripped))
63
  if not suggestions:
64
- return word # pas de suggestion, on garde l'erreur ASR
65
 
66
- # Choisir la meilleure suggestion par similarité (Levenshtein-like via SequenceMatcher)
67
  best = max(suggestions, key=lambda s: difflib.SequenceMatcher(None, stripped, s).ratio())
68
 
69
- # Vérifier que la suggestion est suffisamment proche (éviter les substitutions totalement différentes)
70
  similarity = difflib.SequenceMatcher(None, stripped, best).ratio()
71
  if similarity < 0.5:
72
- return word # suggestion trop éloignée, on garde l'original
73
 
74
- # Restaurer la casse originale
75
  if word[0].isupper():
76
  best = best[0].upper() + best[1:]
77
 
78
- # Restaurer la ponctuation attachée
79
  prefix_len = len(word) - len(word.lstrip(PUNCTUATION_CHARS))
80
  suffix_len = len(word) - len(word.rstrip(PUNCTUATION_CHARS))
81
  prefix = word[:prefix_len]
@@ -85,13 +97,12 @@ def correct_word(word: str) -> str:
85
 
86
  def spellcheck_transcript(text: str, auto_correct: bool = True) -> tuple[str, list[dict]]:
87
  """
88
- Vérifie la transcription mot par mot avec Hunspell.
89
- Retourne: (texte_corrigé, liste_des_corrections_appliquées)
90
  """
91
  if not text or any(symbol in text for symbol in ["⚠️", "❌"]):
92
  return text, []
93
 
94
- # Tokenisation simple: séparer par espaces tout en préservant la ponctuation attachée
95
  words = text.split()
96
  corrected_words = []
97
  corrections = []
@@ -100,10 +111,9 @@ def spellcheck_transcript(text: str, auto_correct: bool = True) -> tuple[str, li
100
  if auto_correct:
101
  corrected = correct_word(word)
102
  else:
103
- # Mode suggestion seule: on ne corrige pas, on signale juste
104
  dic = get_hunspell()
105
  stripped = word.strip(PUNCTUATION_CHARS).lower()
106
- corrected = word if (not stripped or dic.lookup(stripped)) else word + " [?]"
107
 
108
  corrected_words.append(corrected)
109
 
@@ -120,7 +130,6 @@ def spellcheck_transcript(text: str, auto_correct: bool = True) -> tuple[str, li
120
  def translate_to_english(text):
121
  if not text or any(symbol in text for symbol in ["⚠️", "❌"]):
122
  return ""
123
- # Nettoyer le texte des marqueurs de correction avant traduction
124
  clean_text = regex.sub(r"\s*\[\?\]", "", text)
125
  payload = {
126
  'q': clean_text,
@@ -197,31 +206,30 @@ def format_transcript(text: str) -> str:
197
  def process_audio(audio_file, apply_spellcheck=True):
198
  """Handles validation -> Transcription -> Spellcheck -> Translation."""
199
  if audio_file is None or (isinstance(audio_file, str) and audio_file.strip() == ""):
200
- return "⚠️ Please upload an audio file first.", "", ""
201
 
202
  if isinstance(audio_file, str):
203
  try:
204
  info = sf.info(audio_file)
205
  if info.duration > MAX_SECONDS:
206
- return f"Audio too long ({info.duration:.1f}s). Max is {MAX_SECONDS}s.", "", ""
207
  except Exception as e:
208
- return f"Error reading audio info: {str(e)}", "", ""
209
 
210
  try:
211
  from inference_file import inference
212
  transcript = inference(audio_file)
213
  transcript = format_transcript(transcript)
214
 
215
- # --- NOUVEAU: Spellcheck ---
216
  spellchecked = transcript
217
  corrections = []
218
  if apply_spellcheck:
219
  try:
220
  spellchecked, corrections = spellcheck_transcript(transcript, auto_correct=True)
221
  except FileNotFoundError:
222
- spellchecked = transcript + "\n\n⚠️ Dictionnaire Hunspell non trouvé — correction orthographique désactivée."
223
  except Exception as e:
224
- spellchecked = transcript + f"\n\n⚠️ Erreur Hunspell: {str(e)}"
225
 
226
  translation = translate_to_english(spellchecked)
227
 
@@ -238,7 +246,6 @@ def process_random_dataset():
238
 
239
  transcript, spellchecked, translation = process_audio(audio_path)
240
 
241
- # Note: on ne supprime pas le fichier ici car Gradio en a besoin pour le lecteur audio
242
  return audio_path, transcript, spellchecked, translation
243
 
244
  # --- Build Gradio UI ---
@@ -263,7 +270,7 @@ with gr.Blocks(title="🎙️ Mmeslay") as demo:
263
  format="mp3",
264
  )
265
  apply_sc = gr.Checkbox(
266
- label="Activer la correction orthographique (Hunspell kabyle)",
267
  value=True,
268
  info="Corrige automatiquement les mots non reconnus par le dictionnaire kabyle"
269
  )
@@ -272,14 +279,14 @@ with gr.Blocks(title="🎙️ Mmeslay") as demo:
272
  with gr.Column(scale=2):
273
  with gr.Row():
274
  text_output_raw = gr.Textbox(
275
- label="📝 Transcription brute (ASR)",
276
  lines=3,
277
- info="Sortie directe du modèle de reconnaissance vocale"
278
  )
279
  text_output_checked = gr.Textbox(
280
- label="Transcription corrigée (Hunspell)",
281
  lines=3,
282
- info="Transcription après correction orthographique automatique"
283
  )
284
  translation_output_1 = gr.Textbox(
285
  label="LibreTranslate (English)",
@@ -302,7 +309,7 @@ with gr.Blocks(title="🎙️ Mmeslay") as demo:
302
  inputs=audio_input,
303
  )
304
 
305
- with gr.Tab("🎲 Random Dataset Sample"):
306
  with gr.Row():
307
  with gr.Column(scale=1):
308
  gr.Markdown(
@@ -310,14 +317,14 @@ with gr.Blocks(title="🎙️ Mmeslay") as demo:
310
  Click the button below to fetch a **random audio sample** from the [Kabyle Synth Voice](https://huggingface.co/datasets/boffire/kabyle-synth-voice) dataset.
311
  """
312
  )
313
- random_btn = gr.Button("🎲 Pick Random & Transcribe", variant="primary", size="lg")
314
  dataset_status = gr.Textbox(label="Status", interactive=False, value="Ready")
315
 
316
  with gr.Column(scale=2):
317
  random_audio_player = gr.Audio(label="🎵 Selected Sample", interactive=False, autoplay=False)
318
  with gr.Row():
319
  text_output_3_raw = gr.Textbox(label="Transcription brute (Kabyle)", lines=3)
320
- text_output_3_checked = gr.Textbox(label="Transcription corrigée (Hunspell)", lines=3)
321
  translation_output_3 = gr.Textbox(
322
  label="LibreTranslate (English)",
323
  lines=3,
@@ -325,17 +332,17 @@ with gr.Blocks(title="🎙️ Mmeslay") as demo:
325
  )
326
 
327
  def process_random_with_status():
328
- yield "Fetching random sample...", None, "", "", ""
329
  try:
330
  audio_path = download_random_dataset_sample()
331
  except Exception as e:
332
- yield f"Dataset Error: {str(e)}", None, "", "", ""
333
  return
334
 
335
- yield "Transcribing & spellchecking...", audio_path, "", "", ""
336
  transcript, spellchecked, translation = process_audio(audio_path)
337
 
338
- yield "Done!", audio_path, transcript, spellchecked, translation
339
 
340
  random_btn.click(
341
  fn=process_random_with_status,
 
1
  import gradio as gr
2
  import os
3
+ import sys
4
  import requests
5
  import soundfile as sf
6
  import tempfile
 
10
  import difflib
11
  from spylls.hunspell import Dictionary
12
 
13
+ # --- Fix: augmenter la limite de recursion pour spylls ---
14
+ # Certains dictionnaires Hunspell ont des regles d'affixation complexes
15
+ # qui depassent la limite Python par defaut (1000)
16
+ sys.setrecursionlimit(3000)
17
+
18
  # --- Configuration ---
19
  MAX_SIZE_MB = "50"
20
  MAX_SECONDS = 60
 
28
 
29
  # --- Hunspell Dictionary Configuration ---
30
  DICT_DIR = os.path.join(os.path.dirname(__file__), "dicts")
31
+ DICT_BASE_PATH = os.path.join(DICT_DIR, "kab")
32
 
33
+ # Caracteres de ponctuation a stripper
34
+ PUNCTUATION_CHARS = '.,!?;:\"\'()[]{}\u00ab\u00bb\u2014\u2013-'
35
 
36
  _hunspell_dict = None
37
 
 
43
  dic_path = DICT_BASE_PATH + ".dic"
44
  if not os.path.exists(aff_path) or not os.path.exists(dic_path):
45
  raise FileNotFoundError(
46
+ f"Dictionnaire Hunspell kabyle non trouve.\n"
47
  f"Attendu: {aff_path} et {dic_path}\n"
48
  f"Veuillez uploader les fichiers kab.aff et kab.dic dans le dossier 'dicts/' de votre Space."
49
  )
 
50
  _hunspell_dict = Dictionary.from_files(DICT_BASE_PATH)
51
  return _hunspell_dict
52
 
53
def safe_lookup(dic, word: str) -> bool:
    """Check whether ``word`` is accepted by the Hunspell dictionary.

    Wraps ``dic.lookup()`` so that a ``RecursionError`` raised during the
    lookup does not propagate to the caller.

    Args:
        dic: Object exposing a ``lookup(word)`` method (presumably the
            dictionary returned by ``get_hunspell()`` -- confirm at call sites).
        word: The word to check.

    Returns:
        ``True`` if the dictionary accepts the word; ``False`` if it rejects
        it or if the lookup exceeded the interpreter's recursion limit.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    import logging

    try:
        # Coerce so the declared ``bool`` return type always holds, whatever
        # truthy/falsy object the dictionary implementation hands back.
        return bool(dic.lookup(word))
    except RecursionError:
        # Same fallback as before (treat the word as unknown), but leave a
        # trace instead of failing silently.
        logging.getLogger(__name__).warning(
            "Hunspell lookup hit the recursion limit for %r; treating it as unknown",
            word,
        )
        return False
59
+
60
def safe_suggest(dic, word: str) -> list:
    """Collect spelling suggestions for ``word`` without propagating ``RecursionError``.

    Wraps ``dic.suggest()``. Suggestions are gathered one at a time so that,
    if the recursion limit is hit partway through iteration, the suggestions
    already produced are kept instead of being discarded.

    Args:
        dic: Object exposing a ``suggest(word)`` method that returns an
            iterable of candidate strings (presumably the dictionary returned
            by ``get_hunspell()`` -- confirm at call sites).
        word: The word to find suggestions for.

    Returns:
        A list of the suggestions obtained before any ``RecursionError``;
        empty if none were produced.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    import logging

    suggestions = []
    try:
        # Deliberately an explicit loop rather than ``list(...)``: a failure
        # in the middle of iteration must not throw away earlier results.
        for candidate in dic.suggest(word):
            suggestions.append(candidate)
    except RecursionError:
        logging.getLogger(__name__).warning(
            "Hunspell suggest hit the recursion limit for %r; keeping %d suggestion(s)",
            word,
            len(suggestions),
        )
    return suggestions
66
+
67
  def correct_word(word: str) -> str:
68
  """Corrige un mot unique avec Hunspell. Retourne le mot original s'il est correct ou sans suggestion fiable."""
69
  dic = get_hunspell()
70
 
 
71
  stripped = word.strip(PUNCTUATION_CHARS).lower()
72
  if not stripped:
73
  return word
74
 
75
+ if safe_lookup(dic, stripped):
76
+ return word
 
77
 
78
+ suggestions = safe_suggest(dic, stripped)
 
79
  if not suggestions:
80
+ return word
81
 
 
82
  best = max(suggestions, key=lambda s: difflib.SequenceMatcher(None, stripped, s).ratio())
83
 
 
84
  similarity = difflib.SequenceMatcher(None, stripped, best).ratio()
85
  if similarity < 0.5:
86
+ return word
87
 
 
88
  if word[0].isupper():
89
  best = best[0].upper() + best[1:]
90
 
 
91
  prefix_len = len(word) - len(word.lstrip(PUNCTUATION_CHARS))
92
  suffix_len = len(word) - len(word.rstrip(PUNCTUATION_CHARS))
93
  prefix = word[:prefix_len]
 
97
 
98
  def spellcheck_transcript(text: str, auto_correct: bool = True) -> tuple[str, list[dict]]:
99
  """
100
+ Verifie la transcription mot par mot avec Hunspell.
101
+ Retourne: (texte_corrige, liste_des_corrections_appliquees)
102
  """
103
  if not text or any(symbol in text for symbol in ["⚠️", "❌"]):
104
  return text, []
105
 
 
106
  words = text.split()
107
  corrected_words = []
108
  corrections = []
 
111
  if auto_correct:
112
  corrected = correct_word(word)
113
  else:
 
114
  dic = get_hunspell()
115
  stripped = word.strip(PUNCTUATION_CHARS).lower()
116
+ corrected = word if (not stripped or safe_lookup(dic, stripped)) else word + " [?]"
117
 
118
  corrected_words.append(corrected)
119
 
 
130
  def translate_to_english(text):
131
  if not text or any(symbol in text for symbol in ["⚠️", "❌"]):
132
  return ""
 
133
  clean_text = regex.sub(r"\s*\[\?\]", "", text)
134
  payload = {
135
  'q': clean_text,
 
206
  def process_audio(audio_file, apply_spellcheck=True):
207
  """Handles validation -> Transcription -> Spellcheck -> Translation."""
208
  if audio_file is None or (isinstance(audio_file, str) and audio_file.strip() == ""):
209
+ return "Please upload an audio file first.", "", ""
210
 
211
  if isinstance(audio_file, str):
212
  try:
213
  info = sf.info(audio_file)
214
  if info.duration > MAX_SECONDS:
215
+ return f"Audio too long ({info.duration:.1f}s). Max is {MAX_SECONDS}s.", "", ""
216
  except Exception as e:
217
+ return f"Error reading audio info: {str(e)}", "", ""
218
 
219
  try:
220
  from inference_file import inference
221
  transcript = inference(audio_file)
222
  transcript = format_transcript(transcript)
223
 
 
224
  spellchecked = transcript
225
  corrections = []
226
  if apply_spellcheck:
227
  try:
228
  spellchecked, corrections = spellcheck_transcript(transcript, auto_correct=True)
229
  except FileNotFoundError:
230
+ spellchecked = transcript + "\n\nDictionnaire Hunspell non trouve — correction orthographique desactivee."
231
  except Exception as e:
232
+ spellchecked = transcript + f"\n\nErreur Hunspell: {str(e)}"
233
 
234
  translation = translate_to_english(spellchecked)
235
 
 
246
 
247
  transcript, spellchecked, translation = process_audio(audio_path)
248
 
 
249
  return audio_path, transcript, spellchecked, translation
250
 
251
  # --- Build Gradio UI ---
 
270
  format="mp3",
271
  )
272
  apply_sc = gr.Checkbox(
273
+ label="Activer la correction orthographique (Hunspell kabyle)",
274
  value=True,
275
  info="Corrige automatiquement les mots non reconnus par le dictionnaire kabyle"
276
  )
 
279
  with gr.Column(scale=2):
280
  with gr.Row():
281
  text_output_raw = gr.Textbox(
282
+ label="Transcription brute (ASR)",
283
  lines=3,
284
+ info="Sortie directe du modele de reconnaissance vocale"
285
  )
286
  text_output_checked = gr.Textbox(
287
+ label="Transcription corrigee (Hunspell)",
288
  lines=3,
289
+ info="Transcription apres correction orthographique automatique"
290
  )
291
  translation_output_1 = gr.Textbox(
292
  label="LibreTranslate (English)",
 
309
  inputs=audio_input,
310
  )
311
 
312
+ with gr.Tab("Random Dataset Sample"):
313
  with gr.Row():
314
  with gr.Column(scale=1):
315
  gr.Markdown(
 
317
  Click the button below to fetch a **random audio sample** from the [Kabyle Synth Voice](https://huggingface.co/datasets/boffire/kabyle-synth-voice) dataset.
318
  """
319
  )
320
+ random_btn = gr.Button("Pick Random & Transcribe", variant="primary", size="lg")
321
  dataset_status = gr.Textbox(label="Status", interactive=False, value="Ready")
322
 
323
  with gr.Column(scale=2):
324
  random_audio_player = gr.Audio(label="🎵 Selected Sample", interactive=False, autoplay=False)
325
  with gr.Row():
326
  text_output_3_raw = gr.Textbox(label="Transcription brute (Kabyle)", lines=3)
327
+ text_output_3_checked = gr.Textbox(label="Transcription corrigee (Hunspell)", lines=3)
328
  translation_output_3 = gr.Textbox(
329
  label="LibreTranslate (English)",
330
  lines=3,
 
332
  )
333
 
334
  def process_random_with_status():
335
+ yield "Fetching random sample...", None, "", "", ""
336
  try:
337
  audio_path = download_random_dataset_sample()
338
  except Exception as e:
339
+ yield f"Dataset Error: {str(e)}", None, "", "", ""
340
  return
341
 
342
+ yield "Transcribing & spellchecking...", audio_path, "", "", ""
343
  transcript, spellchecked, translation = process_audio(audio_path)
344
 
345
+ yield "Done!", audio_path, transcript, spellchecked, translation
346
 
347
  random_btn.click(
348
  fn=process_random_with_status,