Mmeslay-with-LibreTranslate

Sleeping

App Files Files Community

boffire committed 14 days ago

Commit

f3276ae

verified ·

1 Parent(s): c0117a0

Update src/gradio_app.py

Browse files

Files changed (1) hide show

src/gradio_app.py +49 -42

src/gradio_app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import gradio as gr
 import os
 import requests
 import soundfile as sf
 import tempfile
@@ -9,6 +10,11 @@ import random
 import difflib
 from spylls.hunspell import Dictionary
 # --- Configuration ---
 MAX_SIZE_MB = "50"
 MAX_SECONDS = 60
@@ -22,10 +28,10 @@ DATASET_API_TREE_URL = f"https://huggingface.co/api/datasets/{DATASET_REPO}/tree
 # --- Hunspell Dictionary Configuration ---
 DICT_DIR = os.path.join(os.path.dirname(__file__), "dicts")
-DICT_BASE_PATH = os.path.join(DICT_DIR, "kab")  # spylls attend le chemin de base sans extension
-# Caractères de ponctuation à stripper (définis avec des escapes Unicode pour éviter les problèmes d'encodage)
-PUNCTUATION_CHARS = '.,!?;:"\'()[]{}«»—–-'
 _hunspell_dict = None
@@ -37,45 +43,51 @@ def get_hunspell():
         dic_path = DICT_BASE_PATH + ".dic"
         if not os.path.exists(aff_path) or not os.path.exists(dic_path):
             raise FileNotFoundError(
-                f"Dictionnaire Hunspell kabyle non trouvé.\n"
                 f"Attendu: {aff_path} et {dic_path}\n"
                 f"Veuillez uploader les fichiers kab.aff et kab.dic dans le dossier 'dicts/' de votre Space."
             )
-        # spylls: from_files() prend un seul argument (chemin de base sans extension)
         _hunspell_dict = Dictionary.from_files(DICT_BASE_PATH)
     return _hunspell_dict
 def correct_word(word: str) -> str:
     """Corrige un mot unique avec Hunspell. Retourne le mot original s'il est correct ou sans suggestion fiable."""
     dic = get_hunspell()
-    # Nettoyage: séparer ponctuation
     stripped = word.strip(PUNCTUATION_CHARS).lower()
     if not stripped:
         return word
-    # Vérifier si le mot est valide
-    if dic.lookup(stripped):
-        return word  # mot correct, on garde la forme originale
-    # Récupérer les suggestions
-    suggestions = list(dic.suggest(stripped))
     if not suggestions:
-        return word  # pas de suggestion, on garde l'erreur ASR
-    # Choisir la meilleure suggestion par similarité (Levenshtein-like via SequenceMatcher)
     best = max(suggestions, key=lambda s: difflib.SequenceMatcher(None, stripped, s).ratio())
-    # Vérifier que la suggestion est suffisamment proche (éviter les substitutions totalement différentes)
     similarity = difflib.SequenceMatcher(None, stripped, best).ratio()
     if similarity < 0.5:
-        return word  # suggestion trop éloignée, on garde l'original
-    # Restaurer la casse originale
     if word[0].isupper():
         best = best[0].upper() + best[1:]
-    # Restaurer la ponctuation attachée
     prefix_len = len(word) - len(word.lstrip(PUNCTUATION_CHARS))
     suffix_len = len(word) - len(word.rstrip(PUNCTUATION_CHARS))
     prefix = word[:prefix_len]
@@ -85,13 +97,12 @@ def correct_word(word: str) -> str:
 def spellcheck_transcript(text: str, auto_correct: bool = True) -> tuple[str, list[dict]]:
     """
-    Vérifie la transcription mot par mot avec Hunspell.
-    Retourne: (texte_corrigé, liste_des_corrections_appliquées)
     """
     if not text or any(symbol in text for symbol in ["⚠️", "❌"]):
         return text, []
-    # Tokenisation simple: séparer par espaces tout en préservant la ponctuation attachée
     words = text.split()
     corrected_words = []
     corrections = []
@@ -100,10 +111,9 @@ def spellcheck_transcript(text: str, auto_correct: bool = True) -> tuple[str, li
         if auto_correct:
             corrected = correct_word(word)
         else:
-            # Mode suggestion seule: on ne corrige pas, on signale juste
             dic = get_hunspell()
             stripped = word.strip(PUNCTUATION_CHARS).lower()
-            corrected = word if (not stripped or dic.lookup(stripped)) else word + " [?]"
         corrected_words.append(corrected)
@@ -120,7 +130,6 @@ def spellcheck_transcript(text: str, auto_correct: bool = True) -> tuple[str, li
 def translate_to_english(text):
     if not text or any(symbol in text for symbol in ["⚠️", "❌"]):
         return ""
-    # Nettoyer le texte des marqueurs de correction avant traduction
     clean_text = regex.sub(r"\s*\[\?\]", "", text)
     payload = {
         'q': clean_text,
@@ -197,31 +206,30 @@ def format_transcript(text: str) -> str:
 def process_audio(audio_file, apply_spellcheck=True):
     """Handles validation -> Transcription -> Spellcheck -> Translation."""
     if audio_file is None or (isinstance(audio_file, str) and audio_file.strip() == ""):
-        return "⚠️ Please upload an audio file first.", "", ""
     if isinstance(audio_file, str):
         try:
             info = sf.info(audio_file)
             if info.duration > MAX_SECONDS:
-                return f"❌ Audio too long ({info.duration:.1f}s). Max is {MAX_SECONDS}s.", "", ""
         except Exception as e:
-            return f"❌ Error reading audio info: {str(e)}", "", ""
     try:
         from inference_file import inference
         transcript = inference(audio_file)
         transcript = format_transcript(transcript)
-        # --- NOUVEAU: Spellcheck ---
         spellchecked = transcript
         corrections = []
         if apply_spellcheck:
             try:
                 spellchecked, corrections = spellcheck_transcript(transcript, auto_correct=True)
             except FileNotFoundError:
-                spellchecked = transcript + "\n\n⚠️ Dictionnaire Hunspell non trouvé — correction orthographique désactivée."
             except Exception as e:
-                spellchecked = transcript + f"\n\n⚠️ Erreur Hunspell: {str(e)}"
         translation = translate_to_english(spellchecked)
@@ -238,7 +246,6 @@ def process_random_dataset():
     transcript, spellchecked, translation = process_audio(audio_path)
-    # Note: on ne supprime pas le fichier ici car Gradio en a besoin pour le lecteur audio
     return audio_path, transcript, spellchecked, translation
 # --- Build Gradio UI ---
@@ -263,7 +270,7 @@ with gr.Blocks(title="🎙️ Mmeslay") as demo:
                     format="mp3",
                 )
                 apply_sc = gr.Checkbox(
-                    label="✅ Activer la correction orthographique (Hunspell kabyle)",
                     value=True,
                     info="Corrige automatiquement les mots non reconnus par le dictionnaire kabyle"
                 )
@@ -272,14 +279,14 @@ with gr.Blocks(title="🎙️ Mmeslay") as demo:
             with gr.Column(scale=2):
                 with gr.Row():
                     text_output_raw = gr.Textbox(
-                        label="📝 Transcription brute (ASR)",
                         lines=3,
-                        info="Sortie directe du modèle de reconnaissance vocale"
                     )
                     text_output_checked = gr.Textbox(
-                        label="✅ Transcription corrigée (Hunspell)",
                         lines=3,
-                        info="Transcription après correction orthographique automatique"
                     )
                 translation_output_1 = gr.Textbox(
                     label="LibreTranslate (English)",
@@ -302,7 +309,7 @@ with gr.Blocks(title="🎙️ Mmeslay") as demo:
             inputs=audio_input,
         )
-    with gr.Tab("🎲 Random Dataset Sample"):
         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown(
@@ -310,14 +317,14 @@ with gr.Blocks(title="🎙️ Mmeslay") as demo:
                     Click the button below to fetch a **random audio sample** from the [Kabyle Synth Voice](https://huggingface.co/datasets/boffire/kabyle-synth-voice) dataset.
                     """
                 )
-                random_btn = gr.Button("🎲 Pick Random & Transcribe", variant="primary", size="lg")
                 dataset_status = gr.Textbox(label="Status", interactive=False, value="Ready")
             with gr.Column(scale=2):
                 random_audio_player = gr.Audio(label="🎵 Selected Sample", interactive=False, autoplay=False)
                 with gr.Row():
                     text_output_3_raw = gr.Textbox(label="Transcription brute (Kabyle)", lines=3)
-                    text_output_3_checked = gr.Textbox(label="Transcription corrigée (Hunspell)", lines=3)
                 translation_output_3 = gr.Textbox(
                     label="LibreTranslate (English)",
                     lines=3,
@@ -325,17 +332,17 @@ with gr.Blocks(title="🎙️ Mmeslay") as demo:
                 )
         def process_random_with_status():
-            yield "⏳ Fetching random sample...", None, "", "", ""
             try:
                 audio_path = download_random_dataset_sample()
             except Exception as e:
-                yield f"❌ Dataset Error: {str(e)}", None, "", "", ""
                 return
-            yield "⏳ Transcribing & spellchecking...", audio_path, "", "", ""
             transcript, spellchecked, translation = process_audio(audio_path)
-            yield "✅ Done!", audio_path, transcript, spellchecked, translation
         random_btn.click(
             fn=process_random_with_status,

 import gradio as gr
 import os
+import sys
 import requests
 import soundfile as sf
 import tempfile
 import difflib
 from spylls.hunspell import Dictionary
+# --- Fix: raise the recursion limit for spylls ---
+# Some Hunspell dictionaries have complex affix rules
+# that exceed Python's default recursion limit (1000)
+sys.setrecursionlimit(3000)
 # --- Configuration ---
 MAX_SIZE_MB = "50"
 MAX_SECONDS = 60
 # --- Hunspell Dictionary Configuration ---
 DICT_DIR = os.path.join(os.path.dirname(__file__), "dicts")
+DICT_BASE_PATH = os.path.join(DICT_DIR, "kab")
+# Punctuation characters to strip from word boundaries
+PUNCTUATION_CHARS = '.,!?;:\"\'()[]{}\u00ab\u00bb\u2014\u2013-'
 _hunspell_dict = None
         dic_path = DICT_BASE_PATH + ".dic"
         if not os.path.exists(aff_path) or not os.path.exists(dic_path):
             raise FileNotFoundError(
+                f"Dictionnaire Hunspell kabyle non trouve.\n"
                 f"Attendu: {aff_path} et {dic_path}\n"
                 f"Veuillez uploader les fichiers kab.aff et kab.dic dans le dossier 'dicts/' de votre Space."
             )
         _hunspell_dict = Dictionary.from_files(DICT_BASE_PATH)
     return _hunspell_dict
+def safe_lookup(dic, word: str) -> bool:
+    """Safe wrapper around dic.lookup() that handles RecursionError.
+
+    Returns whatever dic.lookup(word) returns. If the lookup raises
+    RecursionError, the error is swallowed and False is returned instead,
+    so callers see the word as "not found" rather than crashing.
+    """
+    try:
+        return dic.lookup(word)
+    except RecursionError:
+        # Recursion overflow is reported as a failed lookup, not propagated.
+        return False
+def safe_suggest(dic, word: str) -> list:
+    """Safe wrapper around dic.suggest() that handles RecursionError.
+
+    Materializes the result of dic.suggest(word) into a list. If a
+    RecursionError is raised while doing so, an empty list is returned,
+    so callers treat the word as having no suggestions.
+    """
+    try:
+        # list(...) is inside the try so an error raised while consuming
+        # the suggestions is caught as well.
+        return list(dic.suggest(word))
+    except RecursionError:
+        return []
 def correct_word(word: str) -> str:
     """Corrige un mot unique avec Hunspell. Retourne le mot original s'il est correct ou sans suggestion fiable."""
     dic = get_hunspell()
     stripped = word.strip(PUNCTUATION_CHARS).lower()
     if not stripped:
         return word
+    if safe_lookup(dic, stripped):
+        return word
+    suggestions = safe_suggest(dic, stripped)
     if not suggestions:
+        return word
     best = max(suggestions, key=lambda s: difflib.SequenceMatcher(None, stripped, s).ratio())
     similarity = difflib.SequenceMatcher(None, stripped, best).ratio()
     if similarity < 0.5:
+        return word
     if word[0].isupper():
         best = best[0].upper() + best[1:]
     prefix_len = len(word) - len(word.lstrip(PUNCTUATION_CHARS))
     suffix_len = len(word) - len(word.rstrip(PUNCTUATION_CHARS))
     prefix = word[:prefix_len]
 def spellcheck_transcript(text: str, auto_correct: bool = True) -> tuple[str, list[dict]]:
     """
+    Verifie la transcription mot par mot avec Hunspell.
+    Retourne: (texte_corrige, liste_des_corrections_appliquees)
     """
     if not text or any(symbol in text for symbol in ["⚠️", "❌"]):
         return text, []
     words = text.split()
     corrected_words = []
     corrections = []
         if auto_correct:
             corrected = correct_word(word)
         else:
             dic = get_hunspell()
             stripped = word.strip(PUNCTUATION_CHARS).lower()
+            corrected = word if (not stripped or safe_lookup(dic, stripped)) else word + " [?]"
         corrected_words.append(corrected)
 def translate_to_english(text):
     if not text or any(symbol in text for symbol in ["⚠️", "❌"]):
         return ""
     clean_text = regex.sub(r"\s*\[\?\]", "", text)
     payload = {
         'q': clean_text,
 def process_audio(audio_file, apply_spellcheck=True):
     """Handles validation -> Transcription -> Spellcheck -> Translation."""
     if audio_file is None or (isinstance(audio_file, str) and audio_file.strip() == ""):
+        return "Please upload an audio file first.", "", ""
     if isinstance(audio_file, str):
         try:
             info = sf.info(audio_file)
             if info.duration > MAX_SECONDS:
+                return f"Audio too long ({info.duration:.1f}s). Max is {MAX_SECONDS}s.", "", ""
         except Exception as e:
+            return f"Error reading audio info: {str(e)}", "", ""
     try:
         from inference_file import inference
         transcript = inference(audio_file)
         transcript = format_transcript(transcript)
         spellchecked = transcript
         corrections = []
         if apply_spellcheck:
             try:
                 spellchecked, corrections = spellcheck_transcript(transcript, auto_correct=True)
             except FileNotFoundError:
+                spellchecked = transcript + "\n\nDictionnaire Hunspell non trouve — correction orthographique desactivee."
             except Exception as e:
+                spellchecked = transcript + f"\n\nErreur Hunspell: {str(e)}"
         translation = translate_to_english(spellchecked)
     transcript, spellchecked, translation = process_audio(audio_path)
     return audio_path, transcript, spellchecked, translation
 # --- Build Gradio UI ---
                     format="mp3",
                 )
                 apply_sc = gr.Checkbox(
+                    label="Activer la correction orthographique (Hunspell kabyle)",
                     value=True,
                     info="Corrige automatiquement les mots non reconnus par le dictionnaire kabyle"
                 )
             with gr.Column(scale=2):
                 with gr.Row():
                     text_output_raw = gr.Textbox(
+                        label="Transcription brute (ASR)",
                         lines=3,
+                        info="Sortie directe du modele de reconnaissance vocale"
                     )
                     text_output_checked = gr.Textbox(
+                        label="Transcription corrigee (Hunspell)",
                         lines=3,
+                        info="Transcription apres correction orthographique automatique"
                     )
                 translation_output_1 = gr.Textbox(
                     label="LibreTranslate (English)",
             inputs=audio_input,
         )
+    with gr.Tab("Random Dataset Sample"):
         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown(
                     Click the button below to fetch a **random audio sample** from the [Kabyle Synth Voice](https://huggingface.co/datasets/boffire/kabyle-synth-voice) dataset.
                     """
                 )
+                random_btn = gr.Button("Pick Random & Transcribe", variant="primary", size="lg")
                 dataset_status = gr.Textbox(label="Status", interactive=False, value="Ready")
             with gr.Column(scale=2):
                 random_audio_player = gr.Audio(label="🎵 Selected Sample", interactive=False, autoplay=False)
                 with gr.Row():
                     text_output_3_raw = gr.Textbox(label="Transcription brute (Kabyle)", lines=3)
+                    text_output_3_checked = gr.Textbox(label="Transcription corrigee (Hunspell)", lines=3)
                 translation_output_3 = gr.Textbox(
                     label="LibreTranslate (English)",
                     lines=3,
                 )
         def process_random_with_status():
+            """Generator handler that reports progress while processing a random sample.
+
+            Each yield is a 5-tuple:
+            (status message, audio path or None, transcript, spellchecked, translation).
+            Intermediate yields carry empty strings for results not yet available.
+            """
+            # First update: nothing downloaded yet, so no audio path.
+            yield "Fetching random sample...", None, "", "", ""
             try:
                 audio_path = download_random_dataset_sample()
             except Exception as e:
+                # Surface the download failure in the status slot and stop.
+                yield f"Dataset Error: {str(e)}", None, "", "", ""
                 return
+            # Second update: audio path is known, text results still pending.
+            yield "Transcribing & spellchecking...", audio_path, "", "", ""
             transcript, spellchecked, translation = process_audio(audio_path)
+            yield "Done!", audio_path, transcript, spellchecked, translation
         random_btn.click(
             fn=process_random_with_status,