Mmeslay-with-LibreTranslate

Sleeping

App Files Files Community

boffire committed 14 days ago

Commit

120f10a

verified ·

1 Parent(s): 5aac2d2

Update src/gradio_app.py

Browse files

Files changed (1) hide show

src/gradio_app.py +157 -45

src/gradio_app.py CHANGED Viewed

@@ -6,6 +6,8 @@ import tempfile
 import re as regex
 import glob
 import random
 # --- Configuration ---
 MAX_SIZE_MB = "50"
@@ -18,12 +20,105 @@ DATASET_REPO = "boffire/kabyle-synth-voice"
 DATASET_AUDIO_BASE_URL = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/audio"
 DATASET_API_TREE_URL = f"https://huggingface.co/api/datasets/{DATASET_REPO}/tree/main/audio"
 # --- Translation Logic ---
 def translate_to_english(text):
     if not text or any(symbol in text for symbol in ["⚠️", "❌"]):
         return ""
     payload = {
-        'q': text,
         'source': 'kab',
         'target': 'en',
         'format': 'text',
@@ -50,7 +145,6 @@ def get_dataset_audio_files():
         resp = requests.get(DATASET_API_TREE_URL, timeout=15)
         resp.raise_for_status()
         items = resp.json()
-        # Filter only .wav files and extract filenames
         files = [
             item["path"].replace("audio/", "")
             for item in items
@@ -72,7 +166,6 @@ def download_random_dataset_sample() -> str:
     tmp_dir = tempfile.gettempdir()
     local_path = os.path.join(tmp_dir, f"dataset_{filename}")
-    # Download the file
     try:
         resp = requests.get(file_url, timeout=30, stream=True)
         resp.raise_for_status()
@@ -91,62 +184,67 @@ def format_transcript(text: str) -> str:
     text = text.strip()
     if not text:
         return text
-    # Capitalize first letter
     text = text[0].upper() + text[1:]
-    # Add trailing period if missing and last char is not already punctuation
     if text and text[-1] not in ".!?":
         text += "."
     return text
-def process_audio(audio_file):
-    """Handles validation -> Transcription -> Translation."""
     if audio_file is None or (isinstance(audio_file, str) and audio_file.strip() == ""):
-        return "⚠️ Please upload an audio file first.", ""
     if isinstance(audio_file, str):
         try:
             info = sf.info(audio_file)
             if info.duration > MAX_SECONDS:
-                return f"❌ Audio too long ({info.duration:.1f}s). Max is {MAX_SECONDS}s.", ""
         except Exception as e:
-            return f"❌ Error reading audio info: {str(e)}", ""
     try:
         from inference_file import inference
         transcript = inference(audio_file)
         transcript = format_transcript(transcript)
-        translation = translate_to_english(transcript)
-        return transcript, translation
     except Exception as e:
-        return f"❌ Error during processing: {str(e)}", ""
 def process_random_dataset():
     """Downloads a random sample from the dataset and runs ASR."""
     try:
         audio_path = download_random_dataset_sample()
     except Exception as e:
-        return None, f"❌ Dataset Error: {str(e)}", ""
-    transcript, translation = process_audio(audio_path)
-    # Cleanup temp file
-    try:
-        if os.path.exists(audio_path):
-            os.remove(audio_path)
-    except Exception:
-        pass
-    return audio_path, transcript, translation
 # --- Build Gradio UI ---
 with gr.Blocks(title="🎙️ Mmeslay") as demo:
     gr.Markdown(
         """
         # 🎙️ Mmeslay by [G1ya777](https://github.com/G1ya777/Mmeslay)
-        ### Kabyle ASR & Translation
-        *Powered by Squeezeformer (ASR) and LibreTranslate (NMT)*
-        Upload a Kabyle audio file, record directly, **or pick a random sample** from the Kabyle Synth Voice dataset to get a transcript and English translation.
         """
     )
@@ -159,20 +257,35 @@ with gr.Blocks(title="🎙️ Mmeslay") as demo:
                     sources=["upload", "microphone"],
                     format="mp3",
                 )
-                transcribe_btn = gr.Button("🚀 Transcribe & Translate", variant="primary", size="lg")
             with gr.Column(scale=2):
-                text_output_1 = gr.Textbox(label="Transcription (Kabyle)", lines=5)
                 translation_output_1 = gr.Textbox(
                     label="LibreTranslate (English)",
-                    lines=5,
-                    placeholder="English LibreTranslate translation will appear here..."
                 )
         transcribe_btn.click(
             fn=process_audio,
-            inputs=audio_input,
-            outputs=[text_output_1, translation_output_1],
         )
         gr.Examples(
@@ -197,34 +310,32 @@ with gr.Blocks(title="🎙️ Mmeslay") as demo:
             with gr.Column(scale=2):
                 random_audio_player = gr.Audio(label="🎵 Selected Sample", interactive=False, autoplay=False)
-                text_output_3 = gr.Textbox(label="Transcription (Kabyle)", lines=5)
                 translation_output_3 = gr.Textbox(
                     label="LibreTranslate (English)",
-                    lines=5,
                     placeholder="English LibreTranslate translation will appear here..."
                 )
         def process_random_with_status():
-            # Update status
-            yield "⏳ Fetching random sample...", None, "", ""
             try:
                 audio_path = download_random_dataset_sample()
             except Exception as e:
-                yield f"❌ Dataset Error: {str(e)}", None, "", ""
                 return
-            yield "⏳ Transcribing...", audio_path, "", ""
-            transcript, translation = process_audio(audio_path)
-            # Note: we keep audio_path in the output so the user can listen,
-            # but we don't delete it here — Gradio needs the file to serve it.
-            # It will be cleaned up by the OS temp cleanup eventually.
-            yield "✅ Done!", audio_path, transcript, translation
         random_btn.click(
             fn=process_random_with_status,
             inputs=[],
-            outputs=[dataset_status, random_audio_player, text_output_3, translation_output_3],
         )
     gr.Markdown(
@@ -232,6 +343,7 @@ with gr.Blocks(title="🎙️ Mmeslay") as demo:
         ---
         Developed by [G1ya777](https://github.com/G1ya777/Mmeslay).
         Examples from Tatoeba (CC BY licenses).
         """
     )
@@ -244,4 +356,4 @@ if __name__ == "__main__":
         server_port=port,
         max_file_size=f"{MAX_SIZE_MB}mb",
         theme=gr.themes.Soft(),
-    )

 import re as regex
 import glob
 import random
+import difflib
+from spylls.hunspell import Dictionary
 # --- Configuration ---
 MAX_SIZE_MB = "50"
 DATASET_AUDIO_BASE_URL = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/audio"
 DATASET_API_TREE_URL = f"https://huggingface.co/api/datasets/{DATASET_REPO}/tree/main/audio"
+# --- Hunspell Dictionary Configuration ---
+DICT_DIR = os.path.join(os.path.dirname(__file__), "dicts")
+AFF_PATH = os.path.join(DICT_DIR, "kab.aff")
+DIC_PATH = os.path.join(DICT_DIR, "kab.dic")
+_hunspell_dict = None
def get_hunspell():
    """Return the shared Kabyle Hunspell dictionary, loading it on first use.

    The loaded dictionary is cached in the module-level ``_hunspell_dict`` so
    the files are only parsed once per process.

    Returns:
        The ``spylls.hunspell.Dictionary`` built from ``kab.aff`` / ``kab.dic``.

    Raises:
        FileNotFoundError: If ``AFF_PATH`` or ``DIC_PATH`` does not exist.
    """
    global _hunspell_dict
    if _hunspell_dict is not None:
        return _hunspell_dict

    if not os.path.exists(AFF_PATH) or not os.path.exists(DIC_PATH):
        raise FileNotFoundError(
            f"Dictionnaire Hunspell kabyle non trouvé.\n"
            f"Attendu: {AFF_PATH} et {DIC_PATH}\n"
            f"Veuillez uploader les fichiers kab.aff et kab.dic dans le dossier 'dicts/' de votre Space."
        )

    # spylls' Dictionary.from_files() takes ONE extension-less base path and
    # appends ".aff" / ".dic" itself; passing both file paths raises TypeError.
    # NOTE: this relies on AFF_PATH and DIC_PATH sharing the same stem.
    base_path = os.path.splitext(AFF_PATH)[0]
    _hunspell_dict = Dictionary.from_files(base_path)
    return _hunspell_dict
def correct_word(word: str) -> str:
    """Correct a single token with Hunspell, keeping its punctuation and case.

    Args:
        word: One whitespace-delimited token, possibly wrapped in punctuation.

    Returns:
        The token unchanged when it is punctuation-only, already known to the
        dictionary, has no suggestion, or when the best suggestion is less
        than 50% similar to it. Otherwise the best suggestion, with the
        original leading/trailing punctuation and initial capital restored.

    Raises:
        FileNotFoundError: Propagated from ``get_hunspell()`` when the
            dictionary files are missing.
    """
    # The double quote must be escaped, otherwise the literal ends early and
    # the module does not even parse.
    punctuation = ".,!?;:\"'()[]{}«»—–-"

    core = word.strip(punctuation)
    if not core:
        # Nothing but punctuation: no need to load the dictionary at all.
        return word

    lowered = core.lower()
    dic = get_hunspell()

    # Accept the word as written as well as its lowercase form, so that
    # dictionary entries that are themselves capitalised are not "corrected".
    if dic.lookup(core) or dic.lookup(lowered):
        return word

    # Keep the suggestion most similar to the input (first one wins on ties).
    best = None
    best_ratio = 0.0
    for candidate in dic.suggest(lowered):
        ratio = difflib.SequenceMatcher(None, lowered, candidate).ratio()
        if best is None or ratio > best_ratio:
            best, best_ratio = candidate, ratio

    # No suggestion, or one too far from the ASR output to be trusted.
    if not best or best_ratio < 0.5:
        return word

    # Restore the capital based on the first *letter*, not on word[0], which
    # may be an opening quote or bracket.
    if core[0].isupper():
        best = best[0].upper() + best[1:]

    # Re-attach whatever punctuation surrounded the original token.
    prefix = word[:len(word) - len(word.lstrip(punctuation))]
    suffix = word[len(word.rstrip(punctuation)):]
    return prefix + best + suffix
def spellcheck_transcript(text: str, auto_correct: bool = True) -> tuple[str, list[dict]]:
    """Spellcheck a transcript word by word with Hunspell.

    Args:
        text: The transcript. Empty text, or text carrying one of the UI
            status markers ("⚠️", "❌"), is returned untouched.
        auto_correct: When True each word is replaced via ``correct_word``.
            When False words are left as-is and unknown ones get a trailing
            " [?]" marker instead.

    Returns:
        A pair ``(checked_text, corrections)``. ``checked_text`` is the words
        re-joined with single spaces; ``corrections`` holds one dict per
        changed word with the keys "position", "original" and "suggestion".

    Raises:
        FileNotFoundError: Propagated from ``get_hunspell()`` when the
            dictionary files are missing.
    """
    if not text or any(symbol in text for symbol in ["⚠️", "❌"]):
        return text, []

    # The double quote must be escaped, otherwise the literal ends early and
    # the module does not even parse.
    punctuation = ".,!?;:\"'()[]{}«»—–-"

    dic = None  # loaded lazily, only in flag-only mode and only if needed
    corrected_words = []
    corrections = []

    # Simple tokenisation: split on whitespace, punctuation stays attached.
    for position, word in enumerate(text.split()):
        if auto_correct:
            corrected = correct_word(word)
        else:
            # Flag-only mode: never rewrite the word, just mark unknown ones.
            if dic is None:
                dic = get_hunspell()
            core = word.strip(punctuation).lower()
            is_known = not core or dic.lookup(core)
            corrected = word if is_known else word + " [?]"

        corrected_words.append(corrected)
        if corrected != word:
            corrections.append({
                "position": position,
                "original": word,
                "suggestion": corrected,
            })

    return " ".join(corrected_words), corrections
 # --- Translation Logic ---
 def translate_to_english(text):
     if not text or any(symbol in text for symbol in ["⚠️", "❌"]):
         return ""
+    # Nettoyer le texte des marqueurs de correction avant traduction
+    clean_text = regex.sub(r"\s*\[\?\]", "", text)
     payload = {
+        'q': clean_text,
         'source': 'kab',
         'target': 'en',
         'format': 'text',
         resp = requests.get(DATASET_API_TREE_URL, timeout=15)
         resp.raise_for_status()
         items = resp.json()
         files = [
             item["path"].replace("audio/", "")
             for item in items
     tmp_dir = tempfile.gettempdir()
     local_path = os.path.join(tmp_dir, f"dataset_{filename}")
     try:
         resp = requests.get(file_url, timeout=30, stream=True)
         resp.raise_for_status()
     text = text.strip()
     if not text:
         return text
     text = text[0].upper() + text[1:]
     if text and text[-1] not in ".!?":
         text += "."
     return text
def process_audio(audio_file, apply_spellcheck=True):
    """Run validation -> transcription -> spellcheck -> translation.

    Args:
        audio_file: Path of the audio to process. ``None`` or a blank string
            means nothing was provided. Only string inputs go through the
            duration check.
        apply_spellcheck: When True the transcript is passed through
            ``spellcheck_transcript`` before being translated.

    Returns:
        A 3-tuple ``(raw_transcript, spellchecked_transcript, translation)``.
        On a validation or processing error the first element carries the
        error message and the other two are empty strings.
    """
    if audio_file is None or (isinstance(audio_file, str) and audio_file.strip() == ""):
        return "⚠️ Please upload an audio file first.", "", ""

    if isinstance(audio_file, str):
        try:
            info = sf.info(audio_file)
            if info.duration > MAX_SECONDS:
                return f"❌ Audio too long ({info.duration:.1f}s). Max is {MAX_SECONDS}s.", "", ""
        except Exception as e:
            return f"❌ Error reading audio info: {str(e)}", "", ""

    try:
        from inference_file import inference
        transcript = inference(audio_file)
        transcript = format_transcript(transcript)

        # What the user sees in the "corrected" box and what is sent to the
        # translator are tracked separately: a Hunspell warning appended to
        # the displayed text contains "⚠️", which translate_to_english()
        # treats as an error marker and answers with "". Translating the
        # clean text keeps the translation working when spellcheck fails.
        spellchecked = transcript
        text_to_translate = transcript
        if apply_spellcheck:
            try:
                spellchecked, _corrections = spellcheck_transcript(transcript, auto_correct=True)
                text_to_translate = spellchecked
            except FileNotFoundError:
                spellchecked = transcript + "\n\n⚠️ Dictionnaire Hunspell non trouvé — correction orthographique désactivée."
            except Exception as e:
                spellchecked = transcript + f"\n\n⚠️ Erreur Hunspell: {str(e)}"

        translation = translate_to_english(text_to_translate)
        return transcript, spellchecked, translation
    except Exception as e:
        return f"❌ Error during processing: {str(e)}", "", ""


def process_random_dataset():
    """Download a random dataset sample and run the full pipeline on it.

    Returns:
        A 4-tuple ``(audio_path, raw_transcript, spellchecked_transcript,
        translation)``. When the download fails ``audio_path`` is ``None``,
        the second element carries the error message and the rest are empty.
    """
    try:
        audio_path = download_random_dataset_sample()
    except Exception as e:
        return None, f"❌ Dataset Error: {str(e)}", "", ""

    transcript, spellchecked, translation = process_audio(audio_path)
    # The downloaded file is deliberately not deleted here: Gradio still needs
    # it on disk to serve the audio player.
    return audio_path, transcript, spellchecked, translation
 # --- Build Gradio UI ---
 with gr.Blocks(title="🎙️ Mmeslay") as demo:
     gr.Markdown(
         """
         # 🎙️ Mmeslay by [G1ya777](https://github.com/G1ya777/Mmeslay)
+        ### Kabyle ASR, Spellcheck & Translation
+        *Powered by Squeezeformer (ASR), Hunspell (Spellcheck) and LibreTranslate (NMT)*
+        Upload a Kabyle audio file, record directly, **or pick a random sample** from the Kabyle Synth Voice dataset to get a transcript, a spellchecked version, and an English translation.
         """
     )
                     sources=["upload", "microphone"],
                     format="mp3",
                 )
+                apply_sc = gr.Checkbox(
+                    label="✅ Activer la correction orthographique (Hunspell kabyle)",
+                    value=True,
+                    info="Corrige automatiquement les mots non reconnus par le dictionnaire kabyle"
+                )
+                transcribe_btn = gr.Button("🚀 Transcribe, Spellcheck & Translate", variant="primary", size="lg")
             with gr.Column(scale=2):
+                with gr.Row():
+                    text_output_raw = gr.Textbox(
+                        label="📝 Transcription brute (ASR)",
+                        lines=3,
+                        info="Sortie directe du modèle de reconnaissance vocale"
+                    )
+                    text_output_checked = gr.Textbox(
+                        label="✅ Transcription corrigée (Hunspell)",
+                        lines=3,
+                        info="Transcription après correction orthographique automatique"
+                    )
                 translation_output_1 = gr.Textbox(
                     label="LibreTranslate (English)",
+                    lines=3,
+                    placeholder="English translation will appear here..."
                 )
         transcribe_btn.click(
             fn=process_audio,
+            inputs=[audio_input, apply_sc],
+            outputs=[text_output_raw, text_output_checked, translation_output_1],
         )
         gr.Examples(
             with gr.Column(scale=2):
                 random_audio_player = gr.Audio(label="🎵 Selected Sample", interactive=False, autoplay=False)
+                with gr.Row():
+                    text_output_3_raw = gr.Textbox(label="Transcription brute (Kabyle)", lines=3)
+                    text_output_3_checked = gr.Textbox(label="Transcription corrigée (Hunspell)", lines=3)
                 translation_output_3 = gr.Textbox(
                     label="LibreTranslate (English)",
+                    lines=3,
                     placeholder="English LibreTranslate translation will appear here..."
                 )
         def process_random_with_status():
             """Stream status updates while a random dataset sample is processed.

             Each yielded 5-tuple is (status, audio_path, raw_transcript,
             spellchecked_transcript, translation).
             """
             no_text = ("", "", "")
             yield ("⏳ Fetching random sample...", None, *no_text)

             try:
                 sample_path = download_random_dataset_sample()
             except Exception as e:
                 # Download failed: report it and stop the stream.
                 yield (f"❌ Dataset Error: {str(e)}", None, *no_text)
             else:
                 # Show the player immediately, then fill in the text outputs.
                 yield ("⏳ Transcribing & spellchecking...", sample_path, *no_text)
                 raw, checked, english = process_audio(sample_path)
                 yield ("✅ Done!", sample_path, raw, checked, english)
         random_btn.click(
             fn=process_random_with_status,
             inputs=[],
+            outputs=[dataset_status, random_audio_player, text_output_3_raw, text_output_3_checked, translation_output_3],
         )
     gr.Markdown(
         ---
         Developed by [G1ya777](https://github.com/G1ya777/Mmeslay).
         Examples from Tatoeba (CC BY licenses).
+        Spellcheck powered by Hunspell kabyle dictionary.
         """
     )
         server_port=port,
         max_file_size=f"{MAX_SIZE_MB}mb",
         theme=gr.themes.Soft(),
+    )