Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Kabyle Semantic Toolkit | |
| Hugging Face Space using boffire/kabyle-sentence-transformer-mpnet | |
| """ | |
| import warnings | |
| warnings.filterwarnings("ignore") | |
| import gradio as gr | |
| import torch | |
| import numpy as np | |
| import pandas as pd | |
| from sentence_transformers import SentenceTransformer | |
| import torch.nn.functional as F | |
| import os | |
# Instantiate the sentence-embedding model a single time at import.
print("Loading model...")
MODEL = SentenceTransformer("boffire/kabyle-sentence-transformer-mpnet")
print("Model loaded")

# Build the semantic-search corpus at import time so that a search request
# only has to encode the query text.
print("Pre-computing search index...")
try:
    from datasets import load_dataset

    ds = load_dataset("Imsidag-community/english-kabyle-parallel", split="train")
    # Only the first 500 rows (or fewer, if the split is smaller) are indexed.
    SEARCH_PAIRS = [
        (row["en"], row["kab"])
        for row in ds.select(range(min(500, len(ds))))
    ]
except Exception as e:
    # Deliberate best-effort: any exception raised while importing or loading
    # the dataset falls back to a tiny built-in corpus.
    print(f"Could not load dataset, using fallback: {e}")
    SEARCH_PAIRS = [
        ("Hello!", "Azul!"),
        ("How are you?", "Amek i telliḍ?"),
        ("Thank you", "Tanemmirt"),
        ("Good morning", "Tifawin"),
        ("Water is life", "Aman d tudert"),
    ]

# Embed everything once. Layout of the index: all English sides first,
# followed by all Kabyle sides, each in SEARCH_PAIRS order.
_all_texts = [pair[side] for side in (0, 1) for pair in SEARCH_PAIRS]
SEARCH_EMBEDDINGS = MODEL.encode(
    _all_texts,
    convert_to_tensor=True,
    show_progress_bar=False,
)
print(f"Search index ready: {len(SEARCH_PAIRS)} pairs")
def get_embeddings(texts):
    """Encode ``texts`` with the module-level ``MODEL`` and return the tensor result."""
    embeddings = MODEL.encode(texts, convert_to_tensor=True)
    return embeddings
def check_quality(en_text, kab_text):
    """Tab 1: Translation Quality Checker.

    Embeds the English and Kabyle sentences and compares them with cosine
    similarity.

    Args:
        en_text: English sentence; ``None`` is treated like an empty string.
        kab_text: Kabyle sentence; ``None`` is treated like an empty string.

    Returns:
        A ``(message, score)`` tuple. ``message`` reports the raw similarity
        (rounded to 4 decimals) and a quality label. ``score`` is the
        similarity clamped to ``[0, 1]`` so it fits the 0-1 slider it is
        displayed on, or ``None`` when either input is missing or blank.
    """
    # Guard against None as well as empty/whitespace-only input so that a
    # missing value yields the friendly message instead of an AttributeError.
    if not (en_text or "").strip() or not (kab_text or "").strip():
        return "Please enter both sentences", None

    emb = get_embeddings([en_text, kab_text])
    sim = F.cosine_similarity(emb[0].unsqueeze(0), emb[1].unsqueeze(0)).item()

    if sim > 0.85:
        quality = "Excellent match"
    elif sim > 0.6:
        quality = "Good match"
    else:
        quality = "Poor match"

    result = "Similarity: " + str(round(sim, 4)) + os.linesep + "Quality: " + quality

    # Cosine similarity can be negative, or marginally above 1.0 because of
    # floating-point error; the text keeps the raw value, the slider value is
    # clamped to the slider's own range.
    slider_value = min(max(sim, 0.0), 1.0)
    return result, slider_value
def search_similar(query, top_k=5):
    """Tab 2: Semantic Search - fast because embeddings are pre-computed.

    Scores the query against ``SEARCH_EMBEDDINGS`` (English sides followed by
    Kabyle sides of ``SEARCH_PAIRS``) and returns the best matching pairs.

    Args:
        query: Text to search for; ``None`` or blank yields a prompt message.
        top_k: Maximum number of distinct pairs to return. Floats are
            truncated to int and negative values are treated as 0.

    Returns:
        A string with one entry per matching pair (Kabyle text, English text
        and score), entries separated by a blank line; or a message when the
        query is empty or nothing matched.
    """
    if not query or not query.strip():
        return "Please enter a query"

    # A UI slider may hand over a float; counting results needs an int.
    limit = max(int(top_k), 0)
    pair_count = len(SEARCH_PAIRS)

    query_emb = get_embeddings([query])
    # Search both English and Kabyle sides
    scores = F.cosine_similarity(query_emb, SEARCH_EMBEDDINGS).cpu().numpy()

    results = []
    seen = set()
    # Every pair appears twice in the index (EN side and KAB side). Walk the
    # whole ranking and stop once `limit` *distinct* pairs were collected;
    # truncating to the first `top_k` hits before de-duplicating would return
    # fewer results than requested whenever both sides of a pair rank highly.
    for idx in np.argsort(scores)[::-1]:
        if len(results) >= limit:
            break
        # Indices >= pair_count belong to the Kabyle half of the index.
        pair = SEARCH_PAIRS[idx % pair_count]
        key = (pair[0], pair[1])
        if key in seen:
            continue
        seen.add(key)
        results.append(
            pair[1] + os.linesep
            + " (EN: " + pair[0] + ") -- Score: " + str(round(scores[idx], 4))
        )

    return (os.linesep + os.linesep).join(results) if results else "No results found"
def validate_csv(file):
    """Tab 3: Parallel Data Validator.

    Reads a CSV with ``en`` and ``kab`` columns, scores every row with the
    cosine similarity of the two sentence embeddings, and writes the
    annotated table to a new temporary CSV file.

    Args:
        file: The uploaded file. Either an object exposing the path as
            ``.name`` or a plain path string; ``None`` when nothing was
            uploaded.

    Returns:
        A ``(output_path, summary)`` tuple. ``output_path`` is the path of the
        result CSV (with added ``similarity`` and ``quality`` columns), or
        ``None`` when the input is missing, unreadable, or lacks the required
        columns; ``summary`` is a human-readable message.
    """
    import tempfile  # local import: only this function needs it

    if file is None:
        return None, "Please upload a CSV file with 'en' and 'kab' columns"

    # Accept both upload objects (path in ``.name``) and bare path strings.
    csv_path = getattr(file, "name", file)
    try:
        df = pd.read_csv(csv_path)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        # Report an unreadable upload as a message instead of raising.
        return None, "Could not read CSV: " + str(exc)

    if "en" not in df.columns or "kab" not in df.columns:
        return None, "CSV must have 'en' and 'kab' columns"

    en_texts = [str(value) for value in df["en"]]
    kab_texts = [str(value) for value in df["kab"]]

    if en_texts:
        # Encode all sentences in one call instead of one model call per row;
        # the first half of the result is the EN side, the second the KAB side.
        emb = get_embeddings(en_texts + kab_texts)
        half = len(en_texts)
        scores = F.cosine_similarity(emb[:half], emb[half:], dim=1).tolist()
    else:
        scores = []

    df["similarity"] = scores
    df["quality"] = ["good" if s > 0.6 else "poor" for s in scores]

    # Save result to a unique temp file: a fixed path would be shared by all
    # concurrent requests, letting one request overwrite another's download.
    with tempfile.NamedTemporaryFile(
        mode="w",
        prefix="validated_pairs_",
        suffix=".csv",
        delete=False,
        newline="",
        encoding="utf-8",
    ) as handle:
        df.to_csv(handle, index=False)
    output_path = handle.name

    good_count = int((df["quality"] == "good").sum())
    poor_count = int((df["quality"] == "poor").sum())
    summary = "Processed " + str(len(df)) + " pairs" + os.linesep
    summary += "Good quality: " + str(good_count) + os.linesep
    summary += "Poor quality: " + str(poor_count)
    return output_path, summary
# Build UI with Soft theme
# NOTE(review): the nesting below was reconstructed from a copy of the file
# that had lost its indentation -- confirm it against the deployed layout.
with gr.Blocks(title="Kabyle Semantic Toolkit", theme=gr.themes.Soft()) as demo:
    # Page header shown above the tabs.
    gr.Markdown("""
# Kabyle Semantic Toolkit
Powered by [**boffire/kabyle-sentence-transformer-mpnet**](https://huggingface.co/boffire/kabyle-sentence-transformer-mpnet)
This tool understands meaning, not just words. Use it to check translations,
find similar sentences, or validate your parallel data.
""")
    with gr.Tabs():
        # Tab 1: Quality Checker
        with gr.TabItem("Translation Quality"):
            gr.Markdown("Check if an English-Kabyle pair has similar meaning.")
            with gr.Row():
                # Left column: the two input sentences and the action buttons.
                with gr.Column(scale=2):
                    en_input = gr.Textbox(
                        label="English",
                        placeholder="Enter English text...",
                        lines=3
                    )
                    kab_input = gr.Textbox(
                        label="Kabyle",
                        placeholder="Enter Kabyle text...",
                        lines=3
                    )
                    with gr.Row():
                        clear_btn_1 = gr.Button("Clear", variant="secondary")
                        check_btn = gr.Button("Check Quality", variant="primary")
                # Right column: read-only result text and a 0-1 score slider.
                with gr.Column(scale=3):
                    result_text = gr.Textbox(
                        label="Result",
                        lines=3,
                        interactive=False
                    )
                    score_bar = gr.Slider(
                        0, 1,
                        label="Similarity Score",
                        interactive=False
                    )
            # check_quality returns (message, score), mapped onto the two outputs.
            check_btn.click(
                fn=check_quality,
                inputs=[en_input, kab_input],
                outputs=[result_text, score_bar]
            )
            # Each example row fills the English and Kabyle inputs, in that order.
            gr.Examples(
                examples=[
                    ["Hello!", "Azul!"],
                    ["The computer works.", "Aselkim iteddu."],
                    ["I love you.", "Hemmleɣ-kent."],
                    ["Hello!", "Aselkim iteddu."],
                ],
                inputs=[en_input, kab_input],
                label="Try these examples"
            )
            # Clear empties both inputs and the result text and resets the slider.
            clear_btn_1.click(
                fn=lambda: ("", "", "", None),
                outputs=[en_input, kab_input, result_text, score_bar]
            )
        # Tab 2: Similar Search
        with gr.TabItem("Similar Sentences"):
            gr.Markdown("Find Kabyle sentences similar to your query. Search index is pre-loaded for instant results.")
            with gr.Row():
                # Left column: query text, result-count slider and action buttons.
                with gr.Column(scale=2):
                    query_input = gr.Textbox(
                        label="Query (English or Kabyle)",
                        placeholder="Enter text to search...",
                        lines=3
                    )
                    top_k_slider = gr.Slider(
                        1, 10,
                        value=5,
                        step=1,
                        label="Number of results"
                    )
                    with gr.Row():
                        clear_btn_2 = gr.Button("Clear", variant="secondary")
                        search_btn = gr.Button("Search", variant="primary")
                # Right column: read-only results box.
                with gr.Column(scale=3):
                    search_output = gr.Textbox(
                        label="Results",
                        lines=10,
                        interactive=False
                    )
            # The slider value is passed to search_similar as top_k.
            search_btn.click(
                fn=search_similar,
                inputs=[query_input, top_k_slider],
                outputs=search_output
            )
            gr.Examples(
                examples=["How are you?", "Thank you", "Water is life"],
                inputs=query_input,
                label="Example queries"
            )
            # Clear empties the query and results and puts the slider back to 5.
            clear_btn_2.click(
                fn=lambda: ("", 5, ""),
                outputs=[query_input, top_k_slider, search_output]
            )
        # Tab 3: Data Validator
        with gr.TabItem("Data Validator"):
            gr.Markdown("Upload a CSV with 'en' and 'kab' columns to validate alignment quality.")
            with gr.Row():
                # Left column: CSV upload and the validate button.
                with gr.Column(scale=2):
                    file_input = gr.File(
                        label="Upload CSV",
                        file_types=[".csv"]
                    )
                    validate_btn = gr.Button("Validate", variant="primary")
                # Right column: textual summary and the downloadable result file.
                with gr.Column(scale=3):
                    summary_output = gr.Textbox(
                        label="Summary",
                        lines=4,
                        interactive=False
                    )
                    download_output = gr.File(label="Download Results")
            # validate_csv returns (output_path, summary) -- note the output order.
            validate_btn.click(
                fn=validate_csv,
                inputs=file_input,
                outputs=[download_output, summary_output]
            )
    # Footer with links to related tools.
    gr.Markdown("""
---
**Related tools**:
[LibreTranslate](https://imsidag-community-libretranslate-kabyle.hf.space/) |
[MarianMT](https://huggingface.co/boffire/marianmt-en-kab)
""")

# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()