boffire's picture
Update app.py
7a06b89 verified
#!/usr/bin/env python3
"""
Kabyle Semantic Toolkit
Hugging Face Space using boffire/kabyle-sentence-transformer-mpnet
"""
import warnings
warnings.filterwarnings("ignore")
import gradio as gr
import torch
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F
import os
# Instantiate the sentence-embedding model a single time for the whole process
print("Loading model...")
MODEL = SentenceTransformer("boffire/kabyle-sentence-transformer-mpnet")
print("Model loaded")

# Build the semantic-search index during startup so queries only embed the query
print("Pre-computing search index...")
try:
    from datasets import load_dataset

    ds = load_dataset("Imsidag-community/english-kabyle-parallel", split="train")
    # Index at most the first 500 rows of the train split
    _row_limit = min(500, len(ds))
    SEARCH_PAIRS = [
        (row["en"], row["kab"])
        for row in ds.select(range(_row_limit))
    ]
except Exception as exc:
    # Best-effort: any failure (import or download) falls back to a tiny sample
    print(f"Could not load dataset, using fallback: {exc}")
    SEARCH_PAIRS = [
        ("Hello!", "Azul!"),
        ("How are you?", "Amek i telliḍ?"),
        ("Thank you", "Tanemmirt"),
        ("Good morning", "Tifawin"),
        ("Water is life", "Aman d tudert"),
    ]

# Embed everything once: all English sides first, then all Kabyle sides, so
# index i and index i + len(SEARCH_PAIRS) both refer to SEARCH_PAIRS[i]
_all_texts = [pair[0] for pair in SEARCH_PAIRS]
_all_texts.extend(pair[1] for pair in SEARCH_PAIRS)
SEARCH_EMBEDDINGS = MODEL.encode(
    _all_texts,
    convert_to_tensor=True,
    show_progress_bar=False,
)
print(f"Search index ready: {len(SEARCH_PAIRS)} pairs")
def get_embeddings(texts):
    """Embed *texts* with the shared ``MODEL`` and return the result as a tensor.

    Args:
        texts: Sentences to embed; handed to ``MODEL.encode`` unchanged.

    Returns:
        The value produced by ``MODEL.encode(texts, convert_to_tensor=True)``.
    """
    embeddings = MODEL.encode(texts, convert_to_tensor=True)
    return embeddings
def check_quality(en_text, kab_text):
    """Tab 1: Translation Quality Checker.

    Embeds both sentences and compares them by cosine similarity.

    Args:
        en_text: English sentence. ``None`` is treated like an empty string.
        kab_text: Kabyle sentence. ``None`` is treated like an empty string.

    Returns:
        A ``(message, score)`` tuple. ``message`` reports the raw similarity
        (rounded to 4 decimals) and a quality label: "Excellent match" above
        0.85, "Good match" above 0.6, otherwise "Poor match". ``score`` is the
        similarity clamped to ``[0, 1]``, or ``None`` when either input is
        missing or blank.
    """
    # Guard against None as well as blank strings so .strip() cannot raise
    if not (en_text or "").strip() or not (kab_text or "").strip():
        return "Please enter both sentences", None

    emb = get_embeddings([en_text, kab_text])
    sim = F.cosine_similarity(emb[0].unsqueeze(0), emb[1].unsqueeze(0)).item()

    if sim > 0.85:
        quality = "Excellent match"
    elif sim > 0.6:
        quality = "Good match"
    else:
        quality = "Poor match"

    result = f"Similarity: {round(sim, 4)}{os.linesep}Quality: {quality}"
    # Cosine similarity lies in [-1, 1] but the score output is a 0-1 slider;
    # clamp the displayed value while the text keeps the raw number
    score = min(1.0, max(0.0, sim))
    return result, score
def search_similar(query, top_k=5):
    """Tab 2: Semantic Search - fast because embeddings are pre-computed.

    Scores the query against every row of ``SEARCH_EMBEDDINGS`` and reports
    the best-matching sentence pairs. Indices at or beyond
    ``len(SEARCH_PAIRS)`` are mapped back to the same pair list, so a pair can
    be hit through either of its two embeddings but is listed only once.

    Args:
        query: Text to search for. ``None`` or blank returns a prompt message.
        top_k: Maximum number of distinct pairs to report. Coerced to ``int``
            because a UI slider may deliver a float.

    Returns:
        A string with one entry per pair (Kabyle text, English text, score),
        entries separated by a blank line; or a message when the query is
        blank or nothing matched.
    """
    if not query or not query.strip():
        return "Please enter a query"

    limit = int(top_k)
    query_emb = get_embeddings([query])
    # Search both English and Kabyle sides
    scores = F.cosine_similarity(query_emb, SEARCH_EMBEDDINGS).cpu().numpy()

    pair_count = len(SEARCH_PAIRS)
    results = []
    seen = set()
    # Walk the whole ranking rather than only its first `limit` entries:
    # truncating before de-duplication returned fewer than `limit` pairs
    # whenever both sides of one pair ranked near the top.
    for idx in np.argsort(scores)[::-1]:
        if len(results) >= limit:
            break
        idx = int(idx)
        pair = SEARCH_PAIRS[idx] if idx < pair_count else SEARCH_PAIRS[idx - pair_count]
        key = (pair[0], pair[1])
        if key in seen:
            continue
        seen.add(key)
        score = round(float(scores[idx]), 4)
        results.append(f"{pair[1]}{os.linesep} (EN: {pair[0]}) -- Score: {score}")

    return (os.linesep + os.linesep).join(results) if results else "No results found"
def validate_csv(file):
    """Tab 3: Parallel Data Validator.

    Reads an uploaded CSV, scores every ``en``/``kab`` row by the cosine
    similarity of the two sentence embeddings, labels rows scoring above 0.6
    as "good" (otherwise "poor"), and writes the annotated table to a new CSV.

    Args:
        file: The upload - either an object whose ``.name`` is the path on
            disk, or the path itself as a string. ``None`` means nothing was
            uploaded.

    Returns:
        A ``(output_path, summary)`` tuple. ``output_path`` is the path of the
        annotated CSV, or ``None`` when validation could not run, in which
        case ``summary`` explains why.
    """
    import tempfile  # only needed here; keeps this function self-contained

    if file is None:
        return None, "Please upload a CSV file with 'en' and 'kab' columns"

    # Accept both upload objects exposing ``.name`` and plain path strings
    csv_path = getattr(file, "name", file)
    try:
        df = pd.read_csv(csv_path)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        # Report unreadable uploads the same way as other input problems
        return None, "Could not read CSV: " + str(exc)

    if "en" not in df.columns or "kab" not in df.columns:
        return None, "CSV must have 'en' and 'kab' columns"

    en_texts = df["en"].astype(str).tolist()
    kab_texts = df["kab"].astype(str).tolist()

    # Encode in bounded batches instead of one model call per row: far fewer
    # calls, while memory stays independent of the file size
    batch_rows = 256
    scores = []
    for start in range(0, len(en_texts), batch_rows):
        en_batch = en_texts[start:start + batch_rows]
        kab_batch = kab_texts[start:start + batch_rows]
        emb = get_embeddings(en_batch + kab_batch)
        half = len(en_batch)
        # Row i of the first half is compared with row i of the second half
        scores.extend(F.cosine_similarity(emb[:half], emb[half:]).cpu().tolist())

    df["similarity"] = scores
    df["quality"] = ["good" if s > 0.6 else "poor" for s in scores]

    # Save result to a unique temp file: a single fixed path would let
    # concurrent requests overwrite each other's download
    fd, output_path = tempfile.mkstemp(prefix="validated_pairs_", suffix=".csv")
    os.close(fd)
    df.to_csv(output_path, index=False)

    good_count = sum(1 for s in scores if s > 0.6)
    summary = (
        f"Processed {len(df)} pairs{os.linesep}"
        f"Good quality: {good_count}{os.linesep}"
        f"Poor quality: {len(df) - good_count}"
    )
    return output_path, summary
# Build UI with Soft theme
# Assembles one Blocks app with three tabs (translation quality check,
# similar-sentence search, CSV validation) and launches it when run as a script.
# NOTE(review): this copy has lost its indentation; the nesting of the `with`
# blocks below must be restored before the file can run - confirm the intended
# layout against the deployed app.
with gr.Blocks(title="Kabyle Semantic Toolkit", theme=gr.themes.Soft()) as demo:
# Page header shown above the tabs
gr.Markdown("""
# Kabyle Semantic Toolkit
Powered by [**boffire/kabyle-sentence-transformer-mpnet**](https://huggingface.co/boffire/kabyle-sentence-transformer-mpnet)
This tool understands meaning, not just words. Use it to check translations,
find similar sentences, or validate your parallel data.
""")
with gr.Tabs():
# Tab 1: Quality Checker
with gr.TabItem("Translation Quality"):
gr.Markdown("Check if an English-Kabyle pair has similar meaning.")
with gr.Row():
# Input column: the two sentences plus the action buttons
with gr.Column(scale=2):
en_input = gr.Textbox(
label="English",
placeholder="Enter English text...",
lines=3
)
kab_input = gr.Textbox(
label="Kabyle",
placeholder="Enter Kabyle text...",
lines=3
)
with gr.Row():
clear_btn_1 = gr.Button("Clear", variant="secondary")
check_btn = gr.Button("Check Quality", variant="primary")
# Output column: read-only result text and a 0-1 score slider
with gr.Column(scale=3):
result_text = gr.Textbox(
label="Result",
lines=3,
interactive=False
)
score_bar = gr.Slider(
0, 1,
label="Similarity Score",
interactive=False
)
# check_quality returns (message, score); they fill result_text and score_bar
check_btn.click(
fn=check_quality,
inputs=[en_input, kab_input],
outputs=[result_text, score_bar]
)
# Clickable sample pairs; the last row reuses the Kabyle sentence from the
# "The computer works." row with "Hello!" - presumably a deliberate mismatch
gr.Examples(
examples=[
["Hello!", "Azul!"],
["The computer works.", "Aselkim iteddu."],
["I love you.", "Hemmleɣ-kent."],
["Hello!", "Aselkim iteddu."],
],
inputs=[en_input, kab_input],
label="Try these examples"
)
# Reset both inputs and the result text; the slider receives None
clear_btn_1.click(
fn=lambda: ("", "", "", None),
outputs=[en_input, kab_input, result_text, score_bar]
)
# Tab 2: Similar Search
with gr.TabItem("Similar Sentences"):
gr.Markdown("Find Kabyle sentences similar to your query. Search index is pre-loaded for instant results.")
with gr.Row():
# Input column: query text, result-count slider (1-10, default 5), buttons
with gr.Column(scale=2):
query_input = gr.Textbox(
label="Query (English or Kabyle)",
placeholder="Enter text to search...",
lines=3
)
top_k_slider = gr.Slider(
1, 10,
value=5,
step=1,
label="Number of results"
)
with gr.Row():
clear_btn_2 = gr.Button("Clear", variant="secondary")
search_btn = gr.Button("Search", variant="primary")
# Output column: read-only text area for the formatted matches
with gr.Column(scale=3):
search_output = gr.Textbox(
label="Results",
lines=10,
interactive=False
)
# search_similar receives (query, top_k) and returns one formatted string
search_btn.click(
fn=search_similar,
inputs=[query_input, top_k_slider],
outputs=search_output
)
# Sample queries; the same English sentences appear in the fallback pair list
gr.Examples(
examples=["How are you?", "Thank you", "Water is life"],
inputs=query_input,
label="Example queries"
)
# Reset query and results; put the slider back to 5, its initial value
clear_btn_2.click(
fn=lambda: ("", 5, ""),
outputs=[query_input, top_k_slider, search_output]
)
# Tab 3: Data Validator
with gr.TabItem("Data Validator"):
gr.Markdown("Upload a CSV with 'en' and 'kab' columns to validate alignment quality.")
with gr.Row():
# Input column: CSV upload restricted to .csv files, plus the run button
with gr.Column(scale=2):
file_input = gr.File(
label="Upload CSV",
file_types=[".csv"]
)
validate_btn = gr.Button("Validate", variant="primary")
# Output column: summary text and the downloadable annotated CSV
with gr.Column(scale=3):
summary_output = gr.Textbox(
label="Summary",
lines=4,
interactive=False
)
download_output = gr.File(label="Download Results")
# validate_csv returns (path, summary) - note the order matches `outputs`
validate_btn.click(
fn=validate_csv,
inputs=file_input,
outputs=[download_output, summary_output]
)
# Footer with links to related tools
gr.Markdown("""
---
**Related tools**:
[LibreTranslate](https://imsidag-community-libretranslate-kabyle.hf.space/) |
[MarianMT](https://huggingface.co/boffire/marianmt-en-kab)
""")
# Start the Gradio server only when this file is executed as a script
if __name__ == "__main__":
demo.launch()