boffire committed on
Commit
a50a91d
·
verified ·
1 Parent(s): 6ef958c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +223 -0
app.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Kabyle Semantic Toolkit
4
+ Hugging Face Space using boffire/kabyle-sentence-transformer-mpnet
5
+ """
6
+
7
+ import warnings
8
+ warnings.filterwarnings("ignore")
9
+ import gradio as gr
10
+ import torch
11
+ import numpy as np
12
+ import pandas as pd
13
+ from sentence_transformers import SentenceTransformer
14
+ import torch.nn.functional as F
15
+
16
# Load the sentence-embedding model once at import time so that every
# request handler below reuses the same instance.
print("Loading model...")
MODEL = SentenceTransformer("boffire/kabyle-sentence-transformer-mpnet")
print("Model loaded")

# Pre-load a sample of English/Kabyle sentence pairs used as the corpus for
# the semantic-search tab. At most the first 1000 rows of the dataset are kept.
print("Loading search index...")
try:
    from datasets import load_dataset

    ds = load_dataset("Imsidag-community/english-kabyle-parallel", split="train")
    SEARCH_PAIRS = [
        (row["en"], row["kab"]) for row in ds.select(range(min(1000, len(ds))))
    ]
except Exception as exc:
    # Best-effort: a missing `datasets` package, a network failure or an
    # unexpected schema must not prevent the app from starting. Unlike a bare
    # `except:`, this lets KeyboardInterrupt/SystemExit propagate, and the
    # reason for the fallback is reported instead of being silently dropped.
    print("Could not load search dataset, using built-in examples:", exc)
    SEARCH_PAIRS = [
        ("Hello!", "Azul!"),
        ("How are you?", "Amek i telliḍ?"),
        ("Thank you", "Tanemmirt"),
        # NOTE(review): verify this gloss with a Kabyle speaker.
        ("Good morning", "Bessif"),
        ("Water is life", "Aman d tudert"),
    ]

# Embeddings for SEARCH_PAIRS (English side followed by Kabyle side).
# Left as None here and filled in by search_similar() on its first call.
SEARCH_EMBEDDINGS = None
37
+
38
def get_embeddings(texts):
    """Encode *texts* with the module-level sentence-transformer MODEL.

    Args:
        texts: The sentences to embed; passed straight through to
            ``MODEL.encode``.

    Returns:
        Whatever ``MODEL.encode(texts, convert_to_tensor=True)`` returns
        (callers in this file index it per input sentence).
    """
    encoded = MODEL.encode(texts, convert_to_tensor=True)
    return encoded
40
+
41
def check_quality(en_text, kab_text):
    """Tab 1: Translation Quality Checker.

    Embeds both sentences and reports their cosine similarity together with
    a coarse quality label.

    Args:
        en_text: The English sentence.
        kab_text: The Kabyle sentence.

    Returns:
        A ``(message, similarity)`` tuple. ``similarity`` is a float, or
        ``None`` when either input is missing or blank (in which case
        ``message`` asks the user to fill in both fields).
    """
    # `or ""` guards against a None value coming from the UI component.
    if not (en_text or "").strip() or not (kab_text or "").strip():
        return "Please enter both sentences", None

    emb = get_embeddings([en_text, kab_text])
    sim = F.cosine_similarity(emb[0].unsqueeze(0), emb[1].unsqueeze(0)).item()

    # Thresholds: > 0.85 excellent, > 0.6 good, anything else poor.
    if sim > 0.85:
        quality = "Excellent match"
    elif sim > 0.6:
        quality = "Good match"
    else:
        quality = "Poor match"

    # The line break must be written as an escape: a raw newline inside a
    # single-quoted string literal is a SyntaxError.
    result = f"Similarity: {round(sim, 4)}\nQuality: {quality}"
    return result, sim
62
+
63
def search_similar(query, top_k=5):
    """Tab 2: Semantic Search.

    Ranks every sentence in SEARCH_PAIRS (both the English and the Kabyle
    side) by cosine similarity to *query* and returns the best-matching
    pairs as display text.

    Args:
        query: Free text in English or Kabyle.
        top_k: Maximum number of distinct pairs to return. Coerced to an
            int of at least 1, since a UI slider may deliver a float.

    Returns:
        A string with one entry per pair (Kabyle sentence, English sentence
        and score) separated by blank lines, or a short message when the
        query is blank or nothing can be returned.
    """
    global SEARCH_EMBEDDINGS

    if not (query or "").strip():
        return "Please enter a query"

    if not SEARCH_PAIRS:
        return "No results found"

    top_k = max(1, int(top_k))

    # The corpus embeddings are computed once and cached in the module global.
    if SEARCH_EMBEDDINGS is None:
        all_texts = [en for en, _ in SEARCH_PAIRS] + [kab for _, kab in SEARCH_PAIRS]
        SEARCH_EMBEDDINGS = get_embeddings(all_texts)

    query_emb = get_embeddings([query])

    # Search both English and Kabyle sides.
    scores = F.cosine_similarity(query_emb, SEARCH_EMBEDDINGS).cpu().numpy()
    ranked = np.argsort(scores)[::-1]

    num_pairs = len(SEARCH_PAIRS)
    results = []
    seen = set()
    # Walk the full ranking and stop once top_k distinct pairs are collected.
    # Truncating to top_k *before* de-duplicating would return fewer results
    # whenever a pair is hit on both its English and its Kabyle side.
    for idx in ranked:
        # Indices >= num_pairs address the Kabyle half of the corpus; both
        # halves map back to the same pair.
        en, kab = SEARCH_PAIRS[int(idx) % num_pairs]
        key = (en, kab)
        if key in seen:
            continue
        seen.add(key)
        score = round(float(scores[idx]), 4)
        results.append(f"{kab}\n(EN: {en}) -- Score: {score}")
        if len(results) == top_k:
            break

    return "\n\n".join(results) if results else "No results found"
97
+
98
def validate_csv(file):
    """Tab 3: Parallel Data Validator.

    Reads a CSV with 'en' and 'kab' columns, scores every row by the cosine
    similarity of the two sentence embeddings, labels each row "good"
    (similarity > 0.6) or "poor", and writes the annotated table to a new
    temporary CSV file.

    Args:
        file: The uploaded file, either a path string or an object whose
            ``name`` attribute is the path. ``None`` means nothing was
            uploaded.

    Returns:
        A ``(output_path, summary)`` tuple. ``output_path`` is ``None`` when
        the input is missing, unreadable, or lacks the required columns; in
        that case ``summary`` holds the explanation.
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    if file is None:
        return None, "Please upload a CSV file with 'en' and 'kab' columns"

    csv_path = file if isinstance(file, str) else file.name
    try:
        df = pd.read_csv(csv_path)
    except (OSError, ValueError) as exc:
        # pandas' parser/empty-data errors and decoding errors derive from
        # ValueError; a missing or unreadable file raises OSError.
        return None, "Could not read CSV: " + str(exc)

    if "en" not in df.columns or "kab" not in df.columns:
        return None, "CSV must have 'en' and 'kab' columns"

    if df.empty:
        scores = []
    else:
        # Encode each column in a single batch instead of one model call per row.
        en_emb = get_embeddings(df["en"].astype(str).tolist())
        kab_emb = get_embeddings(df["kab"].astype(str).tolist())
        scores = F.cosine_similarity(en_emb, kab_emb, dim=1).cpu().tolist()

    quality = ["good" if s > 0.6 else "poor" for s in scores]
    df["similarity"] = pd.Series(scores, index=df.index, dtype="float64")
    df["quality"] = pd.Series(quality, index=df.index, dtype="object")

    # Save the result under a unique name so that concurrent requests do not
    # overwrite each other's download.
    with tempfile.NamedTemporaryFile(
        prefix="validated_pairs_", suffix=".csv", delete=False
    ) as tmp:
        output_path = tmp.name
    df.to_csv(output_path, index=False)

    good_count = quality.count("good")
    poor_count = len(quality) - good_count
    summary = (
        f"Processed {len(df)} pairs\n"
        f"Good quality: {good_count}\n"
        f"Poor quality: {poor_count}"
    )

    return output_path, summary
129
+
130
+ # Build UI
131
+ with gr.Blocks(title="Kabyle Semantic Toolkit") as demo:
132
+ gr.Markdown("""
133
+ # Kabyle Semantic Toolkit
134
+
135
+ Powered by **boffire/kabyle-sentence-transformer-mpnet**
136
+
137
+ This tool understands meaning, not just words. Use it to check translations,
138
+ find similar sentences, or validate your parallel data.
139
+ """)
140
+
141
+ with gr.Tabs():
142
+
143
+ # Tab 1: Quality Checker
144
+ with gr.TabItem("Translation Quality"):
145
+ gr.Markdown("Check if an English-Kabyle pair has similar meaning.")
146
+
147
+ with gr.Row():
148
+ with gr.Column():
149
+ en_input = gr.Textbox(label="English", placeholder="Enter English text...")
150
+ kab_input = gr.Textbox(label="Kabyle", placeholder="Enter Kabyle text...")
151
+ check_btn = gr.Button("Check Quality", variant="primary")
152
+
153
+ with gr.Column():
154
+ result_text = gr.Textbox(label="Result", lines=3, interactive=False)
155
+ score_bar = gr.Slider(0, 1, label="Similarity Score", interactive=False)
156
+
157
+ check_btn.click(
158
+ fn=check_quality,
159
+ inputs=[en_input, kab_input],
160
+ outputs=[result_text, score_bar]
161
+ )
162
+
163
+ gr.Examples(
164
+ examples=[
165
+ ["Hello!", "Azul!"],
166
+ ["The computer works.", "Aselkim iteddu."],
167
+ ["I love you.", "Hemmleɣ-k."],
168
+ ["Hello!", "Aselkim iteddu."],
169
+ ],
170
+ inputs=[en_input, kab_input],
171
+ label="Try these examples"
172
+ )
173
+
174
+ # Tab 2: Similar Search
175
+ with gr.TabItem("Similar Sentences"):
176
+ gr.Markdown("Find Kabyle sentences similar to your query.")
177
+
178
+ query_input = gr.Textbox(
179
+ label="Query (English or Kabyle)",
180
+ placeholder="Enter text to search..."
181
+ )
182
+ top_k_slider = gr.Slider(1, 10, value=5, step=1, label="Number of results")
183
+ search_btn = gr.Button("Search", variant="primary")
184
+ search_output = gr.Textbox(label="Results", lines=10, interactive=False)
185
+
186
+ search_btn.click(
187
+ fn=search_similar,
188
+ inputs=[query_input, top_k_slider],
189
+ outputs=search_output
190
+ )
191
+
192
+ gr.Examples(
193
+ examples=["How are you?", "Thank you", "Water is life"],
194
+ inputs=query_input,
195
+ label="Example queries"
196
+ )
197
+
198
+ # Tab 3: Data Validator
199
+ with gr.TabItem("Data Validator"):
200
+ gr.Markdown("Upload a CSV with 'en' and 'kab' columns to validate alignment quality.")
201
+
202
+ file_input = gr.File(label="Upload CSV", file_types=[".csv"])
203
+ validate_btn = gr.Button("Validate", variant="primary")
204
+
205
+ with gr.Row():
206
+ download_output = gr.File(label="Download Results")
207
+ summary_output = gr.Textbox(label="Summary", lines=4, interactive=False)
208
+
209
+ validate_btn.click(
210
+ fn=validate_csv,
211
+ inputs=file_input,
212
+ outputs=[download_output, summary_output]
213
+ )
214
+
215
+ gr.Markdown("""
216
+ ---
217
+ **Related tools**:
218
+ [LibreTranslate](https://imsidag-community-libretranslate-kabyle.hf.space/) |
219
+ [MarianMT](https://huggingface.co/boffire/marianmt-en-kab)
220
+ """)
221
+
222
+ if __name__ == "__main__":
223
+ demo.launch()