boffire committed on
Commit
243e89b
·
verified ·
1 Parent(s): 669fa24

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +131 -74
app.py CHANGED
@@ -18,13 +18,14 @@ print("Loading model...")
18
  MODEL = SentenceTransformer("boffire/kabyle-sentence-transformer-mpnet")
19
  print("Model loaded")
20
 
21
- # Pre-load Tatoeba index for search
22
- print("Loading search index...")
23
  try:
24
  from datasets import load_dataset
25
  ds = load_dataset("Imsidag-community/english-kabyle-parallel", split="train")
26
- SEARCH_PAIRS = [(row["en"], row["kab"]) for row in ds.select(range(min(1000, len(ds))))]
27
- except:
 
28
  SEARCH_PAIRS = [
29
  ("Hello!", "Azul!"),
30
  ("How are you?", "Amek i telliḍ?"),
@@ -33,7 +34,10 @@ except:
33
  ("Water is life", "Aman d tudert"),
34
  ]
35
 
36
- SEARCH_EMBEDDINGS = None
 
 
 
37
 
38
  def get_embeddings(texts):
39
  return MODEL.encode(texts, convert_to_tensor=True)
@@ -42,37 +46,32 @@ def check_quality(en_text, kab_text):
42
  """Tab 1: Translation Quality Checker"""
43
  if not en_text.strip() or not kab_text.strip():
44
  return "Please enter both sentences", None
45
-
46
  emb = get_embeddings([en_text, kab_text])
47
  sim = F.cosine_similarity(emb[0].unsqueeze(0), emb[1].unsqueeze(0)).item()
48
-
49
  if sim > 0.85:
50
  quality = "Excellent match"
51
  elif sim > 0.6:
52
  quality = "Good match"
53
  else:
54
  quality = "Poor match"
55
-
56
- result = "Similarity: " + str(round(sim, 4)) + "\nQuality: " + quality
 
57
  return result, sim
58
 
59
  def search_similar(query, top_k=5):
60
- """Tab 2: Semantic Search"""
61
- global SEARCH_EMBEDDINGS
62
-
63
  if not query.strip():
64
  return "Please enter a query"
65
-
66
- if SEARCH_EMBEDDINGS is None:
67
- all_texts = [en for en, _ in SEARCH_PAIRS] + [kab for _, kab in SEARCH_PAIRS]
68
- SEARCH_EMBEDDINGS = get_embeddings(all_texts)
69
-
70
  query_emb = get_embeddings([query])
71
-
72
  # Search both English and Kabyle sides
73
  scores = F.cosine_similarity(query_emb, SEARCH_EMBEDDINGS).cpu().numpy()
74
  top_indices = np.argsort(scores)[::-1][:top_k]
75
-
76
  results = []
77
  seen = set()
78
  for idx in top_indices:
@@ -80,77 +79,100 @@ def search_similar(query, top_k=5):
80
  pair = SEARCH_PAIRS[idx]
81
  else:
82
  pair = SEARCH_PAIRS[idx - len(SEARCH_PAIRS)]
83
-
84
  key = pair[0] + " || " + pair[1]
85
  if key not in seen:
86
  seen.add(key)
87
- results.append(pair[1] + "\n (EN: " + pair[0] + ") -- Score: " + str(round(scores[idx], 4)))
88
-
89
- return "\n\n".join(results) if results else "No results found"
 
 
 
90
 
91
  def validate_csv(file):
92
  """Tab 3: Parallel Data Validator"""
93
  if file is None:
94
  return None, "Please upload a CSV file with 'en' and 'kab' columns"
95
-
96
  df = pd.read_csv(file.name)
97
  if "en" not in df.columns or "kab" not in df.columns:
98
  return None, "CSV must have 'en' and 'kab' columns"
99
-
100
  scores = []
101
  for _, row in df.iterrows():
102
  emb = get_embeddings([str(row["en"]), str(row["kab"])])
103
  sim = F.cosine_similarity(emb[0].unsqueeze(0), emb[1].unsqueeze(0)).item()
104
  scores.append(sim)
105
-
106
  df["similarity"] = scores
107
  df["quality"] = df["similarity"].apply(
108
  lambda s: "good" if s > 0.6 else "poor"
109
  )
110
-
111
  # Save result
112
  output_path = "/tmp/validated_pairs.csv"
113
  df.to_csv(output_path, index=False)
114
-
115
- summary = "Processed " + str(len(df)) + " pairs\n"
116
- summary += "Good quality: " + str(len(df[df["quality"]=="good"])) + "\n"
 
 
117
  summary += "Poor quality: " + str(len(df[df["quality"]=="poor"]))
118
-
119
  return output_path, summary
120
 
121
- # Build UI
122
- with gr.Blocks(title="Kabyle Semantic Toolkit") as demo:
123
  gr.Markdown("""
124
  # Kabyle Semantic Toolkit
125
-
126
- Powered by **boffire/kabyle-sentence-transformer-mpnet**
127
-
128
- This tool understands meaning, not just words. Use it to check translations,
129
  find similar sentences, or validate your parallel data.
130
  """)
131
-
132
  with gr.Tabs():
133
-
134
  # Tab 1: Quality Checker
135
  with gr.TabItem("Translation Quality"):
136
  gr.Markdown("Check if an English-Kabyle pair has similar meaning.")
137
-
138
  with gr.Row():
139
- with gr.Column():
140
- en_input = gr.Textbox(label="English", placeholder="Enter English text...")
141
- kab_input = gr.Textbox(label="Kabyle", placeholder="Enter Kabyle text...")
142
- check_btn = gr.Button("Check Quality", variant="primary")
143
-
144
- with gr.Column():
145
- result_text = gr.Textbox(label="Result", lines=3, interactive=False)
146
- score_bar = gr.Slider(0, 1, label="Similarity Score", interactive=False)
147
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  check_btn.click(
149
  fn=check_quality,
150
  inputs=[en_input, kab_input],
151
  outputs=[result_text, score_bar]
152
  )
153
-
154
  gr.Examples(
155
  examples=[
156
  ["Hello!", "Azul!"],
@@ -161,54 +183,89 @@ with gr.Blocks(title="Kabyle Semantic Toolkit") as demo:
161
  inputs=[en_input, kab_input],
162
  label="Try these examples"
163
  )
164
-
 
 
 
 
 
165
  # Tab 2: Similar Search
166
  with gr.TabItem("Similar Sentences"):
167
- gr.Markdown("Find Kabyle sentences similar to your query.")
168
-
169
- query_input = gr.Textbox(
170
- label="Query (English or Kabyle)",
171
- placeholder="Enter text to search..."
172
- )
173
- top_k_slider = gr.Slider(1, 10, value=5, step=1, label="Number of results")
174
- search_btn = gr.Button("Search", variant="primary")
175
- search_output = gr.Textbox(label="Results", lines=10, interactive=False)
176
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  search_btn.click(
178
  fn=search_similar,
179
  inputs=[query_input, top_k_slider],
180
  outputs=search_output
181
  )
182
-
183
  gr.Examples(
184
  examples=["How are you?", "Thank you", "Water is life"],
185
  inputs=query_input,
186
  label="Example queries"
187
  )
188
-
 
 
 
 
 
189
  # Tab 3: Data Validator
190
  with gr.TabItem("Data Validator"):
191
  gr.Markdown("Upload a CSV with 'en' and 'kab' columns to validate alignment quality.")
192
-
193
- file_input = gr.File(label="Upload CSV", file_types=[".csv"])
194
- validate_btn = gr.Button("Validate", variant="primary")
195
-
196
  with gr.Row():
197
- download_output = gr.File(label="Download Results")
198
- summary_output = gr.Textbox(label="Summary", lines=4, interactive=False)
199
-
 
 
 
 
 
 
 
 
 
 
 
 
200
  validate_btn.click(
201
  fn=validate_csv,
202
  inputs=file_input,
203
  outputs=[download_output, summary_output]
204
  )
205
-
206
  gr.Markdown("""
207
  ---
208
- **Related tools**:
209
- [LibreTranslate](https://imsidag-community-libretranslate-kabyle.hf.space/) |
210
  [MarianMT](https://huggingface.co/boffire/marianmt-en-kab)
211
  """)
212
 
213
  if __name__ == "__main__":
214
- demo.launch()
 
18
  MODEL = SentenceTransformer("boffire/kabyle-sentence-transformer-mpnet")
19
  print("Model loaded")
20
 
21
+ # Pre-load and pre-compute search index at startup
22
+ print("Pre-computing search index...")
23
  try:
24
  from datasets import load_dataset
25
  ds = load_dataset("Imsidag-community/english-kabyle-parallel", split="train")
26
+ SEARCH_PAIRS = [(row["en"], row["kab"]) for row in ds.select(range(min(500, len(ds))))]
27
+ except Exception as e:
28
+ print("Could not load dataset, using fallback: " + str(e))
29
  SEARCH_PAIRS = [
30
  ("Hello!", "Azul!"),
31
  ("How are you?", "Amek i telliḍ?"),
 
34
  ("Water is life", "Aman d tudert"),
35
  ]
36
 
37
+ # Pre-compute embeddings once at startup
38
+ _all_texts = [en for en, _ in SEARCH_PAIRS] + [kab for _, kab in SEARCH_PAIRS]
39
+ SEARCH_EMBEDDINGS = MODEL.encode(_all_texts, convert_to_tensor=True, show_progress_bar=False)
40
+ print("Search index ready: " + str(len(SEARCH_PAIRS)) + " pairs")
41
 
42
  def get_embeddings(texts):
43
  return MODEL.encode(texts, convert_to_tensor=True)
 
46
  """Tab 1: Translation Quality Checker"""
47
  if not en_text.strip() or not kab_text.strip():
48
  return "Please enter both sentences", None
49
+
50
  emb = get_embeddings([en_text, kab_text])
51
  sim = F.cosine_similarity(emb[0].unsqueeze(0), emb[1].unsqueeze(0)).item()
52
+
53
  if sim > 0.85:
54
  quality = "Excellent match"
55
  elif sim > 0.6:
56
  quality = "Good match"
57
  else:
58
  quality = "Poor match"
59
+
60
+ result = "Similarity: " + str(round(sim, 4)) + "
61
+ Quality: " + quality
62
  return result, sim
63
 
64
  def search_similar(query, top_k=5):
65
+ """Tab 2: Semantic Search - fast because embeddings are pre-computed"""
 
 
66
  if not query.strip():
67
  return "Please enter a query"
68
+
 
 
 
 
69
  query_emb = get_embeddings([query])
70
+
71
  # Search both English and Kabyle sides
72
  scores = F.cosine_similarity(query_emb, SEARCH_EMBEDDINGS).cpu().numpy()
73
  top_indices = np.argsort(scores)[::-1][:top_k]
74
+
75
  results = []
76
  seen = set()
77
  for idx in top_indices:
 
79
  pair = SEARCH_PAIRS[idx]
80
  else:
81
  pair = SEARCH_PAIRS[idx - len(SEARCH_PAIRS)]
82
+
83
  key = pair[0] + " || " + pair[1]
84
  if key not in seen:
85
  seen.add(key)
86
+ results.append(pair[1] + "
87
+ (EN: " + pair[0] + ") -- Score: " + str(round(scores[idx], 4)))
88
+
89
+ return "
90
+
91
+ ".join(results) if results else "No results found"
92
 
93
  def validate_csv(file):
94
  """Tab 3: Parallel Data Validator"""
95
  if file is None:
96
  return None, "Please upload a CSV file with 'en' and 'kab' columns"
97
+
98
  df = pd.read_csv(file.name)
99
  if "en" not in df.columns or "kab" not in df.columns:
100
  return None, "CSV must have 'en' and 'kab' columns"
101
+
102
  scores = []
103
  for _, row in df.iterrows():
104
  emb = get_embeddings([str(row["en"]), str(row["kab"])])
105
  sim = F.cosine_similarity(emb[0].unsqueeze(0), emb[1].unsqueeze(0)).item()
106
  scores.append(sim)
107
+
108
  df["similarity"] = scores
109
  df["quality"] = df["similarity"].apply(
110
  lambda s: "good" if s > 0.6 else "poor"
111
  )
112
+
113
  # Save result
114
  output_path = "/tmp/validated_pairs.csv"
115
  df.to_csv(output_path, index=False)
116
+
117
+ summary = "Processed " + str(len(df)) + " pairs
118
+ "
119
+ summary += "Good quality: " + str(len(df[df["quality"]=="good"])) + "
120
+ "
121
  summary += "Poor quality: " + str(len(df[df["quality"]=="poor"]))
122
+
123
  return output_path, summary
124
 
125
+ # Build UI with Soft theme
126
+ with gr.Blocks(title="Kabyle Semantic Toolkit", theme=gr.themes.Soft()) as demo:
127
  gr.Markdown("""
128
  # Kabyle Semantic Toolkit
129
+
130
+ Powered by [**boffire/kabyle-sentence-transformer-mpnet**](https://huggingface.co/boffire/kabyle-sentence-transformer-mpnet)
131
+
132
+ This tool understands meaning, not just words. Use it to check translations,
133
  find similar sentences, or validate your parallel data.
134
  """)
135
+
136
  with gr.Tabs():
137
+
138
  # Tab 1: Quality Checker
139
  with gr.TabItem("Translation Quality"):
140
  gr.Markdown("Check if an English-Kabyle pair has similar meaning.")
141
+
142
  with gr.Row():
143
+ with gr.Column(scale=2):
144
+ en_input = gr.Textbox(
145
+ label="English",
146
+ placeholder="Enter English text...",
147
+ lines=3
148
+ )
149
+ kab_input = gr.Textbox(
150
+ label="Kabyle",
151
+ placeholder="Enter Kabyle text...",
152
+ lines=3
153
+ )
154
+ with gr.Row():
155
+ clear_btn_1 = gr.Button("Clear", variant="secondary")
156
+ check_btn = gr.Button("Check Quality", variant="primary")
157
+
158
+ with gr.Column(scale=3):
159
+ result_text = gr.Textbox(
160
+ label="Result",
161
+ lines=3,
162
+ interactive=False
163
+ )
164
+ score_bar = gr.Slider(
165
+ 0, 1,
166
+ label="Similarity Score",
167
+ interactive=False
168
+ )
169
+
170
  check_btn.click(
171
  fn=check_quality,
172
  inputs=[en_input, kab_input],
173
  outputs=[result_text, score_bar]
174
  )
175
+
176
  gr.Examples(
177
  examples=[
178
  ["Hello!", "Azul!"],
 
183
  inputs=[en_input, kab_input],
184
  label="Try these examples"
185
  )
186
+
187
+ clear_btn_1.click(
188
+ fn=lambda: ("", "", "", None),
189
+ outputs=[en_input, kab_input, result_text, score_bar]
190
+ )
191
+
192
  # Tab 2: Similar Search
193
  with gr.TabItem("Similar Sentences"):
194
+ gr.Markdown("Find Kabyle sentences similar to your query. Search index is pre-loaded for instant results.")
195
+
196
+ with gr.Row():
197
+ with gr.Column(scale=2):
198
+ query_input = gr.Textbox(
199
+ label="Query (English or Kabyle)",
200
+ placeholder="Enter text to search...",
201
+ lines=3
202
+ )
203
+ top_k_slider = gr.Slider(
204
+ 1, 10,
205
+ value=5,
206
+ step=1,
207
+ label="Number of results"
208
+ )
209
+ with gr.Row():
210
+ clear_btn_2 = gr.Button("Clear", variant="secondary")
211
+ search_btn = gr.Button("Search", variant="primary")
212
+
213
+ with gr.Column(scale=3):
214
+ search_output = gr.Textbox(
215
+ label="Results",
216
+ lines=10,
217
+ interactive=False
218
+ )
219
+
220
  search_btn.click(
221
  fn=search_similar,
222
  inputs=[query_input, top_k_slider],
223
  outputs=search_output
224
  )
225
+
226
  gr.Examples(
227
  examples=["How are you?", "Thank you", "Water is life"],
228
  inputs=query_input,
229
  label="Example queries"
230
  )
231
+
232
+ clear_btn_2.click(
233
+ fn=lambda: ("", 5, ""),
234
+ outputs=[query_input, top_k_slider, search_output]
235
+ )
236
+
237
  # Tab 3: Data Validator
238
  with gr.TabItem("Data Validator"):
239
  gr.Markdown("Upload a CSV with 'en' and 'kab' columns to validate alignment quality.")
240
+
 
 
 
241
  with gr.Row():
242
+ with gr.Column(scale=2):
243
+ file_input = gr.File(
244
+ label="Upload CSV",
245
+ file_types=[".csv"]
246
+ )
247
+ validate_btn = gr.Button("Validate", variant="primary")
248
+
249
+ with gr.Column(scale=3):
250
+ summary_output = gr.Textbox(
251
+ label="Summary",
252
+ lines=4,
253
+ interactive=False
254
+ )
255
+ download_output = gr.File(label="Download Results")
256
+
257
  validate_btn.click(
258
  fn=validate_csv,
259
  inputs=file_input,
260
  outputs=[download_output, summary_output]
261
  )
262
+
263
  gr.Markdown("""
264
  ---
265
+ **Related tools**:
266
+ [LibreTranslate](https://imsidag-community-libretranslate-kabyle.hf.space/) |
267
  [MarianMT](https://huggingface.co/boffire/marianmt-en-kab)
268
  """)
269
 
270
  if __name__ == "__main__":
271
+ demo.launch()