boffire committed on
Commit
7a06b89
·
verified ·
1 Parent(s): 243e89b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -47
app.py CHANGED
@@ -12,6 +12,7 @@ import numpy as np
12
  import pandas as pd
13
  from sentence_transformers import SentenceTransformer
14
  import torch.nn.functional as F
 
15
 
16
  # Load model once
17
  print("Loading model...")
@@ -46,32 +47,31 @@ def check_quality(en_text, kab_text):
46
  """Tab 1: Translation Quality Checker"""
47
  if not en_text.strip() or not kab_text.strip():
48
  return "Please enter both sentences", None
49
-
50
  emb = get_embeddings([en_text, kab_text])
51
  sim = F.cosine_similarity(emb[0].unsqueeze(0), emb[1].unsqueeze(0)).item()
52
-
53
  if sim > 0.85:
54
  quality = "Excellent match"
55
  elif sim > 0.6:
56
  quality = "Good match"
57
  else:
58
  quality = "Poor match"
59
-
60
- result = "Similarity: " + str(round(sim, 4)) + "
61
- Quality: " + quality
62
  return result, sim
63
 
64
  def search_similar(query, top_k=5):
65
  """Tab 2: Semantic Search - fast because embeddings are pre-computed"""
66
  if not query.strip():
67
  return "Please enter a query"
68
-
69
  query_emb = get_embeddings([query])
70
-
71
  # Search both English and Kabyle sides
72
  scores = F.cosine_similarity(query_emb, SEARCH_EMBEDDINGS).cpu().numpy()
73
  top_indices = np.argsort(scores)[::-1][:top_k]
74
-
75
  results = []
76
  seen = set()
77
  for idx in top_indices:
@@ -79,66 +79,61 @@ def search_similar(query, top_k=5):
79
  pair = SEARCH_PAIRS[idx]
80
  else:
81
  pair = SEARCH_PAIRS[idx - len(SEARCH_PAIRS)]
82
-
83
  key = pair[0] + " || " + pair[1]
84
  if key not in seen:
85
  seen.add(key)
86
- results.append(pair[1] + "
87
- (EN: " + pair[0] + ") -- Score: " + str(round(scores[idx], 4)))
88
-
89
- return "
90
-
91
- ".join(results) if results else "No results found"
92
 
93
  def validate_csv(file):
94
  """Tab 3: Parallel Data Validator"""
95
  if file is None:
96
  return None, "Please upload a CSV file with 'en' and 'kab' columns"
97
-
98
  df = pd.read_csv(file.name)
99
  if "en" not in df.columns or "kab" not in df.columns:
100
  return None, "CSV must have 'en' and 'kab' columns"
101
-
102
  scores = []
103
  for _, row in df.iterrows():
104
  emb = get_embeddings([str(row["en"]), str(row["kab"])])
105
  sim = F.cosine_similarity(emb[0].unsqueeze(0), emb[1].unsqueeze(0)).item()
106
  scores.append(sim)
107
-
108
  df["similarity"] = scores
109
  df["quality"] = df["similarity"].apply(
110
  lambda s: "good" if s > 0.6 else "poor"
111
  )
112
-
113
  # Save result
114
  output_path = "/tmp/validated_pairs.csv"
115
  df.to_csv(output_path, index=False)
116
-
117
- summary = "Processed " + str(len(df)) + " pairs
118
- "
119
- summary += "Good quality: " + str(len(df[df["quality"]=="good"])) + "
120
- "
121
  summary += "Poor quality: " + str(len(df[df["quality"]=="poor"]))
122
-
123
  return output_path, summary
124
 
125
  # Build UI with Soft theme
126
  with gr.Blocks(title="Kabyle Semantic Toolkit", theme=gr.themes.Soft()) as demo:
127
  gr.Markdown("""
128
  # Kabyle Semantic Toolkit
129
-
130
  Powered by [**boffire/kabyle-sentence-transformer-mpnet**](https://huggingface.co/boffire/kabyle-sentence-transformer-mpnet)
131
-
132
  This tool understands meaning, not just words. Use it to check translations,
133
  find similar sentences, or validate your parallel data.
134
  """)
135
-
136
  with gr.Tabs():
137
-
138
  # Tab 1: Quality Checker
139
  with gr.TabItem("Translation Quality"):
140
  gr.Markdown("Check if an English-Kabyle pair has similar meaning.")
141
-
142
  with gr.Row():
143
  with gr.Column(scale=2):
144
  en_input = gr.Textbox(
@@ -154,7 +149,7 @@ with gr.Blocks(title="Kabyle Semantic Toolkit", theme=gr.themes.Soft()) as demo:
154
  with gr.Row():
155
  clear_btn_1 = gr.Button("Clear", variant="secondary")
156
  check_btn = gr.Button("Check Quality", variant="primary")
157
-
158
  with gr.Column(scale=3):
159
  result_text = gr.Textbox(
160
  label="Result",
@@ -166,33 +161,33 @@ with gr.Blocks(title="Kabyle Semantic Toolkit", theme=gr.themes.Soft()) as demo:
166
  label="Similarity Score",
167
  interactive=False
168
  )
169
-
170
  check_btn.click(
171
  fn=check_quality,
172
  inputs=[en_input, kab_input],
173
  outputs=[result_text, score_bar]
174
  )
175
-
176
  gr.Examples(
177
  examples=[
178
  ["Hello!", "Azul!"],
179
  ["The computer works.", "Aselkim iteddu."],
180
- ["I love you.", "Hemmleɣ-k."],
181
  ["Hello!", "Aselkim iteddu."],
182
  ],
183
  inputs=[en_input, kab_input],
184
  label="Try these examples"
185
  )
186
-
187
  clear_btn_1.click(
188
  fn=lambda: ("", "", "", None),
189
  outputs=[en_input, kab_input, result_text, score_bar]
190
  )
191
-
192
  # Tab 2: Similar Search
193
  with gr.TabItem("Similar Sentences"):
194
  gr.Markdown("Find Kabyle sentences similar to your query. Search index is pre-loaded for instant results.")
195
-
196
  with gr.Row():
197
  with gr.Column(scale=2):
198
  query_input = gr.Textbox(
@@ -209,35 +204,35 @@ with gr.Blocks(title="Kabyle Semantic Toolkit", theme=gr.themes.Soft()) as demo:
209
  with gr.Row():
210
  clear_btn_2 = gr.Button("Clear", variant="secondary")
211
  search_btn = gr.Button("Search", variant="primary")
212
-
213
  with gr.Column(scale=3):
214
  search_output = gr.Textbox(
215
  label="Results",
216
  lines=10,
217
  interactive=False
218
  )
219
-
220
  search_btn.click(
221
  fn=search_similar,
222
  inputs=[query_input, top_k_slider],
223
  outputs=search_output
224
  )
225
-
226
  gr.Examples(
227
  examples=["How are you?", "Thank you", "Water is life"],
228
  inputs=query_input,
229
  label="Example queries"
230
  )
231
-
232
  clear_btn_2.click(
233
  fn=lambda: ("", 5, ""),
234
  outputs=[query_input, top_k_slider, search_output]
235
  )
236
-
237
  # Tab 3: Data Validator
238
  with gr.TabItem("Data Validator"):
239
  gr.Markdown("Upload a CSV with 'en' and 'kab' columns to validate alignment quality.")
240
-
241
  with gr.Row():
242
  with gr.Column(scale=2):
243
  file_input = gr.File(
@@ -245,7 +240,7 @@ with gr.Blocks(title="Kabyle Semantic Toolkit", theme=gr.themes.Soft()) as demo:
245
  file_types=[".csv"]
246
  )
247
  validate_btn = gr.Button("Validate", variant="primary")
248
-
249
  with gr.Column(scale=3):
250
  summary_output = gr.Textbox(
251
  label="Summary",
@@ -253,13 +248,13 @@ with gr.Blocks(title="Kabyle Semantic Toolkit", theme=gr.themes.Soft()) as demo:
253
  interactive=False
254
  )
255
  download_output = gr.File(label="Download Results")
256
-
257
  validate_btn.click(
258
  fn=validate_csv,
259
  inputs=file_input,
260
  outputs=[download_output, summary_output]
261
  )
262
-
263
  gr.Markdown("""
264
  ---
265
  **Related tools**:
@@ -268,4 +263,4 @@ with gr.Blocks(title="Kabyle Semantic Toolkit", theme=gr.themes.Soft()) as demo:
268
  """)
269
 
270
  if __name__ == "__main__":
271
- demo.launch()
 
12
  import pandas as pd
13
  from sentence_transformers import SentenceTransformer
14
  import torch.nn.functional as F
15
+ import os
16
 
17
  # Load model once
18
  print("Loading model...")
 
47
  """Tab 1: Translation Quality Checker"""
48
  if not en_text.strip() or not kab_text.strip():
49
  return "Please enter both sentences", None
50
+
51
  emb = get_embeddings([en_text, kab_text])
52
  sim = F.cosine_similarity(emb[0].unsqueeze(0), emb[1].unsqueeze(0)).item()
53
+
54
  if sim > 0.85:
55
  quality = "Excellent match"
56
  elif sim > 0.6:
57
  quality = "Good match"
58
  else:
59
  quality = "Poor match"
60
+
61
+ result = "Similarity: " + str(round(sim, 4)) + os.linesep + "Quality: " + quality
 
62
  return result, sim
63
 
64
  def search_similar(query, top_k=5):
65
  """Tab 2: Semantic Search - fast because embeddings are pre-computed"""
66
  if not query.strip():
67
  return "Please enter a query"
68
+
69
  query_emb = get_embeddings([query])
70
+
71
  # Search both English and Kabyle sides
72
  scores = F.cosine_similarity(query_emb, SEARCH_EMBEDDINGS).cpu().numpy()
73
  top_indices = np.argsort(scores)[::-1][:top_k]
74
+
75
  results = []
76
  seen = set()
77
  for idx in top_indices:
 
79
  pair = SEARCH_PAIRS[idx]
80
  else:
81
  pair = SEARCH_PAIRS[idx - len(SEARCH_PAIRS)]
82
+
83
  key = pair[0] + " || " + pair[1]
84
  if key not in seen:
85
  seen.add(key)
86
+ results.append(pair[1] + os.linesep + " (EN: " + pair[0] + ") -- Score: " + str(round(scores[idx], 4)))
87
+
88
+ return (os.linesep + os.linesep).join(results) if results else "No results found"
 
 
 
89
 
90
  def validate_csv(file):
91
  """Tab 3: Parallel Data Validator"""
92
  if file is None:
93
  return None, "Please upload a CSV file with 'en' and 'kab' columns"
94
+
95
  df = pd.read_csv(file.name)
96
  if "en" not in df.columns or "kab" not in df.columns:
97
  return None, "CSV must have 'en' and 'kab' columns"
98
+
99
  scores = []
100
  for _, row in df.iterrows():
101
  emb = get_embeddings([str(row["en"]), str(row["kab"])])
102
  sim = F.cosine_similarity(emb[0].unsqueeze(0), emb[1].unsqueeze(0)).item()
103
  scores.append(sim)
104
+
105
  df["similarity"] = scores
106
  df["quality"] = df["similarity"].apply(
107
  lambda s: "good" if s > 0.6 else "poor"
108
  )
109
+
110
  # Save result
111
  output_path = "/tmp/validated_pairs.csv"
112
  df.to_csv(output_path, index=False)
113
+
114
+ summary = "Processed " + str(len(df)) + " pairs" + os.linesep
115
+ summary += "Good quality: " + str(len(df[df["quality"]=="good"])) + os.linesep
 
 
116
  summary += "Poor quality: " + str(len(df[df["quality"]=="poor"]))
117
+
118
  return output_path, summary
119
 
120
  # Build UI with Soft theme
121
  with gr.Blocks(title="Kabyle Semantic Toolkit", theme=gr.themes.Soft()) as demo:
122
  gr.Markdown("""
123
  # Kabyle Semantic Toolkit
124
+
125
  Powered by [**boffire/kabyle-sentence-transformer-mpnet**](https://huggingface.co/boffire/kabyle-sentence-transformer-mpnet)
126
+
127
  This tool understands meaning, not just words. Use it to check translations,
128
  find similar sentences, or validate your parallel data.
129
  """)
130
+
131
  with gr.Tabs():
132
+
133
  # Tab 1: Quality Checker
134
  with gr.TabItem("Translation Quality"):
135
  gr.Markdown("Check if an English-Kabyle pair has similar meaning.")
136
+
137
  with gr.Row():
138
  with gr.Column(scale=2):
139
  en_input = gr.Textbox(
 
149
  with gr.Row():
150
  clear_btn_1 = gr.Button("Clear", variant="secondary")
151
  check_btn = gr.Button("Check Quality", variant="primary")
152
+
153
  with gr.Column(scale=3):
154
  result_text = gr.Textbox(
155
  label="Result",
 
161
  label="Similarity Score",
162
  interactive=False
163
  )
164
+
165
  check_btn.click(
166
  fn=check_quality,
167
  inputs=[en_input, kab_input],
168
  outputs=[result_text, score_bar]
169
  )
170
+
171
  gr.Examples(
172
  examples=[
173
  ["Hello!", "Azul!"],
174
  ["The computer works.", "Aselkim iteddu."],
175
+ ["I love you.", "Hemmleɣ-kent."],
176
  ["Hello!", "Aselkim iteddu."],
177
  ],
178
  inputs=[en_input, kab_input],
179
  label="Try these examples"
180
  )
181
+
182
  clear_btn_1.click(
183
  fn=lambda: ("", "", "", None),
184
  outputs=[en_input, kab_input, result_text, score_bar]
185
  )
186
+
187
  # Tab 2: Similar Search
188
  with gr.TabItem("Similar Sentences"):
189
  gr.Markdown("Find Kabyle sentences similar to your query. Search index is pre-loaded for instant results.")
190
+
191
  with gr.Row():
192
  with gr.Column(scale=2):
193
  query_input = gr.Textbox(
 
204
  with gr.Row():
205
  clear_btn_2 = gr.Button("Clear", variant="secondary")
206
  search_btn = gr.Button("Search", variant="primary")
207
+
208
  with gr.Column(scale=3):
209
  search_output = gr.Textbox(
210
  label="Results",
211
  lines=10,
212
  interactive=False
213
  )
214
+
215
  search_btn.click(
216
  fn=search_similar,
217
  inputs=[query_input, top_k_slider],
218
  outputs=search_output
219
  )
220
+
221
  gr.Examples(
222
  examples=["How are you?", "Thank you", "Water is life"],
223
  inputs=query_input,
224
  label="Example queries"
225
  )
226
+
227
  clear_btn_2.click(
228
  fn=lambda: ("", 5, ""),
229
  outputs=[query_input, top_k_slider, search_output]
230
  )
231
+
232
  # Tab 3: Data Validator
233
  with gr.TabItem("Data Validator"):
234
  gr.Markdown("Upload a CSV with 'en' and 'kab' columns to validate alignment quality.")
235
+
236
  with gr.Row():
237
  with gr.Column(scale=2):
238
  file_input = gr.File(
 
240
  file_types=[".csv"]
241
  )
242
  validate_btn = gr.Button("Validate", variant="primary")
243
+
244
  with gr.Column(scale=3):
245
  summary_output = gr.Textbox(
246
  label="Summary",
 
248
  interactive=False
249
  )
250
  download_output = gr.File(label="Download Results")
251
+
252
  validate_btn.click(
253
  fn=validate_csv,
254
  inputs=file_input,
255
  outputs=[download_output, summary_output]
256
  )
257
+
258
  gr.Markdown("""
259
  ---
260
  **Related tools**:
 
263
  """)
264
 
265
  if __name__ == "__main__":
266
+ demo.launch()