Phong1 commited on
Commit
c71607e
·
verified ·
1 Parent(s): f36baeb

Upload full_interface.py

Browse files
Files changed (1) hide show
  1. full_interface.py +809 -0
full_interface.py ADDED
@@ -0,0 +1,809 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import json
3
+ import os
4
+ import tempfile
5
+ import unicodedata
6
+ from io import BytesIO
7
+ from pathlib import Path
8
+
9
+ import gradio as gr
10
+ import pandas as pd
11
+ import requests
12
+ from openpyxl import load_workbook
13
+
14
+
15
+ API_BASE_URL = os.getenv("MULTIMODAL_API_BASE_URL", "http://127.0.0.1:7861")
16
+ EXTRACTION_API_URL = os.getenv(
17
+ "MULTIMODAL_API_URL",
18
+ f"{API_BASE_URL.rstrip('/')}/information_extraction/",
19
+ )
20
+ MAPPING_API_URL = os.getenv(
21
+ "MULTIMODAL_MAPPING_API_URL",
22
+ f"{API_BASE_URL.rstrip('/')}/mapping/",
23
+ )
24
+
25
+ RAW_DATA_DIR = Path("raw_data")
26
+ CATALOG_XLSX_PATH = RAW_DATA_DIR / "product_names_dms_10022026.xlsx"
27
+ CATALOG_JSON_PATH = RAW_DATA_DIR / "product_catalog_ui.json"
28
+
29
+ PRODUCT_COLUMN = "Tên sản phẩm"
30
+ ROW_ID_COLUMN = "__row_id__"
31
+ ORIGINAL_PRODUCT_COLUMN = "__original_product_name__"
32
+
33
+ CATALOG_CACHE = None
34
+ CUSTOM_CSS = """
35
+ /* Container Background */
36
+ .gradio-container {
37
+ background-color: #f8fafc !important;
38
+ font-family: 'Inter', -apple-system, sans-serif !important;
39
+ }
40
+
41
+ /* Global Font Size Increase */
42
+ div, p, label, span, input, table, .text-gray-500 {
43
+ font-size: 16px !important;
44
+ }
45
+
46
+ /* FORCE BRIGHTNESS ON FORM BLOCKS (Employee Code, Extraction Status, etc.) */
47
+ .gr-box, .gr-form, .gr-input, .gr-padded, .type-row, div[class*="form"], div[class*="block"] {
48
+ background-color: #ffffff !important;
49
+ border-color: #e2e8f0 !important;
50
+ color: #1e293b !important;
51
+ }
52
+
53
+ /* Fix for specific labels and text inside dark areas */
54
+ label span, .text-gray-500, p, .prose {
55
+ color: #334155 !important;
56
+ font-weight: 600 !important;
57
+ }
58
+
59
+ /* Headers - Larger and Emphasized */
60
+ h1 {
61
+ font-size: 2.5rem !important;
62
+ color: #0f172a !important;
63
+ border-bottom: 4px solid #2563eb;
64
+ padding-bottom: 12px;
65
+ }
66
+
67
+ h2 {
68
+ font-size: 1.8rem !important;
69
+ color: #1e293b !important;
70
+ border-left: 6px solid #2563eb;
71
+ padding-left: 15px;
72
+ margin-top: 25px !important;
73
+ }
74
+
75
+ /* --- TECHNOLOGY COLORS --- */
76
+
77
+ /* Primary Action (Extract/Matching) - Professional Blue */
78
+ button.primary {
79
+ background: #2563eb !important;
80
+ font-size: 18px !important;
81
+ font-weight: bold !important;
82
+ color: white !important;
83
+ border-radius: 8px !important;
84
+ border: none !important;
85
+ }
86
+
87
+ /* Success Green (Finish/Done/Apply) */
88
+ button:contains("Finish"), button:contains("Done"), button:contains("Apply") {
89
+ background: #16a34a !important;
90
+ color: white !important;
91
+ border: none !important;
92
+ font-weight: 700 !important;
93
+ }
94
+
95
+ /* Alert Red (Delete/Undo) */
96
+ button:contains("Delete"), button:contains("Undo") {
97
+ background: #fee2e2 !important;
98
+ border: 1px solid #dc2626 !important;
99
+ color: #dc2626 !important;
100
+ font-weight: 600 !important;
101
+ }
102
+
103
+ /* Textbox & Input Specific Fix */
104
+ input, textarea, .dropdown, select {
105
+ background-color: #ffffff !important;
106
+ color: #0f172a !important; /* Deep dark text */
107
+ border: 2px solid #cbd5e1 !important;
108
+ border-radius: 8px !important;
109
+ }
110
+
111
+ /* Fix for Status Boxes that stay dark */
112
+ div[data-testid="block-info"], .status-text {
113
+ background-color: #f1f5f9 !important;
114
+ color: #1e293b !important;
115
+ border: 1px solid #e2e8f0 !important;
116
+ }
117
+ """
118
+
119
+ # Force Light Mode explicitly in the theme
120
+ CUSTOM_THEME = gr.themes.Soft(
121
+ primary_hue="blue",
122
+ secondary_hue="slate",
123
+ ).set(
124
+ body_background_fill="*neutral_50",
125
+ block_background_fill="white",
126
+ block_label_text_color="*neutral_900"
127
+ )
128
+
129
def normalize_text(value: str) -> str:
    """Return a lowercase, accent-free, whitespace-collapsed form of *value*.

    Falsy input (``None``, ``""``, ``0``) yields an empty string. Accents are
    removed by NFKD-decomposing the text and discarding combining marks.
    """
    lowered = str(value or "").strip().lower()
    if lowered == "":
        return ""

    decomposed = unicodedata.normalize("NFKD", lowered)
    # NOTE(review): letters without a canonical decomposition (for example
    # the Vietnamese "đ") pass through unchanged -- confirm this is intended.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return " ".join("".join(base_chars).split())
136
+
137
+
138
def build_default_state() -> dict:
    """Create a fresh per-session state dictionary with nothing loaded."""
    state: dict = dict(
        df_ocr=None,
        df_mapped=None,
        mapping_cache={},
        selected_row_id=None,
        selected_product="",
        done_row_ids=[],
    )
    return state
147
+
148
+
149
def visible_dataframe(df: pd.DataFrame | None) -> pd.DataFrame:
    """Return *df* without the internal bookkeeping columns.

    ``None`` is mapped to an empty frame; helper columns that are absent
    are ignored rather than raising.
    """
    if df is not None:
        hidden_columns = [ROW_ID_COLUMN, ORIGINAL_PRODUCT_COLUMN]
        return df.drop(columns=hidden_columns, errors="ignore")
    return pd.DataFrame()
153
+
154
+
155
def prepare_working_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* augmented with the internal helper columns.

    A sequential row id is inserted as the first column and the current
    product names are snapshotted (NaN -> "") so they can be restored later.

    Helper columns already present in *df* are dropped first, so preparing an
    already-prepared frame no longer raises. A frame that lacks the product
    column gets an empty snapshot instead of failing on ``str.fillna``.
    """
    prepared = df.drop(
        columns=[ROW_ID_COLUMN, ORIGINAL_PRODUCT_COLUMN],
        errors="ignore",
    ).copy()
    prepared.insert(0, ROW_ID_COLUMN, list(range(len(prepared))))

    if PRODUCT_COLUMN in prepared.columns:
        prepared[ORIGINAL_PRODUCT_COLUMN] = prepared[PRODUCT_COLUMN].fillna("").astype(str)
    else:
        # DataFrame.get() would hand back the plain-string default here,
        # which has no .fillna(); fall back to an empty snapshot instead.
        prepared[ORIGINAL_PRODUCT_COLUMN] = ""
    return prepared
160
+
161
+
162
def save_dataframe_to_excel(df: pd.DataFrame, prefix: str) -> str:
    """Write the user-visible columns of *df* to ``<prefix>.xlsx``.

    The file is placed in a freshly created private directory under the
    system temp dir. Writing to a fixed path directly in the temp dir let
    concurrent sessions overwrite each other's export; a per-call directory
    keeps the download file name stable while making the path unique.

    Returns:
        The absolute path of the written workbook.
    """
    export_df = visible_dataframe(df)
    output_dir = tempfile.mkdtemp(prefix=f"{prefix}_")
    output_path = os.path.join(output_dir, f"{prefix}.xlsx")
    export_df.to_excel(output_path, index=False)
    return output_path
167
+
168
+
169
def decode_excel_base64(excel_base64: str) -> pd.DataFrame:
    """Decode a base64-encoded workbook and load it as a DataFrame."""
    workbook_buffer = BytesIO(base64.b64decode(excel_base64))
    return pd.read_excel(workbook_buffer)
172
+
173
+
174
def request_mapping_api(product_names: list[str]) -> dict:
    """Ask the mapping service for catalog candidates for *product_names*.

    The names are sent newline-joined in one form POST together with the
    fixed dense/sparse weights.

    Returns:
        The decoded JSON payload when the service reports success.

    Raises:
        gr.Error: if the service cannot be reached, answers with something
            other than a JSON object, returns a non-200 status, or reports a
            status other than ``"success"``.
    """
    try:
        response = requests.post(
            MAPPING_API_URL,
            data={
                "product_list": "\n".join(product_names),
                "dense_weight": 0.7,
                "sparse_weight": 0.3,
                "normalize": "true",
            },
            timeout=300,
        )
    except requests.RequestException as exc:
        # Connection failures and timeouts would otherwise surface as a raw
        # traceback instead of a readable UI error.
        raise gr.Error(f"Could not reach the mapping API: {exc}") from exc

    try:
        payload = response.json()
    except ValueError as exc:
        raise gr.Error("The mapping API returned an invalid response.") from exc

    if not isinstance(payload, dict):
        # Valid JSON that is not an object has no .get(); treat it as invalid.
        raise gr.Error("The mapping API returned an invalid response.")

    if response.status_code != 200 or payload.get("status") != "success":
        detail = payload.get("detail") or payload.get("message") or "Mapping request failed."
        # The detail field may be structured (list/dict); show it as text.
        raise gr.Error(str(detail))

    return payload
200
+
201
+
202
def extract_mapping_cache(payload: dict) -> dict[str, list[str]]:
    """Build an ``original name -> candidate names`` lookup from *payload*.

    Each entry of ``payload["results"]`` contributes its stripped, non-empty
    candidate product names, capped at the first five. Entries whose original
    name is blank are left out.
    """
    lookup: dict[str, list[str]] = {}
    for entry in payload.get("results", []):
        key = str(entry.get("original_product_name", "")).strip()

        names: list[str] = []
        for option in entry.get("top_candidates", []):
            cleaned = str(option.get("product", "")).strip()
            if cleaned:
                names.append(cleaned)

        if not key:
            continue
        lookup[key] = names[:5]
    return lookup
214
+
215
+
216
def ensure_catalog_json() -> Path:
    """Return the path of the JSON catalog, generating it from the workbook if needed.

    When the JSON file already exists it is returned untouched. Otherwise the
    first sheet of the catalog workbook is read: row 1 supplies the headers
    (matched after ``normalize_text``), and every later row that has a product
    name becomes a record with ``product_name``, ``tree`` and ``search_blob``.

    Raises:
        FileNotFoundError: if neither the JSON file nor the workbook exists.
    """
    if CATALOG_JSON_PATH.exists():
        return CATALOG_JSON_PATH

    if not CATALOG_XLSX_PATH.exists():
        raise FileNotFoundError(f"Missing catalog file: {CATALOG_XLSX_PATH}")

    catalog_records = []
    workbook = load_workbook(CATALOG_XLSX_PATH, read_only=True, data_only=True)
    try:
        sheet = workbook[workbook.sheetnames[0]]
        # An empty sheet yields no header row; treat that as "no columns".
        header_row = next(sheet.iter_rows(min_row=1, max_row=1, values_only=True), ())

        # Normalized header -> column position, first occurrence wins. Built
        # once instead of scanning the header list for every cell lookup.
        column_index: dict[str, int] = {}
        for position, cell in enumerate(header_row):
            column_index.setdefault(normalize_text(str(cell or "").strip()), position)

        def get_value(row_values: tuple, target_options: list[str]) -> str:
            """Return the first non-blank cell among the named columns, else ""."""
            for option in target_options:
                position = column_index.get(option)
                if position is None or position >= len(row_values):
                    continue
                value = row_values[position]
                if value is not None and str(value).strip():
                    return str(value).strip()
            return ""

        for row in sheet.iter_rows(min_row=2, values_only=True):
            dms_name = get_value(row, ["ten san pham dms"])
            normalized_name = get_value(row, ["ten san pham chuan hoa tu rangdong.com.vn"])
            product_name = normalized_name or dms_name
            if not product_name:
                continue

            tree_parts = [
                get_value(row, [column])
                for column in ("category 1", "category 2", "category 3", "l1", "l2")
            ]
            tree = " > ".join(part for part in tree_parts if part)

            catalog_records.append(
                {
                    "product_name": product_name,
                    "tree": tree,
                    "search_blob": normalize_text(f"{product_name} {tree}"),
                }
            )
    finally:
        # Read-only workbooks keep the underlying file open until closed.
        workbook.close()

    # Write to a sibling file and rename it into place so a concurrent reader
    # never sees a half-written JSON file behind the exists() check above.
    temp_path = CATALOG_JSON_PATH.with_name(CATALOG_JSON_PATH.name + ".tmp")
    temp_path.write_text(json.dumps(catalog_records, ensure_ascii=False), encoding="utf-8")
    temp_path.replace(CATALOG_JSON_PATH)
    return CATALOG_JSON_PATH
265
+
266
+
267
def load_catalog_records() -> list[dict]:
    """Return the catalog records, reading the JSON file only on first use.

    The parsed list is kept in the module-level ``CATALOG_CACHE`` and the same
    object is returned on every later call.
    """
    global CATALOG_CACHE

    if CATALOG_CACHE is None:
        raw_json = ensure_catalog_json().read_text(encoding="utf-8")
        CATALOG_CACHE = json.loads(raw_json)
    return CATALOG_CACHE
276
+
277
+
278
def build_catalog_choices(query: str, limit: int = 50) -> tuple[list[tuple[str, str]], str]:
    """Search the catalog for *query* and return dropdown choices plus a status line.

    Matching is done on the normalized query against each record's
    ``search_blob``. An exact match short-circuits with that single record;
    otherwise prefix matches are listed before substring matches and the
    combined list is cut to *limit* entries. Each choice is a
    ``(label, product_name)`` pair, the label carrying the tree when present.
    """
    needle = normalize_text(query)
    if not needle:
        return [], "Enter a search term to search the full catalog."

    def to_choice(record: dict) -> tuple[str, str]:
        name = record["product_name"]
        tree = record["tree"]
        label = f"{name} | {tree}" if tree else name
        return (label, name)

    prefix_hits: list[dict] = []
    substring_hits: list[dict] = []
    for record in load_catalog_records():
        blob = record["search_blob"]
        if blob == needle:
            return [to_choice(record)], "Found an exact catalog match."

        if blob.startswith(needle):
            prefix_hits.append(record)
        elif needle in blob:
            substring_hits.append(record)

    ranked = prefix_hits + substring_hits
    choices = [to_choice(record) for record in ranked[:limit]]

    if choices:
        return choices, f"Found {len(choices)} catalog matches."
    return choices, "No catalog matches found."
310
+
311
+
312
def get_row_index_by_id(df: pd.DataFrame, row_id: int) -> int:
    """Return the index label of the first row whose internal id equals *row_id*.

    Raises:
        gr.Error: if no row carries that id.
    """
    id_matches = df[ROW_ID_COLUMN] == row_id
    labels = df.index[id_matches].tolist()
    if labels:
        return labels[0]
    raise gr.Error("The selected row no longer exists.")
317
+
318
+
319
def require_dataframe(state: dict, key: str) -> pd.DataFrame:
    """Fetch ``state[key]``, failing with a UI error when it is missing or ``None``."""
    frame = state.get(key)
    if frame is not None:
        return frame
    raise gr.Error("No data is available for this action.")
324
+
325
+
326
def process_extraction(zip_file: str, employee_code: str, debug: bool, state: dict):
    """Send the uploaded ZIP to the extraction API and reset the workspace.

    Args:
        zip_file: Path of the uploaded ZIP archive.
        employee_code: Employee code forwarded to the API and used in the
            download file name.
        debug: Forwarded to the API as ``"true"``/``"false"``.
        state: Previous session state; not read, because a fresh state is
            built from the extraction result.

    Returns:
        A 12-tuple: the new state, the extraction status text, the visible OCR
        table, the path of the downloadable workbook, then reset values for
        the mapping status, mapped table, editor message, both dropdowns, the
        catalog status, the catalog query and the mapping download.

    Raises:
        gr.Error: on missing input, an unreachable or failing API, or a result
            that is not a readable workbook containing the product column.
    """
    if not zip_file:
        raise gr.Error("Please upload a ZIP file.")
    code = (employee_code or "").strip()
    if not code:
        raise gr.Error("Please enter an employee code.")

    try:
        with open(zip_file, "rb") as file_obj:
            response = requests.post(
                EXTRACTION_API_URL,
                files={
                    "file": (os.path.basename(zip_file), file_obj, "application/zip"),
                },
                data={
                    "employee_code": code,
                    "debug": str(debug).lower(),
                },
                timeout=300,
            )
    except (OSError, requests.RequestException) as exc:
        # Unreadable upload, refused connection, timeout, ...
        raise gr.Error(f"Could not send the ZIP file to the extraction API: {exc}") from exc

    try:
        payload = response.json()
    except ValueError as exc:
        raise gr.Error("The extraction API returned an invalid response.") from exc
    if not isinstance(payload, dict):
        raise gr.Error("The extraction API returned an invalid response.")

    if response.status_code != 200:
        detail = payload.get("detail") or payload.get("message") or "Extraction request failed."
        raise gr.Error(str(detail))

    excel_base64 = payload.get("excel_data_base64")
    if not excel_base64:
        raise gr.Error("The extraction API did not return an Excel file.")

    # Decode once and reuse the bytes for both parsing and the download file.
    try:
        excel_bytes = base64.b64decode(excel_base64)
        df_ocr = pd.read_excel(BytesIO(excel_bytes))
    except Exception as exc:
        raise gr.Error("The extraction API returned an unreadable Excel file.") from exc

    if PRODUCT_COLUMN not in df_ocr.columns:
        raise gr.Error(f'The extraction result does not contain the "{PRODUCT_COLUMN}" column.')

    # The employee code is user input: keep only characters that cannot alter
    # the path, and write into a per-call directory so that concurrent
    # sessions never share a file.
    safe_code = "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in code)
    download_dir = tempfile.mkdtemp(prefix="df_ocr_")
    extraction_download = os.path.join(download_dir, f"df_ocr_{safe_code}.xlsx")
    with open(extraction_download, "wb") as output_file:
        output_file.write(excel_bytes)

    new_state = build_default_state()
    new_state["df_ocr"] = prepare_working_dataframe(df_ocr)
    new_state["df_mapped"] = prepare_working_dataframe(df_ocr)

    status = (
        f"{payload.get('message', 'Extraction completed.')} "
        f"Duration: {payload.get('duration', 'N/A')}s."
    )

    return (
        new_state,
        status,
        visible_dataframe(new_state["df_ocr"]),
        extraction_download,
        "Extraction ready. Click Product Matching to prepare the mapping workspace.",
        pd.DataFrame(),
        "Click a product name in the mapped table to review candidates.",
        gr.update(choices=[], value=None),
        gr.update(choices=[], value=None),
        "",
        "",
        None,
    )
392
+
393
+
394
def run_product_matching(state: dict):
    """Fetch mapping suggestions for every distinct product name in the OCR table.

    Names are stripped, blanks are skipped and duplicates are dropped while
    keeping first-seen order. The suggestions are merged into the session's
    mapping cache.

    Returns:
        A 9-tuple: the state, the mapping status text, the visible mapped
        table, the editor prompt, two cleared dropdown updates, two empty
        strings and ``None`` for the download slot.

    Raises:
        gr.Error: if no OCR data is loaded or it contains no product names.
    """
    df_ocr = require_dataframe(state, "df_ocr")

    stripped_names = (
        raw.strip() for raw in df_ocr[PRODUCT_COLUMN].fillna("").astype(str)
    )
    # dict.fromkeys de-duplicates while preserving insertion order.
    product_names = list(dict.fromkeys(name for name in stripped_names if name))

    if len(product_names) == 0:
        raise gr.Error("No product names were found in the extracted file.")

    payload = request_mapping_api(product_names)
    state["mapping_cache"].update(extract_mapping_cache(payload))

    headline = payload.get("message", "Product matching completed.")
    status = (
        f"{headline} "
        f"Prepared suggestions for {len(product_names)} unique products. "
        "Click any cell in the Tên sản phẩm column to review or refine the mapping."
    )

    cleared_top5 = gr.update(choices=[], value=None)
    cleared_catalog = gr.update(choices=[], value=None)
    return (
        state,
        status,
        visible_dataframe(state["df_mapped"]),
        "Click a product name in the mapped table to review candidates.",
        cleared_top5,
        cleared_catalog,
        "",
        "",
        None,
    )
430
+
431
+
432
def handle_product_click(state: dict, evt: gr.SelectData):
    """Select the clicked mapped-table row and load its mapping suggestions.

    Clicks outside the product column only produce a hint. For a product
    cell, the row id and current name are stored in the state, and suggestions
    are taken from the cache or requested from the mapping API on a miss.

    Returns:
        A 4-tuple: the state, the editor message, an update for the top-5
        dropdown and a cleared update for the catalog results dropdown.

    Raises:
        gr.Error: if there is no mapped data, the click is not a single cell
            inside the table, or the product cell is empty.
    """
    df_mapped = require_dataframe(state, "df_mapped")
    if df_mapped.empty:
        raise gr.Error("The mapped table is empty.")
    if evt.index is None or len(evt.index) != 2:
        raise gr.Error("Please click a single cell in the mapped table.")

    row_position, col_position = evt.index
    visible_columns = list(visible_dataframe(df_mapped).columns)
    row_in_range = 0 <= row_position < len(df_mapped)
    col_in_range = 0 <= col_position < len(visible_columns)
    if not (row_in_range and col_in_range):
        # Guard against a stale click after the table shrank.
        raise gr.Error("Please click a single cell in the mapped table.")
    selected_column = visible_columns[col_position]

    if selected_column != PRODUCT_COLUMN:
        return (
            state,
            f'Click inside the "{PRODUCT_COLUMN}" column to open the mapping tools.',
            gr.update(choices=[], value=None),
            gr.update(choices=[], value=None),
        )

    selected_row = df_mapped.iloc[row_position]
    row_id = int(selected_row[ROW_ID_COLUMN])
    raw_value = selected_row[PRODUCT_COLUMN]
    # A missing cell is NaN; str(NaN) is "nan", which would slip past the
    # emptiness check and be sent to the mapping API as a product name.
    current_value = "" if pd.isna(raw_value) else str(raw_value).strip()
    if not current_value:
        raise gr.Error("The selected product cell is empty.")

    suggestions = state["mapping_cache"].get(current_value)
    if suggestions is None:
        payload = request_mapping_api([current_value])
        fresh_cache = extract_mapping_cache(payload)
        state["mapping_cache"].update(fresh_cache)
        suggestions = state["mapping_cache"].get(current_value, [])

    state["selected_row_id"] = row_id
    state["selected_product"] = current_value

    editor_message = f"Row {row_position + 1}: reviewing `{current_value}`."
    if suggestions:
        editor_message += " Choose one of the top 5 suggestions or search the full catalog."
    else:
        editor_message += " No top-5 suggestions returned, so use the full catalog search."

    return (
        state,
        editor_message,
        gr.update(choices=suggestions, value=None),
        gr.update(choices=[], value=None),
    )
478
+
479
+
480
def search_full_catalog(query: str):
    """Run a catalog search and return ``(status text, dropdown update)``."""
    found_choices, message = build_catalog_choices(query)
    dropdown_update = gr.update(choices=found_choices, value=None)
    return message, dropdown_update
483
+
484
+
485
def apply_product_choice(state: dict, top5_choice: str, catalog_choice: str):
    """Replace the selected row's product name with the chosen candidate.

    The catalog search selection takes precedence over the top-5 selection.
    The chosen name is registered in the mapping cache (with no suggestions
    if it was not cached before) and recorded as the currently selected
    product, keeping the state in step with the other handlers.

    Returns:
        A 7-tuple: the state, the visible mapped table, the status message,
        an update for the top-5 dropdown, a cleared catalog dropdown value
        and two empty strings.

    Raises:
        gr.Error: if there is no mapped data, no row is selected, the row no
            longer exists, or no replacement was chosen.
    """
    df_mapped = require_dataframe(state, "df_mapped")
    if df_mapped.empty:
        raise gr.Error("The mapped table is empty.")
    row_id = state.get("selected_row_id")
    if row_id is None:
        raise gr.Error("Click a product name in the mapped table before applying a change.")

    chosen_value = catalog_choice or top5_choice
    if not chosen_value:
        raise gr.Error("Select a replacement product before clicking Apply.")

    row_index = get_row_index_by_id(df_mapped, row_id)
    old_value = str(df_mapped.at[row_index, PRODUCT_COLUMN]).strip()
    df_mapped.at[row_index, PRODUCT_COLUMN] = chosen_value
    state["df_mapped"] = df_mapped
    # Previously left pointing at the replaced name after an apply.
    state["selected_product"] = chosen_value
    state["mapping_cache"].setdefault(chosen_value, [])

    status = f"Updated row {row_index + 1}: `{old_value}` -> `{chosen_value}`."
    return (
        state,
        visible_dataframe(df_mapped),
        status,
        gr.update(choices=state["mapping_cache"].get(chosen_value, []), value=None),
        gr.update(value=None),
        "",
        "",
    )
513
+
514
+
515
def undo_product_choice(state: dict):
    """Put the selected row's product name back to its originally extracted value.

    Returns:
        A 7-tuple: the state, the visible mapped table, the status message,
        the top-5 dropdown update for the original name, a cleared catalog
        dropdown value and two empty strings.

    Raises:
        gr.Error: if there is no mapped data, no row is selected, or the row
            no longer exists.
    """
    df_mapped = require_dataframe(state, "df_mapped")
    if df_mapped.empty:
        raise gr.Error("The mapped table is empty.")

    row_id = state.get("selected_row_id")
    if row_id is None:
        raise gr.Error("Click a product name in the mapped table before using Undo.")

    row_index = get_row_index_by_id(df_mapped, row_id)
    original_value = str(df_mapped.at[row_index, ORIGINAL_PRODUCT_COLUMN]).strip()
    previous_value = str(df_mapped.at[row_index, PRODUCT_COLUMN]).strip()

    df_mapped.at[row_index, PRODUCT_COLUMN] = original_value
    state["df_mapped"] = df_mapped
    state["selected_product"] = original_value

    if previous_value == original_value:
        status = f"Row {row_index + 1} is already using the original product name."
    else:
        status = f"Restored row {row_index + 1} to the original product `{original_value}`."

    cached_suggestions = state["mapping_cache"].get(original_value, [])
    top5_update = gr.update(choices=cached_suggestions, value=None)
    catalog_update = gr.update(value=None)
    return (
        state,
        visible_dataframe(df_mapped),
        status,
        top5_update,
        catalog_update,
        "",
        "",
    )
544
+
545
+
546
def delete_selected_row(state: dict):
    """Remove the selected row from the mapped table and clear the selection.

    The row id is also removed from the list of rows marked as done.

    Returns:
        A 7-tuple: the state, the visible mapped table, the status message,
        two cleared dropdown updates and two empty strings.

    Raises:
        gr.Error: if there is no mapped data, no row is selected, or the row
            no longer exists.
    """
    df_mapped = require_dataframe(state, "df_mapped")
    if df_mapped.empty:
        raise gr.Error("The mapped table is empty.")

    row_id = state.get("selected_row_id")
    if row_id is None:
        raise gr.Error("Click a product name in the mapped table before deleting a row.")

    row_index = get_row_index_by_id(df_mapped, row_id)
    deleted_product = str(df_mapped.at[row_index, PRODUCT_COLUMN]).strip()

    keep_mask = df_mapped[ROW_ID_COLUMN] != row_id
    remaining = df_mapped[keep_mask].reset_index(drop=True)

    state["df_mapped"] = remaining
    state["selected_row_id"] = None
    state["selected_product"] = ""
    state["done_row_ids"] = [
        done_id for done_id in state["done_row_ids"] if done_id != row_id
    ]

    status = f"Deleted row {row_index + 1} for product `{deleted_product}`."
    cleared_top5 = gr.update(choices=[], value=None)
    cleared_catalog = gr.update(choices=[], value=None)
    return (
        state,
        visible_dataframe(remaining),
        status,
        cleared_top5,
        cleared_catalog,
        "",
        "",
    )
572
+
573
+
574
def mark_row_done(state: dict):
    """Record the selected row as reviewed without changing the table.

    The row id is added to ``state["done_row_ids"]``, which is kept sorted
    and free of duplicates.

    Returns:
        A 6-tuple: the state, the status message, two cleared dropdown
        updates and two empty strings.

    Raises:
        gr.Error: if there is no mapped data, no row is selected, or the row
            no longer exists.
    """
    df_mapped = require_dataframe(state, "df_mapped")
    if df_mapped.empty:
        raise gr.Error("The mapped table is empty.")

    row_id = state.get("selected_row_id")
    if row_id is None:
        raise gr.Error("Click a product name in the mapped table before marking it done.")

    row_index = get_row_index_by_id(df_mapped, row_id)
    current_value = str(df_mapped.at[row_index, PRODUCT_COLUMN]).strip()
    state["done_row_ids"] = sorted({*state["done_row_ids"], row_id})

    status = (
        f"Marked row {row_index + 1} as done for now. "
        f"Current product: `{current_value}`. You can still come back and edit it later."
    )
    cleared_top5 = gr.update(choices=[], value=None)
    cleared_catalog = gr.update(choices=[], value=None)
    return (state, status, cleared_top5, cleared_catalog, "", "")
600
+
601
+
602
def finish_mapping(state: dict):
    """Export the mapped table to Excel and report how many rows were marked done.

    Returns:
        ``(status text, path of the exported workbook)``.

    Raises:
        gr.Error: if no mapped data is available.
    """
    df_mapped = require_dataframe(state, "df_mapped")
    download_path = save_dataframe_to_excel(df_mapped, "df_mapped_final")

    progress = f"{len(state['done_row_ids'])}/{len(df_mapped)}"
    status = "Generated the final mapped Excel file. " + f"Rows marked done: {progress}."
    return status, download_path
613
+
614
+
615
+ with gr.Blocks(title="Multimodal OCR Mapping UI") as demo:
616
+ session_state = gr.State(build_default_state())
617
+ gr.Markdown(
618
+ """
619
+ # Multimodal OCR and Product Mapping
620
+ Upload one ZIP file, run extraction, then refine product matching in the same workspace.
621
+ """
622
+ )
623
+
624
+ with gr.Row():
625
+ with gr.Column(scale=1):
626
+ gr.Markdown('<p class="step-title">Step 1. Information Extraction</p>')
627
+ gr.Markdown('<p class="step-note">Upload the ZIP file and enter the employee code.</p>')
628
+
629
+ zip_input = gr.File(label="ZIP File", file_types=[".zip"], type="filepath")
630
+ employee_code_input = gr.Textbox(label="Employee Code", placeholder="Example: NV001")
631
+ debug_checkbox = gr.Checkbox(label="Debug mode", value=False)
632
+ extract_button = gr.Button("Extract Information", variant="primary")
633
+
634
+ with gr.Column(scale=1):
635
+ extraction_status = gr.Textbox(label="Extraction Status", interactive=False)
636
+ extraction_download = gr.File(label="Download df_ocr", interactive=False)
637
+
638
+ df_ocr_table = gr.Dataframe(
639
+ label="df_ocr",
640
+ interactive=False,
641
+ wrap=False,
642
+ )
643
+
644
+ with gr.Group() as mapping_entry_group:
645
+ gr.Markdown('<p class="step-title">Step 2. Product Matching</p>')
646
+ gr.Markdown('<p class="step-note">Prepare the mapping workspace. The new table starts as a copy of df_ocr.</p>')
647
+ product_matching_button = gr.Button("Product Matching", variant="primary")
648
+ mapping_status = gr.Textbox(label="Mapping Status", interactive=False)
649
+
650
+ with gr.Group() as mapping_workspace_group:
651
+ mapped_table = gr.Dataframe(
652
+ label="df_mapped",
653
+ interactive=False,
654
+ wrap=False,
655
+ )
656
+
657
+ editor_status = gr.Markdown("Click a product name in the mapped table to review candidates.")
658
+
659
+ with gr.Row():
660
+ top5_dropdown = gr.Dropdown(
661
+ label="Top 5 similar products",
662
+ choices=[],
663
+ value=None,
664
+ allow_custom_value=False,
665
+ )
666
+ catalog_search_query = gr.Textbox(
667
+ label="Search in all products",
668
+ placeholder="Type a product keyword to search the full catalog",
669
+ )
670
+
671
+ with gr.Row():
672
+ search_catalog_button = gr.Button("Search in all products")
673
+ apply_button = gr.Button("Apply", variant="primary")
674
+ undo_button = gr.Button("Undo")
675
+ delete_button = gr.Button("Delete", variant="stop")
676
+ done_button = gr.Button("Done")
677
+
678
+ catalog_status = gr.Textbox(label="Catalog Search Status", interactive=False)
679
+ catalog_results_dropdown = gr.Dropdown(
680
+ label="Catalog search results",
681
+ choices=[],
682
+ value=None,
683
+ allow_custom_value=False,
684
+ )
685
+
686
+ with gr.Row():
687
+ finish_button = gr.Button("Finish Mapping", variant="primary")
688
+ mapping_download = gr.File(label="Download df_mapped", interactive=False)
689
+
690
+ extract_button.click(
691
+ fn=process_extraction,
692
+ inputs=[zip_input, employee_code_input, debug_checkbox, session_state],
693
+ outputs=[
694
+ session_state,
695
+ extraction_status,
696
+ df_ocr_table,
697
+ extraction_download,
698
+ mapping_status,
699
+ mapped_table,
700
+ editor_status,
701
+ top5_dropdown,
702
+ catalog_results_dropdown,
703
+ catalog_status,
704
+ catalog_search_query,
705
+ mapping_download,
706
+ ],
707
+ queue=False,
708
+ )
709
+
710
+ product_matching_button.click(
711
+ fn=run_product_matching,
712
+ inputs=[session_state],
713
+ outputs=[
714
+ session_state,
715
+ mapping_status,
716
+ mapped_table,
717
+ editor_status,
718
+ top5_dropdown,
719
+ catalog_results_dropdown,
720
+ catalog_status,
721
+ catalog_search_query,
722
+ mapping_download,
723
+ ],
724
+ queue=False,
725
+ )
726
+
727
+ mapped_table.select(
728
+ fn=handle_product_click,
729
+ inputs=[session_state],
730
+ outputs=[session_state, editor_status, top5_dropdown, catalog_results_dropdown],
731
+ queue=False,
732
+ )
733
+
734
+ search_catalog_button.click(
735
+ fn=search_full_catalog,
736
+ inputs=[catalog_search_query],
737
+ outputs=[catalog_status, catalog_results_dropdown],
738
+ queue=False,
739
+ )
740
+
741
+ apply_button.click(
742
+ fn=apply_product_choice,
743
+ inputs=[session_state, top5_dropdown, catalog_results_dropdown],
744
+ outputs=[
745
+ session_state,
746
+ mapped_table,
747
+ editor_status,
748
+ top5_dropdown,
749
+ catalog_results_dropdown,
750
+ catalog_status,
751
+ catalog_search_query,
752
+ ],
753
+ queue=False,
754
+ )
755
+
756
+ undo_button.click(
757
+ fn=undo_product_choice,
758
+ inputs=[session_state],
759
+ outputs=[
760
+ session_state,
761
+ mapped_table,
762
+ editor_status,
763
+ top5_dropdown,
764
+ catalog_results_dropdown,
765
+ catalog_status,
766
+ catalog_search_query,
767
+ ],
768
+ queue=False,
769
+ )
770
+
771
+ delete_button.click(
772
+ fn=delete_selected_row,
773
+ inputs=[session_state],
774
+ outputs=[
775
+ session_state,
776
+ mapped_table,
777
+ editor_status,
778
+ top5_dropdown,
779
+ catalog_results_dropdown,
780
+ catalog_status,
781
+ catalog_search_query,
782
+ ],
783
+ queue=False,
784
+ )
785
+
786
+ done_button.click(
787
+ fn=mark_row_done,
788
+ inputs=[session_state],
789
+ outputs=[
790
+ session_state,
791
+ editor_status,
792
+ top5_dropdown,
793
+ catalog_results_dropdown,
794
+ catalog_status,
795
+ catalog_search_query,
796
+ ],
797
+ queue=False,
798
+ )
799
+
800
+ finish_button.click(
801
+ fn=finish_mapping,
802
+ inputs=[session_state],
803
+ outputs=[mapping_status, mapping_download],
804
+ queue=False,
805
+ )
806
+
807
+
808
+ if __name__ == "__main__":
809
+ demo.launch(css=CUSTOM_CSS, theme=CUSTOM_THEME, inbrowser=True)