Abrar55 committed on
Commit
ea15156
·
1 Parent(s): 87ea93b
.DS_Store ADDED
Binary file (6.15 kB). View file
 
README.md DELETED
@@ -1,14 +0,0 @@
1
- ---
2
- title: CHEX
3
- emoji: 🐠
4
- colorFrom: indigo
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 6.14.0
8
- python_version: '3.13'
9
- app_file: app.py
10
- pinned: false
11
- short_description: 'CHEX is a fine-tuned Qwen3.5-9B model trained on AMD MI300X '
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
__pycache__/app.cpython-314.pyc ADDED
Binary file (85.4 kB). View file
 
app.py CHANGED
@@ -9,10 +9,14 @@ Tab 3: Analyse Bank Statement — paste / upload a bank statement, get a summary
9
 
10
  from __future__ import annotations
11
 
 
 
12
  import importlib.util
 
13
  import json
14
  import os
15
  import re
 
16
  from enum import Enum
17
  from pathlib import Path
18
  from typing import Optional
@@ -124,7 +128,7 @@ Question: Does this agreement restrict the Recipient from competing with the Dis
124
 
125
  BANK_SYSTEM_PROMPT = """\
126
  You are a financial analysis assistant specialising in bank statement review. \
127
- Given a bank statement (plain text, CSV-derived, or PDF-extracted) and either a \
128
  summary request or a specific question, produce a single JSON object.
129
 
130
  For SUMMARY mode (question is "SUMMARISE"):
@@ -200,165 +204,59 @@ def _parse_summary(raw_text: str) -> BankStatementSummary:
200
  # Model loading
201
  # ---------------------------------------------------------------------------
202
 
203
- MODEL_PATH = os.environ.get("HF_MODEL_REPO", "Abrar55/contractual-hallucination-eliminator")
204
- SAMPLE_DIR = Path(__file__).parent / "sample_contracts"
205
- STATEMENT_DIR = Path(__file__).parent / "sample_statements"
206
 
207
- _pipe = None
208
- _tokenizer = None
209
  model_load_error: Optional[str] = None
210
 
211
- try:
212
- import torch
213
- from peft import PeftModel
214
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
215
-
216
- # The repo contains a LoRA adapter — read base model from adapter_config
217
- from huggingface_hub import hf_hub_download
218
- import json as _json
219
- _adapter_cfg_path = hf_hub_download(MODEL_PATH, "adapter_config.json")
220
- _adapter_cfg = _json.loads(open(_adapter_cfg_path).read())
221
- BASE_MODEL_PATH = _adapter_cfg.get("base_model_name_or_path", MODEL_PATH)
222
- print(f"LoRA adapter detected. Base model: {BASE_MODEL_PATH}")
223
-
224
- print(f"Loading tokenizer from: {MODEL_PATH}")
225
- _tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
226
- if _tokenizer.pad_token is None:
227
- _tokenizer.pad_token = _tokenizer.eos_token
228
-
229
- print(f"Loading base model: {BASE_MODEL_PATH}")
230
- bnb_available = importlib.util.find_spec("bitsandbytes") is not None
231
- cuda_available = torch.cuda.is_available()
232
-
233
- if bnb_available and cuda_available:
234
- from transformers import BitsAndBytesConfig
235
- bnb_config = BitsAndBytesConfig(
236
- load_in_4bit=True,
237
- bnb_4bit_quant_type="nf4",
238
- bnb_4bit_compute_dtype=torch.bfloat16,
239
- bnb_4bit_use_double_quant=True,
240
- )
241
- _base = AutoModelForCausalLM.from_pretrained(
242
- BASE_MODEL_PATH,
243
- quantization_config=bnb_config,
244
- device_map="auto",
245
- trust_remote_code=True,
246
- )
247
- print(" Base loaded with 4-bit NF4 quantization")
248
- else:
249
- dtype = torch.float16 if cuda_available else torch.float32
250
- _base = AutoModelForCausalLM.from_pretrained(
251
- BASE_MODEL_PATH,
252
- torch_dtype=dtype,
253
- device_map="auto" if cuda_available else None,
254
- trust_remote_code=True,
255
- )
256
- print(f" Base loaded in {'fp16 (GPU)' if cuda_available else 'fp32 (CPU)'}")
257
-
258
- print(f"Applying LoRA adapter from: {MODEL_PATH}")
259
- _model = PeftModel.from_pretrained(_base, MODEL_PATH)
260
- _model.eval()
261
- print(" LoRA adapter applied")
262
-
263
- _pipe = pipeline(
264
- "text-generation",
265
- model=_model,
266
- tokenizer=_tokenizer,
267
- max_new_tokens=512,
268
- do_sample=False,
269
- return_full_text=False,
270
- pad_token_id=_tokenizer.eos_token_id,
271
- )
272
- print(f"Model loaded successfully: {MODEL_PATH}")
273
-
274
- except Exception as e:
275
- model_load_error = str(e)
276
- print(f"WARNING: Model failed to load: {e}")
277
- print("Demo is running in preview mode — analysis will return a placeholder response.")
278
 
279
 
280
  # ---------------------------------------------------------------------------
281
  # Inference helpers
282
  # ---------------------------------------------------------------------------
283
 
284
- MAX_TOKENS = 8192
285
 
286
 
287
  def _truncate(text: str) -> str:
288
- if _tokenizer is None:
289
- return text
290
- tokens = _tokenizer.encode(text, add_special_tokens=False)
291
- if len(tokens) > MAX_TOKENS:
292
- print(f"WARNING: Text truncated from {len(tokens)} to {MAX_TOKENS} tokens.")
293
- tokens = tokens[:MAX_TOKENS]
294
- return _tokenizer.decode(tokens, skip_special_tokens=True)
295
  return text
296
 
297
 
298
- def _apply_template(messages: list[dict], strict: bool = False) -> str:
299
  if strict:
300
  messages = list(messages)
301
  messages[-1] = dict(messages[-1])
302
  messages[-1]["content"] += STRICT_SUFFIX
303
- if _tokenizer is not None:
304
- try:
305
- return _tokenizer.apply_chat_template(
306
- messages, tokenize=False, add_generation_prompt=True
307
- )
308
- except Exception:
309
- pass
310
- # Fallback: plain text
311
- parts = []
312
- for m in messages:
313
- parts.append(f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>")
314
- parts.append("<|im_start|>assistant\n")
315
- return "\n".join(parts)
316
 
317
 
318
- def _run_pipe(prompt: str) -> str:
319
- result = _pipe(prompt)
320
- return result[0]["generated_text"]
321
-
322
-
323
- # ---------------------------------------------------------------------------
324
- # MLX Remote Server (Mac Mini via ngrok) - takes priority when MLX_SERVER_URL is set
325
- # ---------------------------------------------------------------------------
326
- _MLX_SERVER_URL = os.environ.get("MLX_SERVER_URL", "").rstrip("/")
327
- _mlx_available = False
328
-
329
- if _MLX_SERVER_URL:
330
- try:
331
- import urllib.request as _ur
332
- _ur.urlopen(_MLX_SERVER_URL + "/v1/models", timeout=5)
333
- _mlx_available = True
334
- print("MLX remote server ready: " + _MLX_SERVER_URL)
335
- except Exception as _e:
336
- print("MLX server unreachable (" + str(_e) + "), falling back to local model.")
337
-
338
-
339
- def _run_via_mlx(messages, strict=False):
340
- import urllib.request as _ur, json as _j
341
- msgs = list(messages)
342
- if strict:
343
- msgs[-1] = dict(msgs[-1])
344
- msgs[-1]["content"] += STRICT_SUFFIX
345
- payload = _j.dumps({
346
- "model": "mlx-community/Qwen3.5-9B-MLX-4bit",
347
- "messages": msgs,
348
  "max_tokens": 512,
349
  "temperature": 0.0,
350
  }).encode()
351
- req = _ur.Request(
352
- _MLX_SERVER_URL + "/v1/chat/completions",
353
  data=payload,
354
  headers={"Content-Type": "application/json"},
 
355
  )
356
- with _ur.urlopen(req, timeout=120) as resp:
357
- data = _j.loads(resp.read())
358
  return data["choices"][0]["message"]["content"]
359
 
360
 
361
-
362
  # ---------------------------------------------------------------------------
363
  # Sample contract content
364
  # ---------------------------------------------------------------------------
@@ -428,25 +326,21 @@ def analyze_contract(contract_text: str, question: str) -> tuple[str, str, str,
428
  return format_label_html("N/A"), "", "", "Please paste a contract above."
429
  if not question.strip():
430
  return format_label_html("N/A"), "", "", "Please enter a question."
431
- if _pipe is None and not _mlx_available:
432
  return (
433
  format_label_html("N/A"),
434
  "Model not loaded",
435
  "",
436
- f"Model failed to load: {model_load_error}. "
437
- "Set HF_MODEL_REPO in Space secrets to the correct model repo.",
438
  )
439
 
440
  contract_text = _truncate(contract_text)
441
  messages = _build_contract_messages(contract_text, question)
442
 
443
  for attempt in range(2):
 
444
  try:
445
- if _mlx_available:
446
- raw = _run_via_mlx(messages, strict=(attempt == 1))
447
- else:
448
- prompt = _apply_template(messages, strict=(attempt == 1))
449
- raw = _run_pipe(prompt)
450
  result = _parse_model_output(raw, question)
451
  label_html = format_label_html(result.label.value)
452
  answer = result.answer or "(none — clause is absent or not applicable)"
@@ -466,90 +360,581 @@ def analyze_contract(contract_text: str, question: str) -> tuple[str, str, str,
466
  )
467
 
468
 
469
- def _get_statement_text(paste_text: str, pdf_file, csv_file) -> tuple[str, str]:
470
- if pdf_file is not None:
471
- if _pipe is None:
472
- return "", "Model not loaded — PDF extraction unavailable."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
  try:
474
  if importlib.util.find_spec("pdfplumber") is None:
475
- return "", "pdfplumber not installed."
476
- import pdfplumber
477
- text_parts = []
478
- with pdfplumber.open(str(pdf_file)) as pdf:
479
- for page in pdf.pages:
480
- t = page.extract_text()
481
- if t:
482
- text_parts.append(t)
483
- text = "\n".join(text_parts)
484
- if not text.strip():
485
- return "", "PDF was uploaded but no text could be extracted."
486
- return text, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487
  except Exception as e:
488
- return "", f"PDF extraction error: {e}"
489
 
490
- if csv_file is not None:
491
- if _pipe is None:
492
- return "", "Model not loaded — CSV parsing unavailable."
493
  try:
494
  import pandas as pd
495
- df = pd.read_csv(str(csv_file))
496
- df.columns = [c.strip().lower() for c in df.columns]
497
- lines = []
498
- for _, row in df.iterrows():
499
- parts = [str(v).strip() for v in row.values if str(v).strip() not in ("", "nan")]
500
- lines.append(", ".join(parts))
501
- return ", ".join(df.columns.tolist()) + "\n" + "\n".join(lines), ""
502
- except Exception as e:
503
- return "", f"CSV parsing error: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
504
 
505
- if paste_text and paste_text.strip():
506
- return paste_text.strip(), ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
 
508
- return "", "Please paste a bank statement or upload a PDF / CSV file."
 
 
 
509
 
 
 
 
 
 
510
 
511
- def analyse_bank_statement(paste_text: str, pdf_file, csv_file) -> tuple[str, str]:
512
- statement_text, error = _get_statement_text(paste_text, pdf_file, csv_file)
513
- if error:
514
- return f"**Error:** {error}", ""
515
- if _pipe is None and not _mlx_available:
516
  return (
517
- f"**Model not loaded.** Set `HF_MODEL_REPO` in Space secrets. Error: {model_load_error}",
518
- statement_text,
 
519
  )
520
 
521
- statement_text = _truncate(statement_text)
522
- messages = _build_bank_messages(statement_text, "SUMMARISE")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
 
524
- for attempt in range(2):
525
- try:
526
- if _mlx_available:
527
- raw = _run_via_mlx(messages, strict=(attempt == 1))
528
- else:
529
- prompt = _apply_template(messages, strict=(attempt == 1))
530
- raw = _run_pipe(prompt)
531
- summary = _parse_summary(raw)
532
- lines = ["## Statement Summary", ""]
533
- lines.append(f"**Total Credits:** {summary.total_credits or 'N/A'}")
534
- lines.append(f"**Total Debits:** {summary.total_debits or 'N/A'}")
535
- lines.append(f"**Largest Transaction:** {summary.largest_transaction or 'N/A'}")
536
- if summary.recurring_payments:
537
- lines.append("\n**Recurring Payments:**")
538
- for p in summary.recurring_payments:
539
- lines.append(f"- {p}")
540
- if summary.flags:
541
- lines.append("\n**Flags / Unusual Activity:**")
542
- for f in summary.flags:
543
- lines.append(f"- {f}")
544
- lines.append(f"\n*{summary.raw_reasoning}*")
545
- return "\n".join(lines), statement_text
546
- except Exception as e:
547
- if attempt == 0:
548
- print(f" Summary parse attempt 1 failed ({e}). Retrying...")
549
- else:
550
- print(f" Summary parse attempt 2 failed ({e}). Returning error.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
551
 
552
- return "**Summarisation error:** could not parse model output.", statement_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
553
 
554
 
555
  def bank_qa(statement_text: str, question: str) -> tuple[str, str, str, str]:
@@ -560,22 +945,19 @@ def bank_qa(statement_text: str, question: str) -> tuple[str, str, str, str]:
560
  )
561
  if not question.strip():
562
  return format_label_html("N/A"), "", "", "Please enter a question."
563
- if _pipe is None and not _mlx_available:
564
  return (
565
- format_label_html("N/A"), "Model not loaded", "",
566
- f"Model failed to load: {model_load_error}.",
567
  )
568
 
569
  statement_text = _truncate(statement_text)
570
  messages = _build_bank_messages(statement_text, question)
571
 
572
  for attempt in range(2):
 
573
  try:
574
- if _mlx_available:
575
- raw = _run_via_mlx(messages, strict=(attempt == 1))
576
- else:
577
- prompt = _apply_template(messages, strict=(attempt == 1))
578
- raw = _run_pipe(prompt)
579
  result = _parse_model_output(raw, question)
580
  label_html = format_label_html(result.label.value)
581
  answer = result.answer or "(none — information not found in statement)"
@@ -665,36 +1047,34 @@ CHEX_CSS = """
665
  *, *::before, *::after { box-sizing: border-box; }
666
 
667
  :root {
668
- --bg-base: #f3f4f7;
669
- --bg-grad: radial-gradient(ellipse 1200px 700px at 18% -10%, rgba(120,150,200,0.18), transparent 60%),
670
- radial-gradient(ellipse 900px 600px at 95% 110%, rgba(180,160,220,0.14), transparent 55%),
671
- linear-gradient(180deg, #f5f6f9 0%, #eef0f4 100%);
672
- --bg-elev: rgba(255,255,255,0.62);
673
- --bg-elev-strong: rgba(255,255,255,0.78);
674
- --bg-sunken: rgba(245,246,249,0.55);
675
- --bg-input: rgba(255,255,255,0.55);
676
- --border: rgba(15,18,30,0.08);
677
- --border-strong: rgba(15,18,30,0.14);
678
- --hairline: rgba(15,18,30,0.06);
679
- --fg: #0d1220;
680
- --fg-muted: #5b6275;
681
- --fg-subtle: #8a91a3;
682
- --green: #0f9d58;
683
- --green-bg: rgba(34,197,94,0.10);
684
- --green-border: rgba(34,197,94,0.28);
685
- --red: #d23131;
686
- --red-bg: rgba(239,68,68,0.09);
687
- --red-border: rgba(239,68,68,0.28);
688
- --amber: #b87800;
689
  --amber-bg: rgba(245,158,11,0.10);
690
- --amber-border: rgba(245,158,11,0.30);
691
- --blur: 22px;
692
  --blur-strong: 32px;
693
- --shadow-md: 0 1px 0 rgba(255,255,255,0.6) inset,
694
- 0 8px 24px rgba(15,18,30,0.06),
695
- 0 1px 2px rgba(15,18,30,0.04);
696
  --radius: 10px;
697
- --radius-lg: 16px;
698
  }
699
 
700
  body {
@@ -797,19 +1177,19 @@ label.block, .label-wrap {
797
  position: sticky;
798
  top: 0;
799
  z-index: 100;
800
- background: var(--bg-elev);
801
  backdrop-filter: blur(var(--blur-strong)) saturate(160%);
802
  -webkit-backdrop-filter: blur(var(--blur-strong)) saturate(160%);
803
  border-bottom: 1px solid var(--hairline);
804
  }
805
 
806
  .chex-logo {
807
- width: 26px; height: 26px; border-radius: 8px;
808
- background: linear-gradient(135deg, #0d1220, rgba(13,18,32,0.7));
809
- color: #f3f4f7; display: grid; place-items: center;
810
  font-family: 'JetBrains Mono', monospace; font-weight: 700; font-size: 11px;
811
  letter-spacing: -0.05em;
812
- box-shadow: 0 4px 14px rgba(15,18,30,0.18), 0 1px 0 rgba(255,255,255,0.25) inset;
813
  flex-shrink: 0;
814
  }
815
 
@@ -935,8 +1315,8 @@ textarea, input[type="text"], input[type="search"],
935
  textarea:focus, input[type="text"]:focus,
936
  .gradio-container [data-testid="textbox"] textarea:focus,
937
  .gradio-container [data-testid="textbox"] input:focus {
938
- border-color: var(--border-strong) !important; background: var(--bg-elev-strong) !important;
939
- box-shadow: 0 0 0 4px rgba(13,18,32,0.08) !important; outline: none !important;
940
  }
941
 
942
  textarea::placeholder, input::placeholder { color: var(--fg-subtle) !important; }
@@ -957,16 +1337,15 @@ textarea[readonly],
957
 
958
  .gradio-container button.primary, button.primary {
959
  background: var(--fg) !important; color: var(--bg-base) !important; border: 1px solid var(--fg) !important;
960
- box-shadow: 0 6px 18px rgba(13,18,32,0.28), 0 1px 0 rgba(255,255,255,0.1) inset !important;
961
  }
962
- .gradio-container button.primary:hover, button.primary:hover { opacity: 0.88 !important; box-shadow: 0 4px 12px rgba(13,18,32,0.22) !important; }
963
 
964
  .gradio-container button.secondary, button.secondary {
965
- background: var(--bg-elev) !important; backdrop-filter: blur(10px) !important;
966
- -webkit-backdrop-filter: blur(10px) !important; color: var(--fg) !important;
967
- border: 1px solid var(--border) !important; box-shadow: var(--shadow-md) !important;
968
  }
969
- .gradio-container button.secondary:hover, button.secondary:hover { background: var(--bg-elev-strong) !important; border-color: var(--border-strong) !important; }
970
 
971
  button.sm, .gradio-container button[size="sm"], button.small { font-size: 12px !important; padding: 7px 11px !important; }
972
 
@@ -1147,7 +1526,7 @@ STATEMENT_SOURCE_HEADER_HTML = """
1147
  <svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="opacity:0.55"><rect x="2" y="5" width="20" height="14" rx="2"/><line x1="2" y1="10" x2="22" y2="10"/></svg>
1148
  Bank Statement
1149
  </span>
1150
- <span class="chex-card-kicker">paste · pdf · csv</span>
1151
  </div>
1152
  """
1153
 
@@ -1221,22 +1600,64 @@ with gr.Blocks(title="CHEX — Document Intelligence") as demo:
1221
  with gr.Tabs():
1222
  with gr.Tab("Paste text"):
1223
  bank_paste_input = gr.Textbox(
1224
- label="Bank statement text",
1225
  lines=20,
1226
- placeholder="Paste your bank statement here, or load the sample below…",
 
 
 
 
 
1227
  show_label=False,
1228
  )
1229
  btn_load_statement = gr.Button("Load sample statement", variant="secondary", size="sm")
1230
  with gr.Tab("Upload PDF"):
1231
- bank_pdf_input = gr.File(label="PDF bank statement", file_types=[".pdf"])
 
 
 
 
 
 
 
 
 
 
1232
  with gr.Tab("Upload CSV"):
1233
- bank_csv_input = gr.File(label="CSV bank statement", file_types=[".csv"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1234
 
1235
  with gr.Column(scale=11):
1236
  with gr.Group():
1237
  gr.HTML(STATEMENT_RESULTS_HEADER_HTML)
1238
  analyse_stmt_btn = gr.Button("Analyse statement", variant="primary")
1239
  summary_output = gr.Markdown(value="*Run 'Analyse statement' to generate a financial summary.*")
 
 
 
 
 
1240
  gr.HTML('<div class="chex-divider"></div>')
1241
  gr.HTML('<span class="chex-section-kicker">Ask a question</span>')
1242
  with gr.Row():
@@ -1254,6 +1675,11 @@ with gr.Blocks(title="CHEX — Document Intelligence") as demo:
1254
  bank_reasoning_output = gr.Textbox(label="Reasoning", interactive=False, lines=3)
1255
 
1256
  bank_statement_state = gr.State("")
 
 
 
 
 
1257
 
1258
  # ── Tab 03: Benchmark ──────────────────────────────────────────── #
1259
  with gr.Tab("03 Benchmark"):
@@ -1290,19 +1716,40 @@ with gr.Blocks(title="CHEX — Document Intelligence") as demo:
1290
  fn=analyze_contract,
1291
  inputs=[contract_input, question_input],
1292
  outputs=[label_display, answer_output, citation_output, reasoning_output],
 
1293
  )
1294
  question_input.submit(
1295
  fn=analyze_contract,
1296
  inputs=[contract_input, question_input],
1297
  outputs=[label_display, answer_output, citation_output, reasoning_output],
 
1298
  )
1299
 
1300
  btn_load_statement.click(fn=lambda: SAMPLE_STATEMENT, inputs=[], outputs=[bank_paste_input])
1301
 
1302
  analyse_stmt_btn.click(
1303
  fn=analyse_bank_statement,
1304
- inputs=[bank_paste_input, bank_pdf_input, bank_csv_input],
1305
- outputs=[summary_output, bank_statement_state],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1306
  )
1307
 
1308
  bank_ask_btn.click(
@@ -1316,6 +1763,59 @@ with gr.Blocks(title="CHEX — Document Intelligence") as demo:
1316
  outputs=[bank_label_display, bank_answer_output, bank_citation_output, bank_reasoning_output],
1317
  )
1318
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1319
 
1320
  if __name__ == "__main__":
1321
  demo.launch(show_error=True, theme=gr.themes.Base(), css=CHEX_CSS, ssr_mode=False)
 
9
 
10
  from __future__ import annotations
11
 
12
+ import csv
13
+ import datetime as _dt
14
  import importlib.util
15
+ import io
16
  import json
17
  import os
18
  import re
19
+ import tempfile
20
  from enum import Enum
21
  from pathlib import Path
22
  from typing import Optional
 
128
 
129
  BANK_SYSTEM_PROMPT = """\
130
  You are a financial analysis assistant specialising in bank statement review. \
131
+ Given a bank statement (plain text, CSV/Excel-derived, OFX/QFX-derived, or PDF-extracted) and either a \
132
  summary request or a specific question, produce a single JSON object.
133
 
134
  For SUMMARY mode (question is "SUMMARISE"):
 
204
  # Model loading
205
  # ---------------------------------------------------------------------------
206
 
207
# Base URL of the remote MLX inference server, read from the environment with
# trailing slashes removed. An empty string means no server is configured.
MLX_SERVER_URL = os.getenv("MLX_SERVER_URL", "").rstrip("/")

# Paths to the `sample_contracts` / `sample_statements` directories that sit
# next to this module.
SAMPLE_DIR = Path(__file__).parent / "sample_contracts"
STATEMENT_DIR = Path(__file__).parent / "sample_statements"

# Remains None when a server URL is configured; otherwise it holds the message
# explaining why inference is unavailable.
model_load_error: Optional[str] = None

if MLX_SERVER_URL:
    print(f"MLX server configured at: {MLX_SERVER_URL}")
else:
    model_load_error = "MLX_SERVER_URL not set. Set it in Space secrets to your Mac's ngrok URL."
    print(f"WARNING: {model_load_error}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
 
220
  # ---------------------------------------------------------------------------
221
  # Inference helpers
222
  # ---------------------------------------------------------------------------
223
 
224
+ MAX_CHARS = 32000 # rough character limit (~8k tokens) to keep requests fast
225
 
226
 
227
  def _truncate(text: str) -> str:
228
+ if len(text) > MAX_CHARS:
229
+ print(f"WARNING: Text truncated from {len(text)} to {MAX_CHARS} chars.")
230
+ return text[:MAX_CHARS]
 
 
 
 
231
  return text
232
 
233
 
234
+ def _apply_messages(messages: list[dict], strict: bool = False) -> list[dict]:
235
  if strict:
236
  messages = list(messages)
237
  messages[-1] = dict(messages[-1])
238
  messages[-1]["content"] += STRICT_SUFFIX
239
+ return messages
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
 
242
def _run_inference(messages: list[dict]) -> str:
    """POST ``messages`` to the MLX server's chat-completions endpoint.

    The JSON request body carries the messages together with ``max_tokens``
    512 and ``temperature`` 0.0. The JSON reply is decoded and the ``content``
    of the first choice's message is returned. Network, HTTP, JSON and lookup
    errors are not handled here and propagate to the caller.
    """
    import urllib.request

    endpoint = f"{MLX_SERVER_URL}/v1/chat/completions"
    request_body = {
        "messages": messages,
        "max_tokens": 512,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        endpoint,
        data=json.dumps(request_body).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    # Wait at most 120 seconds for the server to answer.
    with urllib.request.urlopen(request, timeout=120) as response:
        reply = json.loads(response.read())
    return reply["choices"][0]["message"]["content"]
258
 
259
 
 
260
  # ---------------------------------------------------------------------------
261
  # Sample contract content
262
  # ---------------------------------------------------------------------------
 
326
  return format_label_html("N/A"), "", "", "Please paste a contract above."
327
  if not question.strip():
328
  return format_label_html("N/A"), "", "", "Please enter a question."
329
+ if not MLX_SERVER_URL:
330
  return (
331
  format_label_html("N/A"),
332
  "Model not loaded",
333
  "",
334
+ f"Model failed to load: {model_load_error}.",
 
335
  )
336
 
337
  contract_text = _truncate(contract_text)
338
  messages = _build_contract_messages(contract_text, question)
339
 
340
  for attempt in range(2):
341
+ msgs = _apply_messages(messages, strict=(attempt == 1))
342
  try:
343
+ raw = _run_inference(msgs)
 
 
 
 
344
  result = _parse_model_output(raw, question)
345
  label_html = format_label_html(result.label.value)
346
  answer = result.answer or "(none — clause is absent or not applicable)"
 
360
  )
361
 
362
 
363
def _get_statement_text(
    paste_text: str,
    pdf_file,
    pdf_password: str | None,
    csv_file,
    txt_file,
    xlsx_file,
    ofx_file,
) -> tuple[str, str]:
    """Single-statement wrapper around ``_get_statement_texts``.

    Every argument is forwarded unchanged. The result is ``(text, error)``:
    the first extracted statement with an empty error string, or an empty
    text with the first reported error (a generic prompt when no error was
    reported).
    """
    # Backwards-compatible shim: treat "single statement" inputs as one item.
    texts, errors = _get_statement_texts(
        paste_text,
        pdf_file,
        pdf_password,
        csv_file,
        txt_file,
        xlsx_file,
        ofx_file,
    )
    if texts:
        return texts[0], ""
    if errors:
        return "", errors[0]
    return "", "Please paste a bank statement or upload a PDF / CSV / TXT / XLSX / OFX/QFX file."
390
+
391
+
392
+ def _ensure_file_list(files) -> list:
393
+ if files is None:
394
+ return []
395
+ if isinstance(files, (list, tuple)):
396
+ return [f for f in files if f is not None]
397
+ return [files]
398
+
399
+
400
+ def _split_statements(paste_text: str) -> list[str]:
401
+ """
402
+ Split pasted content into multiple statements.
403
+
404
+ Delimiter: a line containing only `---` (3+ dashes), optionally surrounded by whitespace.
405
+ """
406
+ text = (paste_text or "").strip()
407
+ if not text:
408
+ return []
409
+ parts = re.split(r"(?m)^[ \t]*-{3,}[ \t]*$", text)
410
+ cleaned = [p.strip() for p in parts if p.strip()]
411
+ return cleaned if cleaned else [text]
412
+
413
+
414
def _looks_like_password_error(exc: Exception) -> bool:
    """Report whether ``exc`` seems to describe an encrypted or wrongly unlocked PDF."""
    # The exception's type name and repr are searched as well as its message,
    # because the message alone can be empty.
    # NOTE(review): pdfminer.six appears to raise PDFPasswordIncorrect without
    # a message — confirm against the installed version.
    description = f"{type(exc).__name__} {exc!r} {exc}".lower()
    return any(word in description for word in ("password", "encrypted", "decrypt"))


def _extract_pdf_text(open_pdf, path: str, password: str) -> str:
    """Return the stripped, newline-joined text of all pages of one PDF.

    ``open_pdf`` is the ``pdfplumber.open`` callable. Pages whose extracted
    text is empty are skipped.
    """

    def _page_texts(**open_kwargs) -> list[str]:
        with open_pdf(path, **open_kwargs) as pdf:
            return [t for t in (page.extract_text() for page in pdf.pages) if t]

    try:
        return "\n".join(_page_texts(password=password)).strip()
    except TypeError:
        # Older pdfplumber versions may not accept `password=...`. The retry
        # builds a fresh page list, so pages read before the failure are not
        # repeated in the output.
        return "\n".join(_page_texts()).strip()


def _import_pandas(kind: str, errors: list[str]):
    """Return the pandas module, or ``None`` after recording why it is unavailable.

    ``kind`` is the label used in the error message ("CSV" or "Excel").
    """
    try:
        import pandas as pd
    except Exception:
        if importlib.util.find_spec("pandas") is None:
            errors.append("pandas not installed.")
        else:
            errors.append(f"{kind} parsing error: pandas import failed.")
        return None
    return pd


def _frame_to_text(df) -> str:
    """Render a DataFrame as a header line plus one comma-joined line per row.

    Column labels are converted with ``str`` before being stripped and
    lower-cased, so non-string labels do not raise. Cells whose string form
    is empty or a NaN marker are left out of their row.
    """
    df.columns = [str(c).strip().lower() for c in df.columns]
    lines: list[str] = []
    for _, row in df.iterrows():
        cells = (str(v).strip() for v in row.values)
        lines.append(", ".join(c for c in cells if c not in ("", "nan", "NaN")))
    return ", ".join(df.columns.tolist()) + "\n" + "\n".join(lines)


def _format_ofx_date(d: str) -> str:
    """Turn a value starting with ``YYYYMMDD`` into ``YYYY-MM-DD``.

    Anything after the first eight digits (time of day, timezone suffix) is
    dropped. Values that do not start with eight digits are returned stripped
    but otherwise unchanged.
    """
    d = (d or "").strip()
    if len(d) >= 8 and d[:8].isdigit():
        return f"{d[:4]}-{d[4:6]}-{d[6:8]}"
    return d


def _ofx_tag(block: str, tag: str) -> str:
    """Return the stripped text following ``<tag>`` in ``block`` ("" if absent)."""
    m = re.search(rf"<{tag}>([^<]*)", block, flags=re.IGNORECASE)
    return (m.group(1) if m else "").strip()


def _ofx_to_text(content: str) -> str:
    """Convert OFX/QFX content into ``Date, Description, Amount`` lines.

    Each ``<STMTTRN>...</STMTTRN>`` block yields one line. When no usable
    transaction is found, the stripped raw content (first 20000 characters)
    is returned instead.
    """
    blocks = re.findall(
        r"<STMTTRN>(.*?)</STMTTRN>",
        content,
        flags=re.IGNORECASE | re.DOTALL,
    )
    lines: list[str] = []
    for b in blocks:
        dt = _ofx_tag(b, "DTPOSTED") or _ofx_tag(b, "DTTRAN")
        name = _ofx_tag(b, "NAME") or _ofx_tag(b, "PAYEE")
        memo = _ofx_tag(b, "MEMO") or _ofx_tag(b, "TRNTYPE")
        amt = _ofx_tag(b, "TRNAMT") or _ofx_tag(b, "AMOUNT")

        if not any([dt, name, memo, amt]):
            continue

        dt = _format_ofx_date(dt)
        desc = " - ".join(part for part in (name, memo) if part) or "Transaction"
        lines.append(f"{dt}, {desc}, {amt}".strip(", "))

    if lines:
        return "Date, Description, Amount\n" + "\n".join(lines)
    # Fall back to returning the raw content (truncated).
    return content.strip()[:20000]


def _get_statement_texts(
    paste_text: str,
    pdf_files,
    pdf_password: str | None,
    csv_files,
    txt_files,
    xlsx_files,
    ofx_files,
) -> tuple[list[str], list[str]]:
    """
    Extract statement text blocks from:
    - pasted text (can contain multiple statements separated by `---`)
    - uploaded PDFs (supports multiple)
    - uploaded CSVs (supports multiple)
    - uploaded TXT files (supports multiple)
    - uploaded Excel (.xlsx) (supports multiple)
    - uploaded OFX/QFX files (supports multiple)

    Each ``*_files`` argument may be ``None``, a single file, or a list/tuple
    of files; every file is converted to a path with ``str``. Sources are
    processed in the order PDF, CSV, TXT, XLSX, OFX/QFX, pasted text.

    Returns ``(statement_texts, errors)``. A failure in one file is recorded
    in ``errors`` and does not stop the remaining files. When nothing could
    be extracted, ``errors`` also receives a prompt asking for input.
    """
    statement_texts: list[str] = []
    errors: list[str] = []

    pdf_list = _ensure_file_list(pdf_files)
    csv_list = _ensure_file_list(csv_files)
    txt_list = _ensure_file_list(txt_files)
    xlsx_list = _ensure_file_list(xlsx_files)
    ofx_list = _ensure_file_list(ofx_files)

    # PDFs
    if pdf_list:
        try:
            if importlib.util.find_spec("pdfplumber") is None:
                errors.append("pdfplumber not installed.")
            else:
                import pdfplumber

                password = (pdf_password or "").strip()
                for number, pdf_file in enumerate(pdf_list, start=1):
                    try:
                        text = _extract_pdf_text(pdfplumber.open, str(pdf_file), password)
                    except Exception as e:
                        if _looks_like_password_error(e):
                            errors.append(
                                f"PDF #{number} is password-protected. Please enter the correct password."
                            )
                        else:
                            errors.append(f"PDF #{number} extraction error: {e}")
                    else:
                        if text:
                            statement_texts.append(text)
                        else:
                            errors.append(f"PDF #{number} uploaded but no text could be extracted.")
        except Exception as e:
            errors.append(f"PDF extraction error: {e}")

    # CSVs
    if csv_list:
        pd = _import_pandas("CSV", errors)
        if pd is not None:
            for number, csv_file in enumerate(csv_list, start=1):
                try:
                    statement_texts.append(_frame_to_text(pd.read_csv(str(csv_file))))
                except Exception as e:
                    errors.append(f"CSV #{number} parsing error: {e}")

    # TXT
    for number, txt_file in enumerate(txt_list, start=1):
        try:
            # Undecodable bytes are replaced rather than rejected; the content
            # is then split with the same delimiter rule as pasted input.
            content = Path(str(txt_file)).read_text(encoding="utf-8", errors="replace")
            parts = _split_statements(content)
            if parts:
                statement_texts.extend(parts)
            else:
                errors.append(f"TXT #{number} uploaded but no text could be read.")
        except Exception as e:
            errors.append(f"TXT #{number} parsing error: {e}")

    # XLSX (Excel)
    if xlsx_list:
        pd = _import_pandas("Excel", errors)
        if pd is not None:
            for number, xlsx_file in enumerate(xlsx_list, start=1):
                try:
                    df = pd.read_excel(str(xlsx_file), sheet_name=0)
                    if df is None or df.empty:
                        errors.append(f"XLSX #{number} uploaded but no rows were found.")
                        continue
                    statement_texts.append(_frame_to_text(df))
                except Exception as e:
                    errors.append(f"XLSX #{number} parsing error: {e}")

    # OFX/QFX (lightweight tag extraction)
    for number, ofx_file in enumerate(ofx_list, start=1):
        try:
            raw = Path(str(ofx_file)).read_bytes()
            statement_texts.append(_ofx_to_text(raw.decode("utf-8", errors="replace")))
        except Exception as e:
            errors.append(f"OFX/QFX #{number} parsing error: {e}")

    # Paste text (may contain multiple statements)
    statement_texts.extend(_split_statements(paste_text))

    if not statement_texts:
        errors.append(
            "Please paste a bank statement or upload a PDF / CSV / TXT / XLSX / OFX/QFX file(s)."
        )

    return statement_texts, errors
620
+
621
+
622
def analyse_bank_statement(
    paste_text: str,
    pdf_file,
    pdf_password: str | None,
    csv_file,
    txt_file,
    xlsx_file,
    ofx_file,
) -> tuple[str, str, str]:
    """Summarise every supplied bank statement and render a Markdown report.

    The statement texts are gathered by ``_get_statement_texts``; at most
    ``MAX_STATEMENTS`` of them are analysed.  Each statement is sent to the
    model in "SUMMARISE" mode, with one stricter retry if the first reply
    cannot be parsed.

    Returns:
        ``(markdown_report, combined_text, summary_json)``.  On early exit
        (no statements, or no inference server configured) the first element
        is an error message and ``summary_json`` is the empty string.
    """
    statement_texts, errors = _get_statement_texts(
        paste_text,
        pdf_file,
        pdf_password,
        csv_file,
        txt_file,
        xlsx_file,
        ofx_file,
    )
    if not statement_texts:
        reason = errors[0] if errors else "No bank statement provided."
        return f"**Error:** {reason}", "", ""

    MAX_STATEMENTS = 6
    if len(statement_texts) > MAX_STATEMENTS:
        errors.append(f"Too many statements provided; only the first {MAX_STATEMENTS} were used.")
        statement_texts = statement_texts[:MAX_STATEMENTS]

    # Blank statements are left out of the combined text, but the numbering
    # still follows each statement's position in the input list.
    total = len(statement_texts)
    sections = [
        f"===== Statement {position}/{total} =====\n\n{text.strip()}"
        for position, text in enumerate(statement_texts, start=1)
        if text.strip()
    ]
    combined_text = "\n\n".join(sections).strip()

    if not MLX_SERVER_URL:
        message = f"**Inference client not initialised.** Error: {model_load_error}"
        return message, combined_text, ""

    summaries: list[BankStatementSummary] = []
    for number, statement in enumerate(statement_texts, start=1):
        messages = _build_bank_messages(_truncate(statement), "SUMMARISE")

        parsed: BankStatementSummary | None = None
        # First pass uses the normal prompt, second pass the strict one.
        for strict in (False, True):
            msgs = _apply_messages(messages, strict=strict)
            try:
                parsed = _parse_summary(_run_inference(msgs))
            except Exception as exc:
                if strict:
                    print(f" Summary parse attempt 2 failed (statement {number}, {exc}). Returning error.")
                else:
                    print(f" Summary parse attempt 1 failed (statement {number}, {exc}). Retrying...")
            else:
                break

        if parsed is None:
            # Both attempts failed: keep a placeholder so numbering stays aligned.
            parsed = BankStatementSummary(
                raw_reasoning=f"Could not parse model output for statement {number}."
            )
        summaries.append(parsed)

    def _bullets(items) -> list[str]:
        """Format each item as a Markdown bullet."""
        return [f"- {item}" for item in items]

    def _merge_unique(attribute: str) -> list:
        """Collect one list attribute across all summaries, first occurrence wins."""
        merged: list = []
        for item in summaries:
            for entry in getattr(item, attribute) or []:
                if entry not in merged:
                    merged.append(entry)
        return merged

    # Render markdown
    report: list[str] = ["## Statements Summary", ""]
    if errors:
        report += ["**Notes:**", *_bullets(errors), ""]

    for number, item in enumerate(summaries, start=1):
        report += [
            f"### Statement {number}",
            f"**Total Credits:** {item.total_credits or 'N/A'}",
            f"**Total Debits:** {item.total_debits or 'N/A'}",
            f"**Largest Transaction:** {item.largest_transaction or 'N/A'}",
        ]
        if item.recurring_payments:
            report += ["\n**Recurring Payments:**", *_bullets(item.recurring_payments)]
        if item.flags:
            report += ["\n**Flags / Unusual Activity:**", *_bullets(item.flags)]
        report += [f"\n*{item.raw_reasoning}*", ""]

    # Overall union (useful across multiple statements)
    overall_recurring = _merge_unique("recurring_payments")
    overall_flags = _merge_unique("flags")

    report.append("## Overall (union across statements)")
    if overall_recurring:
        report += ["\n**Recurring Payments (union):**", *_bullets(overall_recurring)]
    else:
        report.append("\n**Recurring Payments (union):** N/A")

    if overall_flags:
        report += ["\n**Flags / Unusual Activity (union):**", *_bullets(overall_flags)]
    else:
        report.append("\n**Flags / Unusual Activity (union):** N/A")

    summary_json = json.dumps([item.model_dump() for item in summaries], ensure_ascii=False)
    return "\n".join(report).strip(), combined_text, summary_json
741
+
742
+
743
+ def _safe_json_loads(s: str) -> object:
744
+ try:
745
+ obj = json.loads(s or "")
746
+ if isinstance(obj, (dict, list)):
747
+ return obj
748
+ return {}
749
+ except Exception:
750
+ return {}
751
+
752
+
753
+ def _escape_pdf_text(s: str) -> str:
754
+ # PDF literal strings escape backslash and parentheses.
755
+ return (s or "").replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)")
756
+
757
+
758
+ def _simple_pdf_bytes(title: str, lines: list[str]) -> bytes:
759
+ """
760
+ Tiny, dependency-free, single-page PDF generator for short text reports.
761
+ """
762
+ font = "Helvetica"
763
+ font_size = 11
764
+ left = 54
765
+ top = 790
766
+ leading = 14
767
+
768
+ safe_title = _escape_pdf_text(title)
769
+ safe_lines = [_escape_pdf_text(ln) for ln in lines]
770
+
771
+ content_lines: list[str] = []
772
+ content_lines.append("BT")
773
+ content_lines.append(f"/F1 {font_size} Tf")
774
+ content_lines.append(f"{left} {top} Td")
775
+ content_lines.append(f"({_escape_pdf_text(safe_title)}) Tj")
776
+ content_lines.append(f"0 -{leading*2} Td")
777
+ for ln in safe_lines:
778
+ content_lines.append(f"({ln}) Tj")
779
+ content_lines.append(f"0 -{leading} Td")
780
+ content_lines.append("ET")
781
+ stream = "\n".join(content_lines).encode("latin-1", errors="replace")
782
+
783
+ objects: list[bytes] = []
784
+ objects.append(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n")
785
+ objects.append(b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n")
786
+ objects.append(
787
+ b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
788
+ b"/Resources << /Font << /F1 4 0 R >> >> /Contents 5 0 R >>\nendobj\n"
789
+ )
790
+ objects.append(f"4 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /{font} >>\nendobj\n".encode())
791
+ objects.append(
792
+ b"5 0 obj\n<< /Length " + str(len(stream)).encode() + b" >>\nstream\n" + stream + b"\nendstream\nendobj\n"
793
+ )
794
+
795
+ out = io.BytesIO()
796
+ out.write(b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n")
797
+ xref: list[int] = [0]
798
+ for obj in objects:
799
+ xref.append(out.tell())
800
+ out.write(obj)
801
+ xref_start = out.tell()
802
+ out.write(f"xref\n0 {len(xref)}\n".encode())
803
+ out.write(b"0000000000 65535 f \n")
804
+ for off in xref[1:]:
805
+ out.write(f"{off:010d} 00000 n \n".encode())
806
+ out.write(
807
+ b"trailer\n<< /Size "
808
+ + str(len(xref)).encode()
809
+ + b" /Root 1 0 R >>\nstartxref\n"
810
+ + str(xref_start).encode()
811
+ + b"\n%%EOF\n"
812
+ )
813
+ return out.getvalue()
814
+
815
+
816
def export_bank_summary_csv(summary_json: str) -> tuple[str | None, str]:
    """Write the stored statement summaries to a CSV file for download.

    Args:
        summary_json: JSON text holding either a list of per-statement
            summary objects or a single summary object.

    Returns:
        ``(path, status_markdown)``.  ``path`` is ``None`` (with an error
        message) when there is nothing to export.  The CSV has one row per
        statement plus a final ``overall`` row carrying the de-duplicated
        union of recurring payments and flags.
    """
    data = _safe_json_loads(summary_json)
    if not data:
        return None, "**Export error:** Run 'Analyse statement' first."

    statements = data if isinstance(data, list) else [data]

    def _string_items(value: object) -> list[str]:
        # Only a real list is treated as a list of entries; entries are
        # coerced to str so the " | " join cannot fail on odd JSON values.
        return [str(item) for item in value] if isinstance(value, list) else []

    rows: list[list[object]] = []
    overall_recurring: list[str] = []
    overall_flags: list[str] = []
    for index, entry in enumerate(statements, start=1):
        if not isinstance(entry, dict):
            continue
        recurring = _string_items(entry.get("recurring_payments"))
        flags = _string_items(entry.get("flags"))
        for item in recurring:
            if item not in overall_recurring:
                overall_recurring.append(item)
        for item in flags:
            if item not in overall_flags:
                overall_flags.append(item)
        rows.append([
            index,
            entry.get("total_credits") or "",
            entry.get("total_debits") or "",
            entry.get("largest_transaction") or "",
            " | ".join(recurring),
            " | ".join(flags),
            entry.get("raw_reasoning") or "",
        ])

    filename = f"bank-statement-summaries_{_dt.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    # A private temp directory lets the file carry the same friendly name the
    # status message reports, instead of a random temp-file name.
    out_path = Path(tempfile.mkdtemp(prefix="chex_")) / filename
    with out_path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow([
            "statement_index",
            "total_credits",
            "total_debits",
            "largest_transaction",
            "recurring_payments",
            "flags",
            "raw_reasoning",
        ])
        writer.writerows(rows)
        # Overall union row
        writer.writerow([
            "overall",
            "",
            "",
            "",
            " | ".join(overall_recurring),
            " | ".join(overall_flags),
            "",
        ])

    return str(out_path), f"**CSV ready:** `{filename}`"
877
+
878
+
879
def export_bank_summary_pdf(summary_json: str) -> tuple[str | None, str]:
    """Render the stored statement summaries into a PDF file for download.

    Args:
        summary_json: JSON text holding either a list of per-statement
            summary objects or a single summary object.

    Returns:
        ``(path, status_markdown)``.  ``path`` is ``None`` (with an error
        message) when there is nothing to export.
    """
    data = _safe_json_loads(summary_json)
    if not data:
        return None, "**Export error:** Run 'Analyse statement' first."

    statements = data if isinstance(data, list) else [data]

    # One timestamp for both the report header and the file name, so the two
    # can never disagree.
    now = _dt.datetime.now()

    def _string_items(value: object) -> list[str]:
        # Only a real list is treated as a list of entries; entries are
        # coerced to str so odd JSON values cannot break rendering.
        return [str(item) for item in value] if isinstance(value, list) else []

    overall_recurring: list[str] = []
    overall_flags: list[str] = []
    for entry in statements:
        if not isinstance(entry, dict):
            continue
        for item in _string_items(entry.get("recurring_payments")):
            if item not in overall_recurring:
                overall_recurring.append(item)
        for item in _string_items(entry.get("flags")):
            if item not in overall_flags:
                overall_flags.append(item)

    title = "CHEX Bank Statement Summary (Multiple)"
    lines: list[str] = [
        f"Generated: {now.isoformat(timespec='seconds')}",
        "",
        f"Statements analysed: {len(statements)}",
        "",
        "Overall Recurring Payments:",
        *([f"- {x}" for x in overall_recurring] if overall_recurring else ["- (none)"]),
        "",
        "Overall Flags / Unusual Activity:",
        *([f"- {x}" for x in overall_flags] if overall_flags else ["- (none)"]),
        "",
    ]

    for index, entry in enumerate(statements, start=1):
        if not isinstance(entry, dict):
            continue
        lines += [
            f"Statement {index}:",
            f"- Total Credits: {entry.get('total_credits') or 'N/A'}",
            f"- Total Debits: {entry.get('total_debits') or 'N/A'}",
            f"- Largest Transaction: {entry.get('largest_transaction') or 'N/A'}",
        ]
        reasoning = str(entry.get("raw_reasoning") or "").strip()
        if reasoning:
            lines.append("- Model reasoning: " + reasoning)
        lines.append("")

    filename = f"bank-statement-summaries_{now.strftime('%Y%m%d_%H%M%S')}.pdf"
    # A private temp directory lets the file carry the same friendly name the
    # status message reports, instead of a random temp-file name.
    out_path = Path(tempfile.mkdtemp(prefix="chex_")) / filename
    out_path.write_bytes(_simple_pdf_bytes(title, lines))

    return str(out_path), f"**PDF ready:** `{filename}`"
938
 
939
 
940
  def bank_qa(statement_text: str, question: str) -> tuple[str, str, str, str]:
 
945
  )
946
  if not question.strip():
947
  return format_label_html("N/A"), "", "", "Please enter a question."
948
+ if not MLX_SERVER_URL:
949
  return (
950
+ format_label_html("N/A"), "Inference client not initialised", "",
951
+ f"Error: {model_load_error}.",
952
  )
953
 
954
  statement_text = _truncate(statement_text)
955
  messages = _build_bank_messages(statement_text, question)
956
 
957
  for attempt in range(2):
958
+ msgs = _apply_messages(messages, strict=(attempt == 1))
959
  try:
960
+ raw = _run_inference(msgs)
 
 
 
 
961
  result = _parse_model_output(raw, question)
962
  label_html = format_label_html(result.label.value)
963
  answer = result.answer or "(none — information not found in statement)"
 
1047
  *, *::before, *::after { box-sizing: border-box; }
1048
 
1049
  :root {
1050
+ --bg-base: #0B0E14;
1051
+ --bg-grad: linear-gradient(180deg, #0B0E14 0%, #06080C 100%);
1052
+ --bg-elev: #131720;
1053
+ --bg-elev-strong: #191E2B;
1054
+ --bg-sunken: #0E121A;
1055
+ --bg-input: rgba(0,0,0,0.2);
1056
+ --border: rgba(255,255,255,0.06);
1057
+ --border-strong: rgba(255,255,255,0.12);
1058
+ --hairline: rgba(255,255,255,0.03);
1059
+ --fg: #E2E8F0;
1060
+ --fg-muted: #94A3B8;
1061
+ --fg-subtle: #475569;
1062
+ --green: #10B981;
1063
+ --green-bg: rgba(16,185,129,0.10);
1064
+ --green-border: rgba(16,185,129,0.25);
1065
+ --red: #F43F5E;
1066
+ --red-bg: rgba(244,63,94,0.10);
1067
+ --red-border: rgba(244,63,94,0.25);
1068
+ --amber: #F59E0B;
 
 
1069
  --amber-bg: rgba(245,158,11,0.10);
1070
+ --amber-border: rgba(245,158,11,0.25);
1071
+ --blur: 24px;
1072
  --blur-strong: 32px;
1073
+ --shadow-md: 0 1px 0 rgba(255,255,255,0.03) inset,
1074
+ 0 8px 24px rgba(0,0,0,0.4),
1075
+ 0 1px 2px rgba(0,0,0,0.2);
1076
  --radius: 10px;
1077
+ --radius-lg: 14px;
1078
  }
1079
 
1080
  body {
 
1177
  position: sticky;
1178
  top: 0;
1179
  z-index: 100;
1180
+ background: rgba(11, 14, 20, 0.75);
1181
  backdrop-filter: blur(var(--blur-strong)) saturate(160%);
1182
  -webkit-backdrop-filter: blur(var(--blur-strong)) saturate(160%);
1183
  border-bottom: 1px solid var(--hairline);
1184
  }
1185
 
1186
  .chex-logo {
1187
+ width: 24px; height: 24px; border-radius: 6px;
1188
+ background: #E2E8F0;
1189
+ color: #0B0E14; display: grid; place-items: center;
1190
  font-family: 'JetBrains Mono', monospace; font-weight: 700; font-size: 11px;
1191
  letter-spacing: -0.05em;
1192
+ box-shadow: 0 2px 10px rgba(0,0,0,0.5);
1193
  flex-shrink: 0;
1194
  }
1195
 
 
1315
  textarea:focus, input[type="text"]:focus,
1316
  .gradio-container [data-testid="textbox"] textarea:focus,
1317
  .gradio-container [data-testid="textbox"] input:focus {
1318
+ border-color: var(--border-strong) !important; background: var(--bg-elev) !important;
1319
+ box-shadow: 0 0 0 2px rgba(255,255,255,0.05) !important; outline: none !important;
1320
  }
1321
 
1322
  textarea::placeholder, input::placeholder { color: var(--fg-subtle) !important; }
 
1337
 
1338
  .gradio-container button.primary, button.primary {
1339
  background: var(--fg) !important; color: var(--bg-base) !important; border: 1px solid var(--fg) !important;
1340
+ box-shadow: 0 6px 18px rgba(0,0,0,0.4), 0 1px 0 rgba(255,255,255,0.1) inset !important;
1341
  }
1342
+ .gradio-container button.primary:hover, button.primary:hover { opacity: 0.9 !important; box-shadow: 0 4px 12px rgba(0,0,0,0.3) !important; }
1343
 
1344
  .gradio-container button.secondary, button.secondary {
1345
+ background: transparent !important; color: var(--fg-muted) !important;
1346
+ border: 1px solid var(--border-strong) !important; box-shadow: none !important;
 
1347
  }
1348
+ .gradio-container button.secondary:hover, button.secondary:hover { background: var(--bg-elev) !important; color: var(--fg) !important; border-color: var(--border-strong) !important; }
1349
 
1350
  button.sm, .gradio-container button[size="sm"], button.small { font-size: 12px !important; padding: 7px 11px !important; }
1351
 
 
1526
  <svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="opacity:0.55"><rect x="2" y="5" width="20" height="14" rx="2"/><line x1="2" y1="10" x2="22" y2="10"/></svg>
1527
  Bank Statement
1528
  </span>
1529
+ <span class="chex-card-kicker">paste · pdf · csv · txt · xlsx · ofx</span>
1530
  </div>
1531
  """
1532
 
 
1600
  with gr.Tabs():
1601
  with gr.Tab("Paste text"):
1602
  bank_paste_input = gr.Textbox(
1603
+ label="Bank statement text (supports multiple)",
1604
  lines=20,
1605
+ placeholder=(
1606
+ "Paste one or more bank statements here.\n\n"
1607
+ "If you paste multiple statements, separate them with a line containing only "
1608
+ "`---` (3+ dashes)."
1609
+ "\n\nOr load the sample below…"
1610
+ ),
1611
  show_label=False,
1612
  )
1613
  btn_load_statement = gr.Button("Load sample statement", variant="secondary", size="sm")
1614
  with gr.Tab("Upload PDF"):
1615
+ bank_pdf_input = gr.File(
1616
+ label="PDF bank statement (multiple allowed)",
1617
+ file_types=[".pdf"],
1618
+ file_count="multiple",
1619
+ )
1620
+ bank_pdf_password_input = gr.Textbox(
1621
+ label="PDF password (optional)",
1622
+ type="password",
1623
+ placeholder="Leave blank if PDF is not encrypted",
1624
+ show_label=False,
1625
+ )
1626
  with gr.Tab("Upload CSV"):
1627
+ bank_csv_input = gr.File(
1628
+ label="CSV bank statement (multiple allowed)",
1629
+ file_types=[".csv"],
1630
+ file_count="multiple",
1631
+ )
1632
+ with gr.Tab("Upload TXT"):
1633
+ bank_txt_input = gr.File(
1634
+ label="TXT bank statement (multiple allowed)",
1635
+ file_types=[".txt", ".text"],
1636
+ file_count="multiple",
1637
+ )
1638
+ with gr.Tab("Upload Excel"):
1639
+ bank_xlsx_input = gr.File(
1640
+ label="Excel bank statement (.xlsx, multiple allowed)",
1641
+ file_types=[".xlsx"],
1642
+ file_count="multiple",
1643
+ )
1644
+ with gr.Tab("Upload OFX / QFX"):
1645
+ bank_ofx_input = gr.File(
1646
+ label="OFX / QFX bank statement (multiple allowed)",
1647
+ file_types=[".ofx", ".qfx"],
1648
+ file_count="multiple",
1649
+ )
1650
 
1651
  with gr.Column(scale=11):
1652
  with gr.Group():
1653
  gr.HTML(STATEMENT_RESULTS_HEADER_HTML)
1654
  analyse_stmt_btn = gr.Button("Analyse statement", variant="primary")
1655
  summary_output = gr.Markdown(value="*Run 'Analyse statement' to generate a financial summary.*")
1656
+ with gr.Row():
1657
+ export_csv_btn = gr.Button("Export CSV", variant="secondary", size="sm")
1658
+ export_pdf_btn = gr.Button("Export PDF", variant="secondary", size="sm")
1659
+ export_status = gr.Markdown(value="")
1660
+ export_file = gr.File(label="Download", interactive=False)
1661
  gr.HTML('<div class="chex-divider"></div>')
1662
  gr.HTML('<span class="chex-section-kicker">Ask a question</span>')
1663
  with gr.Row():
 
1675
  bank_reasoning_output = gr.Textbox(label="Reasoning", interactive=False, lines=3)
1676
 
1677
  bank_statement_state = gr.State("")
1678
+ bank_summary_state = gr.State("")
1679
+ # Hidden JSON output for `gradio_client` API usage.
1680
+ bank_api_output = gr.JSON(visible=False)
1681
+ bank_api_question = gr.Textbox(visible=False)
1682
+ bank_api_btn = gr.Button(visible=False)
1683
 
1684
  # ── Tab 03: Benchmark ──────────────────────────────────────────── #
1685
  with gr.Tab("03 Benchmark"):
 
1716
  fn=analyze_contract,
1717
  inputs=[contract_input, question_input],
1718
  outputs=[label_display, answer_output, citation_output, reasoning_output],
1719
+ api_name="contract_analyze",
1720
  )
1721
  question_input.submit(
1722
  fn=analyze_contract,
1723
  inputs=[contract_input, question_input],
1724
  outputs=[label_display, answer_output, citation_output, reasoning_output],
1725
+ api_name="contract_analyze",
1726
  )
1727
 
1728
  btn_load_statement.click(fn=lambda: SAMPLE_STATEMENT, inputs=[], outputs=[bank_paste_input])
1729
 
1730
  analyse_stmt_btn.click(
1731
  fn=analyse_bank_statement,
1732
+ inputs=[
1733
+ bank_paste_input,
1734
+ bank_pdf_input,
1735
+ bank_pdf_password_input,
1736
+ bank_csv_input,
1737
+ bank_txt_input,
1738
+ bank_xlsx_input,
1739
+ bank_ofx_input,
1740
+ ],
1741
+ outputs=[summary_output, bank_statement_state, bank_summary_state],
1742
+ )
1743
+
1744
+ export_csv_btn.click(
1745
+ fn=export_bank_summary_csv,
1746
+ inputs=[bank_summary_state],
1747
+ outputs=[export_file, export_status],
1748
+ )
1749
+ export_pdf_btn.click(
1750
+ fn=export_bank_summary_pdf,
1751
+ inputs=[bank_summary_state],
1752
+ outputs=[export_file, export_status],
1753
  )
1754
 
1755
  bank_ask_btn.click(
 
1763
  outputs=[bank_label_display, bank_answer_output, bank_citation_output, bank_reasoning_output],
1764
  )
1765
 
1766
def bank_analyze_api(
    paste_text: str,
    pdf_files,
    pdf_password: str | None,
    csv_files,
    txt_files,
    xlsx_files,
    ofx_files,
    question: str | None,
) -> dict:
    """Run the statement analysis and, optionally, one question in a single call.

    Returns a dict with ``summary_markdown``, ``combined_text`` and
    ``summary_json`` from ``analyse_bank_statement``, plus ``qa``: the
    ``bank_qa`` result as a dict, or ``None`` when *question* is blank.
    """
    analysis = analyse_bank_statement(
        paste_text,
        pdf_files,
        pdf_password,
        csv_files,
        txt_files,
        xlsx_files,
        ofx_files,
    )
    summary_md, combined_text, summary_json = analysis

    response: dict = {
        "summary_markdown": summary_md,
        "combined_text": combined_text,
        "summary_json": summary_json,
        "qa": None,
    }

    cleaned_question = (question or "").strip()
    if not cleaned_question:
        return response

    label_html, answer, citation, reasoning = bank_qa(combined_text, cleaned_question)
    response["qa"] = {
        "label_html": label_html,
        "answer": answer,
        "citation": citation,
        "reasoning": reasoning,
    }
    return response
1802
+
1803
+ bank_api_btn.click(
1804
+ fn=bank_analyze_api,
1805
+ inputs=[
1806
+ bank_paste_input,
1807
+ bank_pdf_input,
1808
+ bank_pdf_password_input,
1809
+ bank_csv_input,
1810
+ bank_txt_input,
1811
+ bank_xlsx_input,
1812
+ bank_ofx_input,
1813
+ bank_api_question,
1814
+ ],
1815
+ outputs=[bank_api_output],
1816
+ api_name="bank_analyze",
1817
+ )
1818
+
1819
 
1820
  if __name__ == "__main__":
1821
  demo.launch(show_error=True, theme=gr.themes.Base(), css=CHEX_CSS, ssr_mode=False)
requirements.txt CHANGED
@@ -1,10 +1,5 @@
1
  gradio>=6.0.0
2
- transformers>=4.45.0
3
- peft>=0.12.0
4
- accelerate>=0.33.0
5
- bitsandbytes>=0.43.0
6
- torch>=2.3.0
7
  pydantic>=2.0.0
8
  pandas>=2.0.0
9
  pdfplumber>=0.10.0
10
- huggingface_hub>=0.24.0
 
1
  gradio>=6.0.0
2
+ huggingface_hub>=0.24.0
 
 
 
 
3
  pydantic>=2.0.0
4
  pandas>=2.0.0
5
  pdfplumber>=0.10.0
 
nda.txt → sample_contracts/nda.txt RENAMED
File without changes
service_agreement.txt → sample_contracts/service_agreement.txt RENAMED
File without changes
software_license.txt → sample_contracts/software_license.txt RENAMED
File without changes
sample_statements/sample_statement.txt ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GREENFIELD BANK — PERSONAL CURRENT ACCOUNT
2
+ Account Holder: Jane A. Smith
3
+ Account Number: 12-34-56 78901234
4
+ Statement Period: 01 April 2025 – 30 April 2025
5
+
6
+ Opening Balance: £1,200.00
7
+
8
+ Date Description Credits Debits Balance
9
+ ---------------------------------------------------------------------------
10
+ 03 Apr 25 BACS SALARY - ACME SOLUTIONS LTD £2,500.00 £3,700.00
11
+ 05 Apr 25 DIRECT DEBIT - NETFLIX.COM £9.99 £3,690.01
12
+ 07 Apr 25 CARD PMT - TESCO SUPERSTORE £87.50 £3,602.51
13
+ 10 Apr 25 DIRECT DEBIT - VIRGIN GYM £35.00 £3,567.51
14
+ 12 Apr 25 CARD PMT - AMAZON UK £142.30 £3,425.21
15
+ 15 Apr 25 ATM CASH WITHDRAWAL - HIGH ST £200.00 £3,225.21
16
+ 18 Apr 25 CARD PMT - COSTA COFFEE £4.75 £3,220.46
17
+ 20 Apr 25 STANDING ORDER - BARCLAYS MORTGAGE £850.00 £2,370.46
18
+ 22 Apr 25 DIRECT DEBIT - SPOTIFY £9.99 £2,360.47
19
+ 24 Apr 25 CARD PMT - SAINSBURY'S SUPERSTORE £63.20 £2,297.27
20
+ 25 Apr 25 BANK TRANSFER IN - J. SMITH £300.00 £2,597.27
21
+ 28 Apr 25 CARD PMT - ZARA CLOTHING £55.00 £2,542.27
22
+ 29 Apr 25 DIRECT DEBIT - SKY TV £42.00 £2,500.27
23
+ 30 Apr 25 ATM CASH WITHDRAWAL - AIRPORT £500.00 £2,000.27
24
+ ---------------------------------------------------------------------------
25
+ Closing Balance: £2,000.27
26
+
27
+ Total Credits: £2,800.00
28
+ Total Debits: £1,999.73
29
+
30
+ Suggested questions to try:
31
+ - What was the total salary received this month?
32
+ - Are there any recurring subscription payments?
33
+ - Was there a mortgage payment this month?
34
+ - What was the largest single transaction?
35
+ - Is there a phone bill in this statement?