update
Browse files- .DS_Store +0 -0
- README.md +0 -14
- __pycache__/app.cpython-314.pyc +0 -0
- app.py +762 -262
- requirements.txt +1 -6
- nda.txt → sample_contracts/nda.txt +0 -0
- service_agreement.txt → sample_contracts/service_agreement.txt +0 -0
- software_license.txt → sample_contracts/software_license.txt +0 -0
- sample_statements/sample_statement.txt +35 -0
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
README.md
DELETED
|
@@ -1,14 +0,0 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: CHEX
|
| 3 |
-
emoji: 🐠
|
| 4 |
-
colorFrom: indigo
|
| 5 |
-
colorTo: purple
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 6.14.0
|
| 8 |
-
python_version: '3.13'
|
| 9 |
-
app_file: app.py
|
| 10 |
-
pinned: false
|
| 11 |
-
short_description: 'CHEX is a fine-tuned Qwen3.5-9B model trained on AMD MI300X '
|
| 12 |
-
---
|
| 13 |
-
|
| 14 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__pycache__/app.cpython-314.pyc
ADDED
|
Binary file (85.4 kB). View file
|
|
|
app.py
CHANGED
|
@@ -9,10 +9,14 @@ Tab 3: Analyse Bank Statement — paste / upload a bank statement, get a summary
|
|
| 9 |
|
| 10 |
from __future__ import annotations
|
| 11 |
|
|
|
|
|
|
|
| 12 |
import importlib.util
|
|
|
|
| 13 |
import json
|
| 14 |
import os
|
| 15 |
import re
|
|
|
|
| 16 |
from enum import Enum
|
| 17 |
from pathlib import Path
|
| 18 |
from typing import Optional
|
|
@@ -124,7 +128,7 @@ Question: Does this agreement restrict the Recipient from competing with the Dis
|
|
| 124 |
|
| 125 |
BANK_SYSTEM_PROMPT = """\
|
| 126 |
You are a financial analysis assistant specialising in bank statement review. \
|
| 127 |
-
Given a bank statement (plain text, CSV-derived, or PDF-extracted) and either a \
|
| 128 |
summary request or a specific question, produce a single JSON object.
|
| 129 |
|
| 130 |
For SUMMARY mode (question is "SUMMARISE"):
|
|
@@ -200,165 +204,59 @@ def _parse_summary(raw_text: str) -> BankStatementSummary:
|
|
| 200 |
# Model loading
|
| 201 |
# ---------------------------------------------------------------------------
|
| 202 |
|
| 203 |
-
|
| 204 |
-
SAMPLE_DIR
|
| 205 |
-
STATEMENT_DIR
|
| 206 |
|
| 207 |
-
_pipe = None
|
| 208 |
-
_tokenizer = None
|
| 209 |
model_load_error: Optional[str] = None
|
| 210 |
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
# The repo contains a LoRA adapter — read base model from adapter_config
|
| 217 |
-
from huggingface_hub import hf_hub_download
|
| 218 |
-
import json as _json
|
| 219 |
-
_adapter_cfg_path = hf_hub_download(MODEL_PATH, "adapter_config.json")
|
| 220 |
-
_adapter_cfg = _json.loads(open(_adapter_cfg_path).read())
|
| 221 |
-
BASE_MODEL_PATH = _adapter_cfg.get("base_model_name_or_path", MODEL_PATH)
|
| 222 |
-
print(f"LoRA adapter detected. Base model: {BASE_MODEL_PATH}")
|
| 223 |
-
|
| 224 |
-
print(f"Loading tokenizer from: {MODEL_PATH}")
|
| 225 |
-
_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
|
| 226 |
-
if _tokenizer.pad_token is None:
|
| 227 |
-
_tokenizer.pad_token = _tokenizer.eos_token
|
| 228 |
-
|
| 229 |
-
print(f"Loading base model: {BASE_MODEL_PATH}")
|
| 230 |
-
bnb_available = importlib.util.find_spec("bitsandbytes") is not None
|
| 231 |
-
cuda_available = torch.cuda.is_available()
|
| 232 |
-
|
| 233 |
-
if bnb_available and cuda_available:
|
| 234 |
-
from transformers import BitsAndBytesConfig
|
| 235 |
-
bnb_config = BitsAndBytesConfig(
|
| 236 |
-
load_in_4bit=True,
|
| 237 |
-
bnb_4bit_quant_type="nf4",
|
| 238 |
-
bnb_4bit_compute_dtype=torch.bfloat16,
|
| 239 |
-
bnb_4bit_use_double_quant=True,
|
| 240 |
-
)
|
| 241 |
-
_base = AutoModelForCausalLM.from_pretrained(
|
| 242 |
-
BASE_MODEL_PATH,
|
| 243 |
-
quantization_config=bnb_config,
|
| 244 |
-
device_map="auto",
|
| 245 |
-
trust_remote_code=True,
|
| 246 |
-
)
|
| 247 |
-
print(" Base loaded with 4-bit NF4 quantization")
|
| 248 |
-
else:
|
| 249 |
-
dtype = torch.float16 if cuda_available else torch.float32
|
| 250 |
-
_base = AutoModelForCausalLM.from_pretrained(
|
| 251 |
-
BASE_MODEL_PATH,
|
| 252 |
-
torch_dtype=dtype,
|
| 253 |
-
device_map="auto" if cuda_available else None,
|
| 254 |
-
trust_remote_code=True,
|
| 255 |
-
)
|
| 256 |
-
print(f" Base loaded in {'fp16 (GPU)' if cuda_available else 'fp32 (CPU)'}")
|
| 257 |
-
|
| 258 |
-
print(f"Applying LoRA adapter from: {MODEL_PATH}")
|
| 259 |
-
_model = PeftModel.from_pretrained(_base, MODEL_PATH)
|
| 260 |
-
_model.eval()
|
| 261 |
-
print(" LoRA adapter applied")
|
| 262 |
-
|
| 263 |
-
_pipe = pipeline(
|
| 264 |
-
"text-generation",
|
| 265 |
-
model=_model,
|
| 266 |
-
tokenizer=_tokenizer,
|
| 267 |
-
max_new_tokens=512,
|
| 268 |
-
do_sample=False,
|
| 269 |
-
return_full_text=False,
|
| 270 |
-
pad_token_id=_tokenizer.eos_token_id,
|
| 271 |
-
)
|
| 272 |
-
print(f"Model loaded successfully: {MODEL_PATH}")
|
| 273 |
-
|
| 274 |
-
except Exception as e:
|
| 275 |
-
model_load_error = str(e)
|
| 276 |
-
print(f"WARNING: Model failed to load: {e}")
|
| 277 |
-
print("Demo is running in preview mode — analysis will return a placeholder response.")
|
| 278 |
|
| 279 |
|
| 280 |
# ---------------------------------------------------------------------------
|
| 281 |
# Inference helpers
|
| 282 |
# ---------------------------------------------------------------------------
|
| 283 |
|
| 284 |
-
|
| 285 |
|
| 286 |
|
| 287 |
def _truncate(text: str) -> str:
|
| 288 |
-
if
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
if len(tokens) > MAX_TOKENS:
|
| 292 |
-
print(f"WARNING: Text truncated from {len(tokens)} to {MAX_TOKENS} tokens.")
|
| 293 |
-
tokens = tokens[:MAX_TOKENS]
|
| 294 |
-
return _tokenizer.decode(tokens, skip_special_tokens=True)
|
| 295 |
return text
|
| 296 |
|
| 297 |
|
| 298 |
-
def
|
| 299 |
if strict:
|
| 300 |
messages = list(messages)
|
| 301 |
messages[-1] = dict(messages[-1])
|
| 302 |
messages[-1]["content"] += STRICT_SUFFIX
|
| 303 |
-
|
| 304 |
-
try:
|
| 305 |
-
return _tokenizer.apply_chat_template(
|
| 306 |
-
messages, tokenize=False, add_generation_prompt=True
|
| 307 |
-
)
|
| 308 |
-
except Exception:
|
| 309 |
-
pass
|
| 310 |
-
# Fallback: plain text
|
| 311 |
-
parts = []
|
| 312 |
-
for m in messages:
|
| 313 |
-
parts.append(f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>")
|
| 314 |
-
parts.append("<|im_start|>assistant\n")
|
| 315 |
-
return "\n".join(parts)
|
| 316 |
|
| 317 |
|
| 318 |
-
def
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
# ---------------------------------------------------------------------------
|
| 324 |
-
# MLX Remote Server (Mac Mini via ngrok) - takes priority when MLX_SERVER_URL is set
|
| 325 |
-
# ---------------------------------------------------------------------------
|
| 326 |
-
_MLX_SERVER_URL = os.environ.get("MLX_SERVER_URL", "").rstrip("/")
|
| 327 |
-
_mlx_available = False
|
| 328 |
-
|
| 329 |
-
if _MLX_SERVER_URL:
|
| 330 |
-
try:
|
| 331 |
-
import urllib.request as _ur
|
| 332 |
-
_ur.urlopen(_MLX_SERVER_URL + "/v1/models", timeout=5)
|
| 333 |
-
_mlx_available = True
|
| 334 |
-
print("MLX remote server ready: " + _MLX_SERVER_URL)
|
| 335 |
-
except Exception as _e:
|
| 336 |
-
print("MLX server unreachable (" + str(_e) + "), falling back to local model.")
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
def _run_via_mlx(messages, strict=False):
|
| 340 |
-
import urllib.request as _ur, json as _j
|
| 341 |
-
msgs = list(messages)
|
| 342 |
-
if strict:
|
| 343 |
-
msgs[-1] = dict(msgs[-1])
|
| 344 |
-
msgs[-1]["content"] += STRICT_SUFFIX
|
| 345 |
-
payload = _j.dumps({
|
| 346 |
-
"model": "mlx-community/Qwen3.5-9B-MLX-4bit",
|
| 347 |
-
"messages": msgs,
|
| 348 |
"max_tokens": 512,
|
| 349 |
"temperature": 0.0,
|
| 350 |
}).encode()
|
| 351 |
-
req =
|
| 352 |
-
|
| 353 |
data=payload,
|
| 354 |
headers={"Content-Type": "application/json"},
|
|
|
|
| 355 |
)
|
| 356 |
-
with
|
| 357 |
-
data =
|
| 358 |
return data["choices"][0]["message"]["content"]
|
| 359 |
|
| 360 |
|
| 361 |
-
|
| 362 |
# ---------------------------------------------------------------------------
|
| 363 |
# Sample contract content
|
| 364 |
# ---------------------------------------------------------------------------
|
|
@@ -428,25 +326,21 @@ def analyze_contract(contract_text: str, question: str) -> tuple[str, str, str,
|
|
| 428 |
return format_label_html("N/A"), "", "", "Please paste a contract above."
|
| 429 |
if not question.strip():
|
| 430 |
return format_label_html("N/A"), "", "", "Please enter a question."
|
| 431 |
-
if
|
| 432 |
return (
|
| 433 |
format_label_html("N/A"),
|
| 434 |
"Model not loaded",
|
| 435 |
"",
|
| 436 |
-
f"Model failed to load: {model_load_error}.
|
| 437 |
-
"Set HF_MODEL_REPO in Space secrets to the correct model repo.",
|
| 438 |
)
|
| 439 |
|
| 440 |
contract_text = _truncate(contract_text)
|
| 441 |
messages = _build_contract_messages(contract_text, question)
|
| 442 |
|
| 443 |
for attempt in range(2):
|
|
|
|
| 444 |
try:
|
| 445 |
-
|
| 446 |
-
raw = _run_via_mlx(messages, strict=(attempt == 1))
|
| 447 |
-
else:
|
| 448 |
-
prompt = _apply_template(messages, strict=(attempt == 1))
|
| 449 |
-
raw = _run_pipe(prompt)
|
| 450 |
result = _parse_model_output(raw, question)
|
| 451 |
label_html = format_label_html(result.label.value)
|
| 452 |
answer = result.answer or "(none — clause is absent or not applicable)"
|
|
@@ -466,90 +360,581 @@ def analyze_contract(contract_text: str, question: str) -> tuple[str, str, str,
|
|
| 466 |
)
|
| 467 |
|
| 468 |
|
| 469 |
-
def _get_statement_text(
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
try:
|
| 474 |
if importlib.util.find_spec("pdfplumber") is None:
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
for
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
except Exception as e:
|
| 488 |
-
|
| 489 |
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
return "", "Model not loaded — CSV parsing unavailable."
|
| 493 |
try:
|
| 494 |
import pandas as pd
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 504 |
|
| 505 |
-
|
| 506 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
|
| 508 |
-
|
|
|
|
|
|
|
|
|
|
| 509 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 510 |
|
| 511 |
-
|
| 512 |
-
statement_text, error = _get_statement_text(paste_text, pdf_file, csv_file)
|
| 513 |
-
if error:
|
| 514 |
-
return f"**Error:** {error}", ""
|
| 515 |
-
if _pipe is None and not _mlx_available:
|
| 516 |
return (
|
| 517 |
-
f"**
|
| 518 |
-
|
|
|
|
| 519 |
)
|
| 520 |
|
| 521 |
-
|
| 522 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 523 |
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 551 |
|
| 552 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 553 |
|
| 554 |
|
| 555 |
def bank_qa(statement_text: str, question: str) -> tuple[str, str, str, str]:
|
|
@@ -560,22 +945,19 @@ def bank_qa(statement_text: str, question: str) -> tuple[str, str, str, str]:
|
|
| 560 |
)
|
| 561 |
if not question.strip():
|
| 562 |
return format_label_html("N/A"), "", "", "Please enter a question."
|
| 563 |
-
if
|
| 564 |
return (
|
| 565 |
-
format_label_html("N/A"), "
|
| 566 |
-
f"
|
| 567 |
)
|
| 568 |
|
| 569 |
statement_text = _truncate(statement_text)
|
| 570 |
messages = _build_bank_messages(statement_text, question)
|
| 571 |
|
| 572 |
for attempt in range(2):
|
|
|
|
| 573 |
try:
|
| 574 |
-
|
| 575 |
-
raw = _run_via_mlx(messages, strict=(attempt == 1))
|
| 576 |
-
else:
|
| 577 |
-
prompt = _apply_template(messages, strict=(attempt == 1))
|
| 578 |
-
raw = _run_pipe(prompt)
|
| 579 |
result = _parse_model_output(raw, question)
|
| 580 |
label_html = format_label_html(result.label.value)
|
| 581 |
answer = result.answer or "(none — information not found in statement)"
|
|
@@ -665,36 +1047,34 @@ CHEX_CSS = """
|
|
| 665 |
*, *::before, *::after { box-sizing: border-box; }
|
| 666 |
|
| 667 |
:root {
|
| 668 |
-
--bg-base: #
|
| 669 |
-
--bg-grad:
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
--bg-
|
| 673 |
-
--bg-
|
| 674 |
-
--
|
| 675 |
-
--
|
| 676 |
-
--
|
| 677 |
-
--
|
| 678 |
-
--
|
| 679 |
-
--fg: #
|
| 680 |
-
--
|
| 681 |
-
--
|
| 682 |
-
--green:
|
| 683 |
-
--
|
| 684 |
-
--
|
| 685 |
-
--red:
|
| 686 |
-
--
|
| 687 |
-
--red-border: rgba(239,68,68,0.28);
|
| 688 |
-
--amber: #b87800;
|
| 689 |
--amber-bg: rgba(245,158,11,0.10);
|
| 690 |
-
--amber-border: rgba(245,158,11,0.
|
| 691 |
-
--blur:
|
| 692 |
--blur-strong: 32px;
|
| 693 |
-
--shadow-md: 0 1px 0 rgba(255,255,255,0.
|
| 694 |
-
0 8px 24px rgba(
|
| 695 |
-
0 1px 2px rgba(
|
| 696 |
--radius: 10px;
|
| 697 |
-
--radius-lg:
|
| 698 |
}
|
| 699 |
|
| 700 |
body {
|
|
@@ -797,19 +1177,19 @@ label.block, .label-wrap {
|
|
| 797 |
position: sticky;
|
| 798 |
top: 0;
|
| 799 |
z-index: 100;
|
| 800 |
-
background:
|
| 801 |
backdrop-filter: blur(var(--blur-strong)) saturate(160%);
|
| 802 |
-webkit-backdrop-filter: blur(var(--blur-strong)) saturate(160%);
|
| 803 |
border-bottom: 1px solid var(--hairline);
|
| 804 |
}
|
| 805 |
|
| 806 |
.chex-logo {
|
| 807 |
-
width:
|
| 808 |
-
background:
|
| 809 |
-
color: #
|
| 810 |
font-family: 'JetBrains Mono', monospace; font-weight: 700; font-size: 11px;
|
| 811 |
letter-spacing: -0.05em;
|
| 812 |
-
box-shadow: 0
|
| 813 |
flex-shrink: 0;
|
| 814 |
}
|
| 815 |
|
|
@@ -935,8 +1315,8 @@ textarea, input[type="text"], input[type="search"],
|
|
| 935 |
textarea:focus, input[type="text"]:focus,
|
| 936 |
.gradio-container [data-testid="textbox"] textarea:focus,
|
| 937 |
.gradio-container [data-testid="textbox"] input:focus {
|
| 938 |
-
border-color: var(--border-strong) !important; background: var(--bg-elev
|
| 939 |
-
box-shadow: 0 0 0
|
| 940 |
}
|
| 941 |
|
| 942 |
textarea::placeholder, input::placeholder { color: var(--fg-subtle) !important; }
|
|
@@ -957,16 +1337,15 @@ textarea[readonly],
|
|
| 957 |
|
| 958 |
.gradio-container button.primary, button.primary {
|
| 959 |
background: var(--fg) !important; color: var(--bg-base) !important; border: 1px solid var(--fg) !important;
|
| 960 |
-
box-shadow: 0 6px 18px rgba(
|
| 961 |
}
|
| 962 |
-
.gradio-container button.primary:hover, button.primary:hover { opacity: 0.
|
| 963 |
|
| 964 |
.gradio-container button.secondary, button.secondary {
|
| 965 |
-
background:
|
| 966 |
-
|
| 967 |
-
border: 1px solid var(--border) !important; box-shadow: var(--shadow-md) !important;
|
| 968 |
}
|
| 969 |
-
.gradio-container button.secondary:hover, button.secondary:hover { background: var(--bg-elev-
|
| 970 |
|
| 971 |
button.sm, .gradio-container button[size="sm"], button.small { font-size: 12px !important; padding: 7px 11px !important; }
|
| 972 |
|
|
@@ -1147,7 +1526,7 @@ STATEMENT_SOURCE_HEADER_HTML = """
|
|
| 1147 |
<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="opacity:0.55"><rect x="2" y="5" width="20" height="14" rx="2"/><line x1="2" y1="10" x2="22" y2="10"/></svg>
|
| 1148 |
Bank Statement
|
| 1149 |
</span>
|
| 1150 |
-
<span class="chex-card-kicker">paste · pdf · csv</span>
|
| 1151 |
</div>
|
| 1152 |
"""
|
| 1153 |
|
|
@@ -1221,22 +1600,64 @@ with gr.Blocks(title="CHEX — Document Intelligence") as demo:
|
|
| 1221 |
with gr.Tabs():
|
| 1222 |
with gr.Tab("Paste text"):
|
| 1223 |
bank_paste_input = gr.Textbox(
|
| 1224 |
-
label="Bank statement text",
|
| 1225 |
lines=20,
|
| 1226 |
-
placeholder=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1227 |
show_label=False,
|
| 1228 |
)
|
| 1229 |
btn_load_statement = gr.Button("Load sample statement", variant="secondary", size="sm")
|
| 1230 |
with gr.Tab("Upload PDF"):
|
| 1231 |
-
bank_pdf_input = gr.File(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1232 |
with gr.Tab("Upload CSV"):
|
| 1233 |
-
bank_csv_input = gr.File(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1234 |
|
| 1235 |
with gr.Column(scale=11):
|
| 1236 |
with gr.Group():
|
| 1237 |
gr.HTML(STATEMENT_RESULTS_HEADER_HTML)
|
| 1238 |
analyse_stmt_btn = gr.Button("Analyse statement", variant="primary")
|
| 1239 |
summary_output = gr.Markdown(value="*Run 'Analyse statement' to generate a financial summary.*")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1240 |
gr.HTML('<div class="chex-divider"></div>')
|
| 1241 |
gr.HTML('<span class="chex-section-kicker">Ask a question</span>')
|
| 1242 |
with gr.Row():
|
|
@@ -1254,6 +1675,11 @@ with gr.Blocks(title="CHEX — Document Intelligence") as demo:
|
|
| 1254 |
bank_reasoning_output = gr.Textbox(label="Reasoning", interactive=False, lines=3)
|
| 1255 |
|
| 1256 |
bank_statement_state = gr.State("")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1257 |
|
| 1258 |
# ── Tab 03: Benchmark ──────────────────────────────────────────── #
|
| 1259 |
with gr.Tab("03 Benchmark"):
|
|
@@ -1290,19 +1716,40 @@ with gr.Blocks(title="CHEX — Document Intelligence") as demo:
|
|
| 1290 |
fn=analyze_contract,
|
| 1291 |
inputs=[contract_input, question_input],
|
| 1292 |
outputs=[label_display, answer_output, citation_output, reasoning_output],
|
|
|
|
| 1293 |
)
|
| 1294 |
question_input.submit(
|
| 1295 |
fn=analyze_contract,
|
| 1296 |
inputs=[contract_input, question_input],
|
| 1297 |
outputs=[label_display, answer_output, citation_output, reasoning_output],
|
|
|
|
| 1298 |
)
|
| 1299 |
|
| 1300 |
btn_load_statement.click(fn=lambda: SAMPLE_STATEMENT, inputs=[], outputs=[bank_paste_input])
|
| 1301 |
|
| 1302 |
analyse_stmt_btn.click(
|
| 1303 |
fn=analyse_bank_statement,
|
| 1304 |
-
inputs=[
|
| 1305 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1306 |
)
|
| 1307 |
|
| 1308 |
bank_ask_btn.click(
|
|
@@ -1316,6 +1763,59 @@ with gr.Blocks(title="CHEX — Document Intelligence") as demo:
|
|
| 1316 |
outputs=[bank_label_display, bank_answer_output, bank_citation_output, bank_reasoning_output],
|
| 1317 |
)
|
| 1318 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1319 |
|
| 1320 |
if __name__ == "__main__":
|
| 1321 |
demo.launch(show_error=True, theme=gr.themes.Base(), css=CHEX_CSS, ssr_mode=False)
|
|
|
|
| 9 |
|
| 10 |
from __future__ import annotations
|
| 11 |
|
| 12 |
+
import csv
|
| 13 |
+
import datetime as _dt
|
| 14 |
import importlib.util
|
| 15 |
+
import io
|
| 16 |
import json
|
| 17 |
import os
|
| 18 |
import re
|
| 19 |
+
import tempfile
|
| 20 |
from enum import Enum
|
| 21 |
from pathlib import Path
|
| 22 |
from typing import Optional
|
|
|
|
| 128 |
|
| 129 |
BANK_SYSTEM_PROMPT = """\
|
| 130 |
You are a financial analysis assistant specialising in bank statement review. \
|
| 131 |
+
Given a bank statement (plain text, CSV/Excel-derived, OFX/QFX-derived, or PDF-extracted) and either a \
|
| 132 |
summary request or a specific question, produce a single JSON object.
|
| 133 |
|
| 134 |
For SUMMARY mode (question is "SUMMARISE"):
|
|
|
|
| 204 |
# Model loading
|
| 205 |
# ---------------------------------------------------------------------------
|
| 206 |
|
| 207 |
+
MLX_SERVER_URL = os.environ.get("MLX_SERVER_URL", "").rstrip("/")
|
| 208 |
+
SAMPLE_DIR = Path(__file__).parent / "sample_contracts"
|
| 209 |
+
STATEMENT_DIR = Path(__file__).parent / "sample_statements"
|
| 210 |
|
|
|
|
|
|
|
| 211 |
model_load_error: Optional[str] = None
|
| 212 |
|
| 213 |
+
if not MLX_SERVER_URL:
|
| 214 |
+
model_load_error = "MLX_SERVER_URL not set. Set it in Space secrets to your Mac's ngrok URL."
|
| 215 |
+
print(f"WARNING: {model_load_error}")
|
| 216 |
+
else:
|
| 217 |
+
print(f"MLX server configured at: {MLX_SERVER_URL}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
|
| 219 |
|
| 220 |
# ---------------------------------------------------------------------------
|
| 221 |
# Inference helpers
|
| 222 |
# ---------------------------------------------------------------------------
|
| 223 |
|
| 224 |
+
MAX_CHARS = 32000 # rough character limit (~8k tokens) to keep requests fast
|
| 225 |
|
| 226 |
|
| 227 |
def _truncate(text: str) -> str:
|
| 228 |
+
if len(text) > MAX_CHARS:
|
| 229 |
+
print(f"WARNING: Text truncated from {len(text)} to {MAX_CHARS} chars.")
|
| 230 |
+
return text[:MAX_CHARS]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
return text
|
| 232 |
|
| 233 |
|
| 234 |
+
def _apply_messages(messages: list[dict], strict: bool = False) -> list[dict]:
|
| 235 |
if strict:
|
| 236 |
messages = list(messages)
|
| 237 |
messages[-1] = dict(messages[-1])
|
| 238 |
messages[-1]["content"] += STRICT_SUFFIX
|
| 239 |
+
return messages
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
|
| 242 |
+
def _run_inference(messages: list[dict]) -> str:
|
| 243 |
+
import urllib.request
|
| 244 |
+
payload = json.dumps({
|
| 245 |
+
"messages": messages,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
"max_tokens": 512,
|
| 247 |
"temperature": 0.0,
|
| 248 |
}).encode()
|
| 249 |
+
req = urllib.request.Request(
|
| 250 |
+
f"{MLX_SERVER_URL}/v1/chat/completions",
|
| 251 |
data=payload,
|
| 252 |
headers={"Content-Type": "application/json"},
|
| 253 |
+
method="POST",
|
| 254 |
)
|
| 255 |
+
with urllib.request.urlopen(req, timeout=120) as resp:
|
| 256 |
+
data = json.loads(resp.read())
|
| 257 |
return data["choices"][0]["message"]["content"]
|
| 258 |
|
| 259 |
|
|
|
|
| 260 |
# ---------------------------------------------------------------------------
|
| 261 |
# Sample contract content
|
| 262 |
# ---------------------------------------------------------------------------
|
|
|
|
| 326 |
return format_label_html("N/A"), "", "", "Please paste a contract above."
|
| 327 |
if not question.strip():
|
| 328 |
return format_label_html("N/A"), "", "", "Please enter a question."
|
| 329 |
+
if not MLX_SERVER_URL:
|
| 330 |
return (
|
| 331 |
format_label_html("N/A"),
|
| 332 |
"Model not loaded",
|
| 333 |
"",
|
| 334 |
+
f"Model failed to load: {model_load_error}.",
|
|
|
|
| 335 |
)
|
| 336 |
|
| 337 |
contract_text = _truncate(contract_text)
|
| 338 |
messages = _build_contract_messages(contract_text, question)
|
| 339 |
|
| 340 |
for attempt in range(2):
|
| 341 |
+
msgs = _apply_messages(messages, strict=(attempt == 1))
|
| 342 |
try:
|
| 343 |
+
raw = _run_inference(msgs)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
result = _parse_model_output(raw, question)
|
| 345 |
label_html = format_label_html(result.label.value)
|
| 346 |
answer = result.answer or "(none — clause is absent or not applicable)"
|
|
|
|
| 360 |
)
|
| 361 |
|
| 362 |
|
| 363 |
+
def _get_statement_text(
|
| 364 |
+
paste_text: str,
|
| 365 |
+
pdf_file,
|
| 366 |
+
pdf_password: str | None,
|
| 367 |
+
csv_file,
|
| 368 |
+
txt_file,
|
| 369 |
+
xlsx_file,
|
| 370 |
+
ofx_file,
|
| 371 |
+
) -> tuple[str, str]:
|
| 372 |
+
# Backwards-compatible shim: treat "single statement" inputs as one item.
|
| 373 |
+
texts, errors = _get_statement_texts(
|
| 374 |
+
paste_text,
|
| 375 |
+
pdf_file,
|
| 376 |
+
pdf_password,
|
| 377 |
+
csv_file,
|
| 378 |
+
txt_file,
|
| 379 |
+
xlsx_file,
|
| 380 |
+
ofx_file,
|
| 381 |
+
)
|
| 382 |
+
if not texts:
|
| 383 |
+
return (
|
| 384 |
+
"",
|
| 385 |
+
errors[0]
|
| 386 |
+
if errors
|
| 387 |
+
else "Please paste a bank statement or upload a PDF / CSV / TXT / XLSX / OFX/QFX file."
|
| 388 |
+
)
|
| 389 |
+
return texts[0], ""
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
def _ensure_file_list(files) -> list:
|
| 393 |
+
if files is None:
|
| 394 |
+
return []
|
| 395 |
+
if isinstance(files, (list, tuple)):
|
| 396 |
+
return [f for f in files if f is not None]
|
| 397 |
+
return [files]
|
| 398 |
+
|
| 399 |
+
|
| 400 |
+
def _split_statements(paste_text: str) -> list[str]:
|
| 401 |
+
"""
|
| 402 |
+
Split pasted content into multiple statements.
|
| 403 |
+
|
| 404 |
+
Delimiter: a line containing only `---` (3+ dashes), optionally surrounded by whitespace.
|
| 405 |
+
"""
|
| 406 |
+
text = (paste_text or "").strip()
|
| 407 |
+
if not text:
|
| 408 |
+
return []
|
| 409 |
+
parts = re.split(r"(?m)^[ \t]*-{3,}[ \t]*$", text)
|
| 410 |
+
cleaned = [p.strip() for p in parts if p.strip()]
|
| 411 |
+
return cleaned if cleaned else [text]
|
| 412 |
+
|
| 413 |
+
|
| 414 |
+
def _get_statement_texts(
|
| 415 |
+
paste_text: str,
|
| 416 |
+
pdf_files,
|
| 417 |
+
pdf_password: str | None,
|
| 418 |
+
csv_files,
|
| 419 |
+
txt_files,
|
| 420 |
+
xlsx_files,
|
| 421 |
+
ofx_files,
|
| 422 |
+
) -> tuple[list[str], list[str]]:
|
| 423 |
+
"""
|
| 424 |
+
Extract statement text blocks from:
|
| 425 |
+
- pasted text (can contain multiple statements separated by `---`)
|
| 426 |
+
- uploaded PDFs (supports multiple)
|
| 427 |
+
- uploaded CSVs (supports multiple)
|
| 428 |
+
- uploaded TXT files (supports multiple)
|
| 429 |
+
- uploaded Excel (.xlsx) (supports multiple)
|
| 430 |
+
- uploaded OFX/QFX files (supports multiple)
|
| 431 |
+
"""
|
| 432 |
+
statement_texts: list[str] = []
|
| 433 |
+
errors: list[str] = []
|
| 434 |
+
|
| 435 |
+
pdf_list = _ensure_file_list(pdf_files)
|
| 436 |
+
csv_list = _ensure_file_list(csv_files)
|
| 437 |
+
txt_list = _ensure_file_list(txt_files)
|
| 438 |
+
xlsx_list = _ensure_file_list(xlsx_files)
|
| 439 |
+
ofx_list = _ensure_file_list(ofx_files)
|
| 440 |
+
|
| 441 |
+
# PDFs
|
| 442 |
+
if pdf_list:
|
| 443 |
try:
|
| 444 |
if importlib.util.find_spec("pdfplumber") is None:
|
| 445 |
+
errors.append("pdfplumber not installed.")
|
| 446 |
+
else:
|
| 447 |
+
import pdfplumber
|
| 448 |
+
password = (pdf_password or "").strip()
|
| 449 |
+
for idx, pdf_file in enumerate(pdf_list):
|
| 450 |
+
try:
|
| 451 |
+
text_parts: list[str] = []
|
| 452 |
+
try:
|
| 453 |
+
with pdfplumber.open(
|
| 454 |
+
str(pdf_file),
|
| 455 |
+
password=password if password else "",
|
| 456 |
+
) as pdf:
|
| 457 |
+
for page in pdf.pages:
|
| 458 |
+
t = page.extract_text()
|
| 459 |
+
if t:
|
| 460 |
+
text_parts.append(t)
|
| 461 |
+
except TypeError:
|
| 462 |
+
# Older pdfplumber versions may not accept `password=...`
|
| 463 |
+
with pdfplumber.open(str(pdf_file)) as pdf:
|
| 464 |
+
for page in pdf.pages:
|
| 465 |
+
t = page.extract_text()
|
| 466 |
+
if t:
|
| 467 |
+
text_parts.append(t)
|
| 468 |
+
text = "\n".join(text_parts).strip()
|
| 469 |
+
if not text:
|
| 470 |
+
errors.append(f"PDF #{idx+1} uploaded but no text could be extracted.")
|
| 471 |
+
else:
|
| 472 |
+
statement_texts.append(text)
|
| 473 |
+
except Exception as e:
|
| 474 |
+
msg = str(e).lower()
|
| 475 |
+
if "password" in msg or "encrypted" in msg or "decrypt" in msg:
|
| 476 |
+
errors.append(
|
| 477 |
+
f"PDF #{idx+1} is password-protected. Please enter the correct password."
|
| 478 |
+
)
|
| 479 |
+
else:
|
| 480 |
+
errors.append(f"PDF #{idx+1} extraction error: {e}")
|
| 481 |
except Exception as e:
|
| 482 |
+
errors.append(f"PDF extraction error: {e}")
|
| 483 |
|
| 484 |
+
# CSVs
|
| 485 |
+
if csv_list:
|
|
|
|
| 486 |
try:
|
| 487 |
import pandas as pd
|
| 488 |
+
except Exception:
|
| 489 |
+
if importlib.util.find_spec("pandas") is None:
|
| 490 |
+
errors.append("pandas not installed.")
|
| 491 |
+
else:
|
| 492 |
+
errors.append("CSV parsing error: pandas import failed.")
|
| 493 |
+
else:
|
| 494 |
+
for idx, csv_file in enumerate(csv_list):
|
| 495 |
+
try:
|
| 496 |
+
df = pd.read_csv(str(csv_file))
|
| 497 |
+
df.columns = [c.strip().lower() for c in df.columns]
|
| 498 |
+
lines: list[str] = []
|
| 499 |
+
for _, row in df.iterrows():
|
| 500 |
+
parts = [
|
| 501 |
+
str(v).strip()
|
| 502 |
+
for v in row.values
|
| 503 |
+
if str(v).strip() not in ("", "nan")
|
| 504 |
+
]
|
| 505 |
+
lines.append(", ".join(parts))
|
| 506 |
+
statement_texts.append(
|
| 507 |
+
", ".join(df.columns.tolist()) + "\n" + "\n".join(lines)
|
| 508 |
+
)
|
| 509 |
+
except Exception as e:
|
| 510 |
+
errors.append(f"CSV #{idx+1} parsing error: {e}")
|
| 511 |
+
|
| 512 |
+
# TXT
|
| 513 |
+
if txt_list:
|
| 514 |
+
for idx, txt_file in enumerate(txt_list):
|
| 515 |
+
try:
|
| 516 |
+
# Read best-effort encoding; then reuse the same delimiter splitting
|
| 517 |
+
# strategy as pasted input.
|
| 518 |
+
p = Path(str(txt_file))
|
| 519 |
+
content = p.read_text(encoding="utf-8", errors="replace")
|
| 520 |
+
parts = _split_statements(content)
|
| 521 |
+
if not parts:
|
| 522 |
+
errors.append(f"TXT #{idx+1} uploaded but no text could be read.")
|
| 523 |
+
else:
|
| 524 |
+
statement_texts.extend(parts)
|
| 525 |
+
except Exception as e:
|
| 526 |
+
errors.append(f"TXT #{idx+1} parsing error: {e}")
|
| 527 |
+
|
| 528 |
+
# XLSX (Excel)
|
| 529 |
+
if xlsx_list:
|
| 530 |
+
try:
|
| 531 |
+
import pandas as pd
|
| 532 |
+
except Exception:
|
| 533 |
+
if importlib.util.find_spec("pandas") is None:
|
| 534 |
+
errors.append("pandas not installed.")
|
| 535 |
+
else:
|
| 536 |
+
errors.append("Excel parsing error: pandas import failed.")
|
| 537 |
+
else:
|
| 538 |
+
for idx, xlsx_file in enumerate(xlsx_list):
|
| 539 |
+
try:
|
| 540 |
+
df = pd.read_excel(str(xlsx_file), sheet_name=0)
|
| 541 |
+
if df is None or df.empty:
|
| 542 |
+
errors.append(f"XLSX #{idx+1} uploaded but no rows were found.")
|
| 543 |
+
continue
|
| 544 |
+
df.columns = [str(c).strip().lower() for c in df.columns]
|
| 545 |
+
lines: list[str] = []
|
| 546 |
+
for _, row in df.iterrows():
|
| 547 |
+
parts = [
|
| 548 |
+
str(v).strip()
|
| 549 |
+
for v in row.values
|
| 550 |
+
if str(v).strip() not in ("", "nan", "NaN")
|
| 551 |
+
]
|
| 552 |
+
lines.append(", ".join(parts))
|
| 553 |
+
statement_texts.append(
|
| 554 |
+
", ".join(df.columns.tolist()) + "\n" + "\n".join(lines)
|
| 555 |
+
)
|
| 556 |
+
except Exception as e:
|
| 557 |
+
errors.append(f"XLSX #{idx+1} parsing error: {e}")
|
| 558 |
+
|
| 559 |
+
# OFX/QFX (lightweight tag extraction)
|
| 560 |
+
if ofx_list:
|
| 561 |
+
def _format_ofx_date(d: str) -> str:
|
| 562 |
+
d = (d or "").strip()
|
| 563 |
+
if len(d) == 8 and d.isdigit():
|
| 564 |
+
return f"{d[:4]}-{d[4:6]}-{d[6:]}"
|
| 565 |
+
return d
|
| 566 |
+
|
| 567 |
+
for idx, ofx_file in enumerate(ofx_list):
|
| 568 |
+
try:
|
| 569 |
+
p = Path(str(ofx_file))
|
| 570 |
+
raw = p.read_bytes()
|
| 571 |
+
try:
|
| 572 |
+
content = raw.decode("utf-8")
|
| 573 |
+
except UnicodeDecodeError:
|
| 574 |
+
content = raw.decode("utf-8", errors="replace")
|
| 575 |
+
|
| 576 |
+
blocks = re.findall(
|
| 577 |
+
r"<STMTTRN>(.*?)</STMTTRN>",
|
| 578 |
+
content,
|
| 579 |
+
flags=re.IGNORECASE | re.DOTALL,
|
| 580 |
+
)
|
| 581 |
+
|
| 582 |
+
def _get_tag(block: str, tag: str) -> str:
|
| 583 |
+
m = re.search(rf"<{tag}>([^<]*)", block, flags=re.IGNORECASE)
|
| 584 |
+
return (m.group(1) if m else "").strip()
|
| 585 |
+
|
| 586 |
+
lines: list[str] = []
|
| 587 |
+
for b in blocks:
|
| 588 |
+
dt = _get_tag(b, "DTPOSTED") or _get_tag(b, "DTTRAN")
|
| 589 |
+
name = _get_tag(b, "NAME") or _get_tag(b, "PAYEE")
|
| 590 |
+
memo = _get_tag(b, "MEMO") or _get_tag(b, "TRNTYPE")
|
| 591 |
+
amt = _get_tag(b, "TRNAMT") or _get_tag(b, "AMOUNT")
|
| 592 |
+
|
| 593 |
+
if not any([dt, name, memo, amt]):
|
| 594 |
+
continue
|
| 595 |
+
|
| 596 |
+
dt = _format_ofx_date(dt)
|
| 597 |
+
desc_parts = [p for p in [name, memo] if p]
|
| 598 |
+
desc = " - ".join(desc_parts) if desc_parts else "Transaction"
|
| 599 |
+
lines.append(f"{dt}, {desc}, {amt}".strip(", "))
|
| 600 |
+
|
| 601 |
+
if lines:
|
| 602 |
+
statement_texts.append("Date, Description, Amount\n" + "\n".join(lines))
|
| 603 |
+
else:
|
| 604 |
+
# Fall back to returning the raw content (truncated).
|
| 605 |
+
statement_texts.append(content.strip()[:20000])
|
| 606 |
+
except Exception as e:
|
| 607 |
+
errors.append(f"OFX/QFX #{idx+1} parsing error: {e}")
|
| 608 |
+
|
| 609 |
+
# Paste text (may contain multiple statements)
|
| 610 |
+
pasted_parts = _split_statements(paste_text)
|
| 611 |
+
if pasted_parts:
|
| 612 |
+
statement_texts.extend(pasted_parts)
|
| 613 |
+
|
| 614 |
+
if not statement_texts:
|
| 615 |
+
errors.append(
|
| 616 |
+
"Please paste a bank statement or upload a PDF / CSV / TXT / XLSX / OFX/QFX file(s)."
|
| 617 |
+
)
|
| 618 |
|
| 619 |
+
return statement_texts, errors
|
| 620 |
+
|
| 621 |
+
|
| 622 |
+
def analyse_bank_statement(
    paste_text: str,
    pdf_file,
    pdf_password: str | None,
    csv_file,
    txt_file,
    xlsx_file,
    ofx_file,
) -> tuple[str, str, str]:
    """Summarise one or more bank statements with the inference backend.

    Args:
        paste_text: Raw pasted statement text (may contain several statements
            separated per ``_split_statements``' delimiter convention).
        pdf_file, csv_file, txt_file, xlsx_file, ofx_file: Gradio file-upload
            values (lists of files or None) forwarded to ``_get_statement_texts``.
        pdf_password: Optional password for encrypted PDF uploads.

    Returns:
        A ``(summary_markdown, combined_text, summary_json)`` tuple:
        rendered markdown for display, the concatenated statement text
        (reused by the Q&A path), and a JSON array of per-statement
        ``BankStatementSummary`` dumps (consumed by the export buttons).
    """
    # Collect statement texts from every input channel; ``errors`` holds
    # non-fatal notes that are surfaced in the rendered markdown.
    statement_texts, errors = _get_statement_texts(
        paste_text,
        pdf_file,
        pdf_password,
        csv_file,
        txt_file,
        xlsx_file,
        ofx_file,
    )
    if not statement_texts:
        return f"**Error:** {errors[0] if errors else 'No bank statement provided.'}", "", ""

    # Hard cap to bound inference cost; excess statements are dropped with a note.
    MAX_STATEMENTS = 6
    if len(statement_texts) > MAX_STATEMENTS:
        errors.append(f"Too many statements provided; only the first {MAX_STATEMENTS} were used.")
        statement_texts = statement_texts[:MAX_STATEMENTS]

    # Combined text is returned for the follow-up Q&A feature, with visible
    # per-statement separators so citations can reference the right section.
    combined_text = "\n\n".join(
        f"===== Statement {i+1}/{len(statement_texts)} =====\n\n{st.strip()}"
        for i, st in enumerate(statement_texts)
        if st.strip()
    ).strip()

    # Module-level globals set at startup; without a server URL we cannot run
    # inference, but we still return the combined text for inspection.
    if not MLX_SERVER_URL:
        return (
            f"**Inference client not initialised.** Error: {model_load_error}",
            combined_text,
            "",
        )

    summaries: list[BankStatementSummary] = []
    for idx, statement_text in enumerate(statement_texts):
        # Truncate each statement independently so one huge upload cannot
        # crowd the others out of the model context.
        statement_text = _truncate(statement_text)
        messages = _build_bank_messages(statement_text, "SUMMARISE")

        # Two attempts: the retry uses strict=True, which (per _apply_messages)
        # presumably tightens the prompt to coerce valid JSON — TODO confirm.
        summary: BankStatementSummary | None = None
        for attempt in range(2):
            msgs = _apply_messages(messages, strict=(attempt == 1))
            try:
                raw = _run_inference(msgs)
                summary = _parse_summary(raw)
                break
            except Exception as e:
                if attempt == 0:
                    print(f"  Summary parse attempt 1 failed (statement {idx+1}, {e}). Retrying...")
                else:
                    print(f"  Summary parse attempt 2 failed (statement {idx+1}, {e}). Returning error.")

        # Fall back to a placeholder summary so indices stay aligned with
        # statement numbers in the rendered output.
        if summary is None:
            summary = BankStatementSummary(
                raw_reasoning=f"Could not parse model output for statement {idx+1}."
            )
        summaries.append(summary)

    # Render markdown
    lines: list[str] = []
    lines.append("## Statements Summary")
    lines.append("")
    if errors:
        lines.append("**Notes:**")
        for e in errors:
            lines.append(f"- {e}")
        lines.append("")

    # One section per statement; missing fields render as "N/A".
    for idx, summary in enumerate(summaries):
        lines.append(f"### Statement {idx+1}")
        lines.append(f"**Total Credits:** {summary.total_credits or 'N/A'}")
        lines.append(f"**Total Debits:** {summary.total_debits or 'N/A'}")
        lines.append(
            f"**Largest Transaction:** {summary.largest_transaction or 'N/A'}"
        )
        if summary.recurring_payments:
            lines.append("\n**Recurring Payments:**")
            for p in summary.recurring_payments:
                lines.append(f"- {p}")
        if summary.flags:
            lines.append("\n**Flags / Unusual Activity:**")
            for f in summary.flags:
                lines.append(f"- {f}")
        lines.append(f"\n*{summary.raw_reasoning}*")
        lines.append("")

    # Overall union (useful across multiple statements)
    # Order-preserving de-duplication across statements.
    overall_recurring: list[str] = []
    overall_flags: list[str] = []
    for s in summaries:
        for r in (s.recurring_payments or []):
            if r not in overall_recurring:
                overall_recurring.append(r)
        for f in (s.flags or []):
            if f not in overall_flags:
                overall_flags.append(f)

    lines.append("## Overall (union across statements)")
    if overall_recurring:
        lines.append("\n**Recurring Payments (union):**")
        for p in overall_recurring:
            lines.append(f"- {p}")
    else:
        lines.append("\n**Recurring Payments (union):** N/A")

    if overall_flags:
        lines.append("\n**Flags / Unusual Activity (union):**")
        for f in overall_flags:
            lines.append(f"- {f}")
    else:
        lines.append("\n**Flags / Unusual Activity (union):** N/A")

    # summary_json feeds the CSV/PDF export handlers via gr.State.
    summary_json = json.dumps([s.model_dump() for s in summaries], ensure_ascii=False)
    return "\n".join(lines).strip(), combined_text, summary_json
|
| 741 |
+
|
| 742 |
+
|
| 743 |
+
def _safe_json_loads(s: str) -> object:
|
| 744 |
+
try:
|
| 745 |
+
obj = json.loads(s or "")
|
| 746 |
+
if isinstance(obj, (dict, list)):
|
| 747 |
+
return obj
|
| 748 |
+
return {}
|
| 749 |
+
except Exception:
|
| 750 |
+
return {}
|
| 751 |
+
|
| 752 |
+
|
| 753 |
+
def _escape_pdf_text(s: str) -> str:
|
| 754 |
+
# PDF literal strings escape backslash and parentheses.
|
| 755 |
+
return (s or "").replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)")
|
| 756 |
+
|
| 757 |
+
|
| 758 |
+
def _simple_pdf_bytes(title: str, lines: list[str]) -> bytes:
|
| 759 |
+
"""
|
| 760 |
+
Tiny, dependency-free, single-page PDF generator for short text reports.
|
| 761 |
+
"""
|
| 762 |
+
font = "Helvetica"
|
| 763 |
+
font_size = 11
|
| 764 |
+
left = 54
|
| 765 |
+
top = 790
|
| 766 |
+
leading = 14
|
| 767 |
+
|
| 768 |
+
safe_title = _escape_pdf_text(title)
|
| 769 |
+
safe_lines = [_escape_pdf_text(ln) for ln in lines]
|
| 770 |
+
|
| 771 |
+
content_lines: list[str] = []
|
| 772 |
+
content_lines.append("BT")
|
| 773 |
+
content_lines.append(f"/F1 {font_size} Tf")
|
| 774 |
+
content_lines.append(f"{left} {top} Td")
|
| 775 |
+
content_lines.append(f"({_escape_pdf_text(safe_title)}) Tj")
|
| 776 |
+
content_lines.append(f"0 -{leading*2} Td")
|
| 777 |
+
for ln in safe_lines:
|
| 778 |
+
content_lines.append(f"({ln}) Tj")
|
| 779 |
+
content_lines.append(f"0 -{leading} Td")
|
| 780 |
+
content_lines.append("ET")
|
| 781 |
+
stream = "\n".join(content_lines).encode("latin-1", errors="replace")
|
| 782 |
+
|
| 783 |
+
objects: list[bytes] = []
|
| 784 |
+
objects.append(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n")
|
| 785 |
+
objects.append(b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n")
|
| 786 |
+
objects.append(
|
| 787 |
+
b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
|
| 788 |
+
b"/Resources << /Font << /F1 4 0 R >> >> /Contents 5 0 R >>\nendobj\n"
|
| 789 |
+
)
|
| 790 |
+
objects.append(f"4 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /{font} >>\nendobj\n".encode())
|
| 791 |
+
objects.append(
|
| 792 |
+
b"5 0 obj\n<< /Length " + str(len(stream)).encode() + b" >>\nstream\n" + stream + b"\nendstream\nendobj\n"
|
| 793 |
+
)
|
| 794 |
+
|
| 795 |
+
out = io.BytesIO()
|
| 796 |
+
out.write(b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n")
|
| 797 |
+
xref: list[int] = [0]
|
| 798 |
+
for obj in objects:
|
| 799 |
+
xref.append(out.tell())
|
| 800 |
+
out.write(obj)
|
| 801 |
+
xref_start = out.tell()
|
| 802 |
+
out.write(f"xref\n0 {len(xref)}\n".encode())
|
| 803 |
+
out.write(b"0000000000 65535 f \n")
|
| 804 |
+
for off in xref[1:]:
|
| 805 |
+
out.write(f"{off:010d} 00000 n \n".encode())
|
| 806 |
+
out.write(
|
| 807 |
+
b"trailer\n<< /Size "
|
| 808 |
+
+ str(len(xref)).encode()
|
| 809 |
+
+ b" /Root 1 0 R >>\nstartxref\n"
|
| 810 |
+
+ str(xref_start).encode()
|
| 811 |
+
+ b"\n%%EOF\n"
|
| 812 |
+
)
|
| 813 |
+
return out.getvalue()
|
| 814 |
+
|
| 815 |
+
|
| 816 |
+
def export_bank_summary_csv(summary_json: str) -> tuple[str | None, str]:
    """Write the analysed statement summaries to a downloadable CSV file.

    Args:
        summary_json: The JSON blob produced by ``analyse_bank_statement``
            (a list of per-statement summary dicts, held in ``gr.State``).

    Returns:
        ``(path, status_markdown)`` — the temp-file path for the Gradio
        download component (or ``None`` on error) and a status message.
    """
    data = _safe_json_loads(summary_json)
    if not data:
        return None, "**Export error:** Run 'Analyse statement' first."

    statements = data if isinstance(data, list) else [data]

    filename = f"bank-statement-summaries_{_dt.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    # delete=False so the file survives past this handler for Gradio to serve.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv", prefix="chex_", mode="w", newline="", encoding="utf-8")
    try:
        writer = csv.writer(tmp)
        writer.writerow([
            "statement_index",
            "total_credits",
            "total_debits",
            "largest_transaction",
            "recurring_payments",
            "flags",
            "raw_reasoning",
        ])

        # Order-preserving union across statements for the final summary row.
        overall_recurring: list[str] = []
        overall_flags: list[str] = []
        for s in statements:
            if not isinstance(s, dict):
                continue
            for r in (s.get("recurring_payments") or []):
                if r not in overall_recurring:
                    overall_recurring.append(r)
            for f in (s.get("flags") or []):
                if f not in overall_flags:
                    overall_flags.append(f)

        # One row per statement; list fields are pipe-joined for CSV cells.
        for i, s in enumerate(statements, start=1):
            if not isinstance(s, dict):
                continue
            writer.writerow([
                i,
                s.get("total_credits") or "",
                s.get("total_debits") or "",
                s.get("largest_transaction") or "",
                " | ".join(s.get("recurring_payments") or []),
                " | ".join(s.get("flags") or []),
                s.get("raw_reasoning") or "",
            ])

        # Overall union row
        writer.writerow([
            "overall",
            "",
            "",
            "",
            " | ".join(overall_recurring),
            " | ".join(overall_flags),
            "",
        ])
    finally:
        tmp.close()

    # Gradio uses the path; show the human-friendly name in the status line.
    # BUGFIX: the status message previously showed a literal "(unknown)" and
    # the computed `filename` was never used.
    return tmp.name, f"**CSV ready:** `{filename}`"
|
| 877 |
+
|
| 878 |
+
|
| 879 |
+
def export_bank_summary_pdf(summary_json: str) -> tuple[str | None, str]:
    """Render the analysed statement summaries to a downloadable PDF report.

    Args:
        summary_json: The JSON blob produced by ``analyse_bank_statement``
            (a list of per-statement summary dicts, held in ``gr.State``).

    Returns:
        ``(path, status_markdown)`` — the temp-file path for the Gradio
        download component (or ``None`` on error) and a status message.
    """
    data = _safe_json_loads(summary_json)
    if not data:
        return None, "**Export error:** Run 'Analyse statement' first."

    statements = data if isinstance(data, list) else [data]

    title = "CHEX — Bank Statement Summary (Multiple)"
    lines: list[str] = [
        f"Generated: {_dt.datetime.now().isoformat(timespec='seconds')}",
        "",
        f"Statements analysed: {len(statements)}",
        "",
    ]

    # Order-preserving union across statements, mirrored from the CSV export.
    overall_recurring: list[str] = []
    overall_flags: list[str] = []
    for s in statements:
        if not isinstance(s, dict):
            continue
        for r in (s.get("recurring_payments") or []):
            if r not in overall_recurring:
                overall_recurring.append(r)
        for f in (s.get("flags") or []):
            if f not in overall_flags:
                overall_flags.append(f)

    lines += [
        "Overall Recurring Payments:",
        *([f"- {x}" for x in overall_recurring] if overall_recurring else ["- (none)"]),
        "",
        "Overall Flags / Unusual Activity:",
        *([f"- {x}" for x in overall_flags] if overall_flags else ["- (none)"]),
        "",
    ]

    # Per-statement sections follow the overall union block.
    for i, s in enumerate(statements, start=1):
        if not isinstance(s, dict):
            continue
        lines += [
            f"Statement {i}:",
            f"- Total Credits: {s.get('total_credits') or 'N/A'}",
            f"- Total Debits: {s.get('total_debits') or 'N/A'}",
            f"- Largest Transaction: {s.get('largest_transaction') or 'N/A'}",
        ]
        rr = (s.get("raw_reasoning") or "").strip()
        if rr:
            lines += ["- Model reasoning: " + rr]
        lines.append("")

    pdf_bytes = _simple_pdf_bytes(title, lines)
    # delete=False so the file survives past this handler for Gradio to serve.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", prefix="chex_", mode="wb")
    try:
        tmp.write(pdf_bytes)
    finally:
        tmp.close()

    # BUGFIX: the status message previously showed a literal "(unknown)" and
    # the computed `filename` was never used.
    filename = f"bank-statement-summaries_{_dt.datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf"
    return tmp.name, f"**PDF ready:** `{filename}`"
|
| 938 |
|
| 939 |
|
| 940 |
def bank_qa(statement_text: str, question: str) -> tuple[str, str, str, str]:
|
|
|
|
| 945 |
)
|
| 946 |
if not question.strip():
|
| 947 |
return format_label_html("N/A"), "", "", "Please enter a question."
|
| 948 |
+
if not MLX_SERVER_URL:
|
| 949 |
return (
|
| 950 |
+
format_label_html("N/A"), "Inference client not initialised", "",
|
| 951 |
+
f"Error: {model_load_error}.",
|
| 952 |
)
|
| 953 |
|
| 954 |
statement_text = _truncate(statement_text)
|
| 955 |
messages = _build_bank_messages(statement_text, question)
|
| 956 |
|
| 957 |
for attempt in range(2):
|
| 958 |
+
msgs = _apply_messages(messages, strict=(attempt == 1))
|
| 959 |
try:
|
| 960 |
+
raw = _run_inference(msgs)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 961 |
result = _parse_model_output(raw, question)
|
| 962 |
label_html = format_label_html(result.label.value)
|
| 963 |
answer = result.answer or "(none — information not found in statement)"
|
|
|
|
| 1047 |
*, *::before, *::after { box-sizing: border-box; }
|
| 1048 |
|
| 1049 |
:root {
|
| 1050 |
+
--bg-base: #0B0E14;
|
| 1051 |
+
--bg-grad: linear-gradient(180deg, #0B0E14 0%, #06080C 100%);
|
| 1052 |
+
--bg-elev: #131720;
|
| 1053 |
+
--bg-elev-strong: #191E2B;
|
| 1054 |
+
--bg-sunken: #0E121A;
|
| 1055 |
+
--bg-input: rgba(0,0,0,0.2);
|
| 1056 |
+
--border: rgba(255,255,255,0.06);
|
| 1057 |
+
--border-strong: rgba(255,255,255,0.12);
|
| 1058 |
+
--hairline: rgba(255,255,255,0.03);
|
| 1059 |
+
--fg: #E2E8F0;
|
| 1060 |
+
--fg-muted: #94A3B8;
|
| 1061 |
+
--fg-subtle: #475569;
|
| 1062 |
+
--green: #10B981;
|
| 1063 |
+
--green-bg: rgba(16,185,129,0.10);
|
| 1064 |
+
--green-border: rgba(16,185,129,0.25);
|
| 1065 |
+
--red: #F43F5E;
|
| 1066 |
+
--red-bg: rgba(244,63,94,0.10);
|
| 1067 |
+
--red-border: rgba(244,63,94,0.25);
|
| 1068 |
+
--amber: #F59E0B;
|
|
|
|
|
|
|
| 1069 |
--amber-bg: rgba(245,158,11,0.10);
|
| 1070 |
+
--amber-border: rgba(245,158,11,0.25);
|
| 1071 |
+
--blur: 24px;
|
| 1072 |
--blur-strong: 32px;
|
| 1073 |
+
--shadow-md: 0 1px 0 rgba(255,255,255,0.03) inset,
|
| 1074 |
+
0 8px 24px rgba(0,0,0,0.4),
|
| 1075 |
+
0 1px 2px rgba(0,0,0,0.2);
|
| 1076 |
--radius: 10px;
|
| 1077 |
+
--radius-lg: 14px;
|
| 1078 |
}
|
| 1079 |
|
| 1080 |
body {
|
|
|
|
| 1177 |
position: sticky;
|
| 1178 |
top: 0;
|
| 1179 |
z-index: 100;
|
| 1180 |
+
background: rgba(11, 14, 20, 0.75);
|
| 1181 |
backdrop-filter: blur(var(--blur-strong)) saturate(160%);
|
| 1182 |
-webkit-backdrop-filter: blur(var(--blur-strong)) saturate(160%);
|
| 1183 |
border-bottom: 1px solid var(--hairline);
|
| 1184 |
}
|
| 1185 |
|
| 1186 |
.chex-logo {
|
| 1187 |
+
width: 24px; height: 24px; border-radius: 6px;
|
| 1188 |
+
background: #E2E8F0;
|
| 1189 |
+
color: #0B0E14; display: grid; place-items: center;
|
| 1190 |
font-family: 'JetBrains Mono', monospace; font-weight: 700; font-size: 11px;
|
| 1191 |
letter-spacing: -0.05em;
|
| 1192 |
+
box-shadow: 0 2px 10px rgba(0,0,0,0.5);
|
| 1193 |
flex-shrink: 0;
|
| 1194 |
}
|
| 1195 |
|
|
|
|
| 1315 |
textarea:focus, input[type="text"]:focus,
|
| 1316 |
.gradio-container [data-testid="textbox"] textarea:focus,
|
| 1317 |
.gradio-container [data-testid="textbox"] input:focus {
|
| 1318 |
+
border-color: var(--border-strong) !important; background: var(--bg-elev) !important;
|
| 1319 |
+
box-shadow: 0 0 0 2px rgba(255,255,255,0.05) !important; outline: none !important;
|
| 1320 |
}
|
| 1321 |
|
| 1322 |
textarea::placeholder, input::placeholder { color: var(--fg-subtle) !important; }
|
|
|
|
| 1337 |
|
| 1338 |
.gradio-container button.primary, button.primary {
|
| 1339 |
background: var(--fg) !important; color: var(--bg-base) !important; border: 1px solid var(--fg) !important;
|
| 1340 |
+
box-shadow: 0 6px 18px rgba(0,0,0,0.4), 0 1px 0 rgba(255,255,255,0.1) inset !important;
|
| 1341 |
}
|
| 1342 |
+
.gradio-container button.primary:hover, button.primary:hover { opacity: 0.9 !important; box-shadow: 0 4px 12px rgba(0,0,0,0.3) !important; }
|
| 1343 |
|
| 1344 |
.gradio-container button.secondary, button.secondary {
|
| 1345 |
+
background: transparent !important; color: var(--fg-muted) !important;
|
| 1346 |
+
border: 1px solid var(--border-strong) !important; box-shadow: none !important;
|
|
|
|
| 1347 |
}
|
| 1348 |
+
.gradio-container button.secondary:hover, button.secondary:hover { background: var(--bg-elev) !important; color: var(--fg) !important; border-color: var(--border-strong) !important; }
|
| 1349 |
|
| 1350 |
button.sm, .gradio-container button[size="sm"], button.small { font-size: 12px !important; padding: 7px 11px !important; }
|
| 1351 |
|
|
|
|
| 1526 |
<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="opacity:0.55"><rect x="2" y="5" width="20" height="14" rx="2"/><line x1="2" y1="10" x2="22" y2="10"/></svg>
|
| 1527 |
Bank Statement
|
| 1528 |
</span>
|
| 1529 |
+
<span class="chex-card-kicker">paste · pdf · csv · txt · xlsx · ofx</span>
|
| 1530 |
</div>
|
| 1531 |
"""
|
| 1532 |
|
|
|
|
| 1600 |
with gr.Tabs():
|
| 1601 |
with gr.Tab("Paste text"):
|
| 1602 |
bank_paste_input = gr.Textbox(
|
| 1603 |
+
label="Bank statement text (supports multiple)",
|
| 1604 |
lines=20,
|
| 1605 |
+
placeholder=(
|
| 1606 |
+
"Paste one or more bank statements here.\n\n"
|
| 1607 |
+
"If you paste multiple statements, separate them with a line containing only "
|
| 1608 |
+
"`---` (3+ dashes)."
|
| 1609 |
+
"\n\nOr load the sample below…"
|
| 1610 |
+
),
|
| 1611 |
show_label=False,
|
| 1612 |
)
|
| 1613 |
btn_load_statement = gr.Button("Load sample statement", variant="secondary", size="sm")
|
| 1614 |
with gr.Tab("Upload PDF"):
|
| 1615 |
+
bank_pdf_input = gr.File(
|
| 1616 |
+
label="PDF bank statement (multiple allowed)",
|
| 1617 |
+
file_types=[".pdf"],
|
| 1618 |
+
file_count="multiple",
|
| 1619 |
+
)
|
| 1620 |
+
bank_pdf_password_input = gr.Textbox(
|
| 1621 |
+
label="PDF password (optional)",
|
| 1622 |
+
type="password",
|
| 1623 |
+
placeholder="Leave blank if PDF is not encrypted",
|
| 1624 |
+
show_label=False,
|
| 1625 |
+
)
|
| 1626 |
with gr.Tab("Upload CSV"):
|
| 1627 |
+
bank_csv_input = gr.File(
|
| 1628 |
+
label="CSV bank statement (multiple allowed)",
|
| 1629 |
+
file_types=[".csv"],
|
| 1630 |
+
file_count="multiple",
|
| 1631 |
+
)
|
| 1632 |
+
with gr.Tab("Upload TXT"):
|
| 1633 |
+
bank_txt_input = gr.File(
|
| 1634 |
+
label="TXT bank statement (multiple allowed)",
|
| 1635 |
+
file_types=[".txt", ".text"],
|
| 1636 |
+
file_count="multiple",
|
| 1637 |
+
)
|
| 1638 |
+
with gr.Tab("Upload Excel"):
|
| 1639 |
+
bank_xlsx_input = gr.File(
|
| 1640 |
+
label="Excel bank statement (.xlsx, multiple allowed)",
|
| 1641 |
+
file_types=[".xlsx"],
|
| 1642 |
+
file_count="multiple",
|
| 1643 |
+
)
|
| 1644 |
+
with gr.Tab("Upload OFX / QFX"):
|
| 1645 |
+
bank_ofx_input = gr.File(
|
| 1646 |
+
label="OFX / QFX bank statement (multiple allowed)",
|
| 1647 |
+
file_types=[".ofx", ".qfx"],
|
| 1648 |
+
file_count="multiple",
|
| 1649 |
+
)
|
| 1650 |
|
| 1651 |
with gr.Column(scale=11):
|
| 1652 |
with gr.Group():
|
| 1653 |
gr.HTML(STATEMENT_RESULTS_HEADER_HTML)
|
| 1654 |
analyse_stmt_btn = gr.Button("Analyse statement", variant="primary")
|
| 1655 |
summary_output = gr.Markdown(value="*Run 'Analyse statement' to generate a financial summary.*")
|
| 1656 |
+
with gr.Row():
|
| 1657 |
+
export_csv_btn = gr.Button("Export CSV", variant="secondary", size="sm")
|
| 1658 |
+
export_pdf_btn = gr.Button("Export PDF", variant="secondary", size="sm")
|
| 1659 |
+
export_status = gr.Markdown(value="")
|
| 1660 |
+
export_file = gr.File(label="Download", interactive=False)
|
| 1661 |
gr.HTML('<div class="chex-divider"></div>')
|
| 1662 |
gr.HTML('<span class="chex-section-kicker">Ask a question</span>')
|
| 1663 |
with gr.Row():
|
|
|
|
| 1675 |
bank_reasoning_output = gr.Textbox(label="Reasoning", interactive=False, lines=3)
|
| 1676 |
|
| 1677 |
bank_statement_state = gr.State("")
|
| 1678 |
+
bank_summary_state = gr.State("")
|
| 1679 |
+
# Hidden JSON output for `gradio_client` API usage.
|
| 1680 |
+
bank_api_output = gr.JSON(visible=False)
|
| 1681 |
+
bank_api_question = gr.Textbox(visible=False)
|
| 1682 |
+
bank_api_btn = gr.Button(visible=False)
|
| 1683 |
|
| 1684 |
# ── Tab 03: Benchmark ──────────────────────────────────────────── #
|
| 1685 |
with gr.Tab("03 Benchmark"):
|
|
|
|
| 1716 |
fn=analyze_contract,
|
| 1717 |
inputs=[contract_input, question_input],
|
| 1718 |
outputs=[label_display, answer_output, citation_output, reasoning_output],
|
| 1719 |
+
api_name="contract_analyze",
|
| 1720 |
)
|
| 1721 |
question_input.submit(
|
| 1722 |
fn=analyze_contract,
|
| 1723 |
inputs=[contract_input, question_input],
|
| 1724 |
outputs=[label_display, answer_output, citation_output, reasoning_output],
|
| 1725 |
+
api_name="contract_analyze",
|
| 1726 |
)
|
| 1727 |
|
| 1728 |
btn_load_statement.click(fn=lambda: SAMPLE_STATEMENT, inputs=[], outputs=[bank_paste_input])
|
| 1729 |
|
| 1730 |
analyse_stmt_btn.click(
|
| 1731 |
fn=analyse_bank_statement,
|
| 1732 |
+
inputs=[
|
| 1733 |
+
bank_paste_input,
|
| 1734 |
+
bank_pdf_input,
|
| 1735 |
+
bank_pdf_password_input,
|
| 1736 |
+
bank_csv_input,
|
| 1737 |
+
bank_txt_input,
|
| 1738 |
+
bank_xlsx_input,
|
| 1739 |
+
bank_ofx_input,
|
| 1740 |
+
],
|
| 1741 |
+
outputs=[summary_output, bank_statement_state, bank_summary_state],
|
| 1742 |
+
)
|
| 1743 |
+
|
| 1744 |
+
export_csv_btn.click(
|
| 1745 |
+
fn=export_bank_summary_csv,
|
| 1746 |
+
inputs=[bank_summary_state],
|
| 1747 |
+
outputs=[export_file, export_status],
|
| 1748 |
+
)
|
| 1749 |
+
export_pdf_btn.click(
|
| 1750 |
+
fn=export_bank_summary_pdf,
|
| 1751 |
+
inputs=[bank_summary_state],
|
| 1752 |
+
outputs=[export_file, export_status],
|
| 1753 |
)
|
| 1754 |
|
| 1755 |
bank_ask_btn.click(
|
|
|
|
| 1763 |
outputs=[bank_label_display, bank_answer_output, bank_citation_output, bank_reasoning_output],
|
| 1764 |
)
|
| 1765 |
|
| 1766 |
+
def bank_analyze_api(
|
| 1767 |
+
paste_text: str,
|
| 1768 |
+
pdf_files,
|
| 1769 |
+
pdf_password: str | None,
|
| 1770 |
+
csv_files,
|
| 1771 |
+
txt_files,
|
| 1772 |
+
xlsx_files,
|
| 1773 |
+
ofx_files,
|
| 1774 |
+
question: str | None,
|
| 1775 |
+
) -> dict:
|
| 1776 |
+
summary_md, combined_text, summary_json = analyse_bank_statement(
|
| 1777 |
+
paste_text,
|
| 1778 |
+
pdf_files,
|
| 1779 |
+
pdf_password,
|
| 1780 |
+
csv_files,
|
| 1781 |
+
txt_files,
|
| 1782 |
+
xlsx_files,
|
| 1783 |
+
ofx_files,
|
| 1784 |
+
)
|
| 1785 |
+
|
| 1786 |
+
qa: dict | None = None
|
| 1787 |
+
if (question or "").strip():
|
| 1788 |
+
label_html, answer, citation, reasoning = bank_qa(combined_text, (question or "").strip())
|
| 1789 |
+
qa = {
|
| 1790 |
+
"label_html": label_html,
|
| 1791 |
+
"answer": answer,
|
| 1792 |
+
"citation": citation,
|
| 1793 |
+
"reasoning": reasoning,
|
| 1794 |
+
}
|
| 1795 |
+
|
| 1796 |
+
return {
|
| 1797 |
+
"summary_markdown": summary_md,
|
| 1798 |
+
"combined_text": combined_text,
|
| 1799 |
+
"summary_json": summary_json,
|
| 1800 |
+
"qa": qa,
|
| 1801 |
+
}
|
| 1802 |
+
|
| 1803 |
+
bank_api_btn.click(
|
| 1804 |
+
fn=bank_analyze_api,
|
| 1805 |
+
inputs=[
|
| 1806 |
+
bank_paste_input,
|
| 1807 |
+
bank_pdf_input,
|
| 1808 |
+
bank_pdf_password_input,
|
| 1809 |
+
bank_csv_input,
|
| 1810 |
+
bank_txt_input,
|
| 1811 |
+
bank_xlsx_input,
|
| 1812 |
+
bank_ofx_input,
|
| 1813 |
+
bank_api_question,
|
| 1814 |
+
],
|
| 1815 |
+
outputs=[bank_api_output],
|
| 1816 |
+
api_name="bank_analyze",
|
| 1817 |
+
)
|
| 1818 |
+
|
| 1819 |
|
| 1820 |
if __name__ == "__main__":
|
| 1821 |
demo.launch(show_error=True, theme=gr.themes.Base(), css=CHEX_CSS, ssr_mode=False)
|
requirements.txt
CHANGED
|
@@ -1,10 +1,5 @@
|
|
| 1 |
gradio>=6.0.0
|
| 2 |
-
|
| 3 |
-
peft>=0.12.0
|
| 4 |
-
accelerate>=0.33.0
|
| 5 |
-
bitsandbytes>=0.43.0
|
| 6 |
-
torch>=2.3.0
|
| 7 |
pydantic>=2.0.0
|
| 8 |
pandas>=2.0.0
|
| 9 |
pdfplumber>=0.10.0
|
| 10 |
-
huggingface_hub>=0.24.0
|
|
|
|
| 1 |
gradio>=6.0.0
|
| 2 |
+
huggingface_hub>=0.24.0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
pydantic>=2.0.0
|
| 4 |
pandas>=2.0.0
|
| 5 |
pdfplumber>=0.10.0
|
|
|
nda.txt → sample_contracts/nda.txt
RENAMED
|
File without changes
|
service_agreement.txt → sample_contracts/service_agreement.txt
RENAMED
|
File without changes
|
software_license.txt → sample_contracts/software_license.txt
RENAMED
|
File without changes
|
sample_statements/sample_statement.txt
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
GREENFIELD BANK — PERSONAL CURRENT ACCOUNT
|
| 2 |
+
Account Holder: Jane A. Smith
|
| 3 |
+
Account Number: 12-34-56 78901234
|
| 4 |
+
Statement Period: 01 April 2025 – 30 April 2025
|
| 5 |
+
|
| 6 |
+
Opening Balance: £1,200.00
|
| 7 |
+
|
| 8 |
+
Date Description Credits Debits Balance
|
| 9 |
+
---------------------------------------------------------------------------
|
| 10 |
+
03 Apr 25 BACS SALARY - ACME SOLUTIONS LTD £2,500.00 £3,700.00
|
| 11 |
+
05 Apr 25 DIRECT DEBIT - NETFLIX.COM £9.99 £3,690.01
|
| 12 |
+
07 Apr 25 CARD PMT - TESCO SUPERSTORE £87.50 £3,602.51
|
| 13 |
+
10 Apr 25 DIRECT DEBIT - VIRGIN GYM £35.00 £3,567.51
|
| 14 |
+
12 Apr 25 CARD PMT - AMAZON UK £142.30 £3,425.21
|
| 15 |
+
15 Apr 25 ATM CASH WITHDRAWAL - HIGH ST £200.00 £3,225.21
|
| 16 |
+
18 Apr 25 CARD PMT - COSTA COFFEE £4.75 £3,220.46
|
| 17 |
+
20 Apr 25 STANDING ORDER - BARCLAYS MORTGAGE £850.00 £2,370.46
|
| 18 |
+
22 Apr 25 DIRECT DEBIT - SPOTIFY £9.99 £2,360.47
|
| 19 |
+
24 Apr 25 CARD PMT - SAINSBURY'S SUPERSTORE £63.20 £2,297.27
|
| 20 |
+
25 Apr 25 BANK TRANSFER IN - J. SMITH £300.00 £2,597.27
|
| 21 |
+
28 Apr 25 CARD PMT - ZARA CLOTHING £55.00 £2,542.27
|
| 22 |
+
29 Apr 25 DIRECT DEBIT - SKY TV £42.00 £2,500.27
|
| 23 |
+
30 Apr 25 ATM CASH WITHDRAWAL - AIRPORT £500.00 £2,000.27
|
| 24 |
+
---------------------------------------------------------------------------
|
| 25 |
+
Closing Balance: £2,000.27
|
| 26 |
+
|
| 27 |
+
Total Credits: £2,800.00
|
| 28 |
+
Total Debits: £1,999.73
|
| 29 |
+
|
| 30 |
+
Suggested questions to try:
|
| 31 |
+
- What was the total salary received this month?
|
| 32 |
+
- Are there any recurring subscription payments?
|
| 33 |
+
- Was there a mortgage payment this month?
|
| 34 |
+
- What was the largest single transaction?
|
| 35 |
+
- Is there a phone bill in this statement?
|