github-actions[bot] committed on
Commit · 79f9b3a
Sync from GitHub 38cd8d69dc858672e22cd1448f7768fef87468b1
- .dockerignore +12 -0
- .gitattributes +10 -0
- .github/workflows/sync_to_hf_space.yml +90 -0
- .gitignore +34 -0
- Dockerfile +35 -0
- LICENSE +21 -0
- README.md +158 -0
- app.py +754 -0
- data/benchmark_cases.csv +79 -0
- data/harmonized-system/harmonized-system.csv +0 -0
- data/hs_codes_reference.json +3 -0
- data/hts/us_hts_lookup.json +3 -0
- data/sample_documents/customs_zh.png +0 -0
- data/sample_documents/invoice_en.png +0 -0
- data/sample_documents/invoice_vi.png +0 -0
- data/sample_documents/packing_list_th.png +0 -0
- dataset/ATTRIBUTION.md +19 -0
- dataset/README.md +66 -0
- field_extractor.py +358 -0
- hs_dataset.py +341 -0
- models/.gitkeep +0 -0
- requirements-dev.txt +2 -0
- requirements.txt +15 -0
- static/.gitkeep +0 -0
- templates/index.html +0 -0
.dockerignore
ADDED
@@ -0,0 +1,12 @@
.git
.github
venv
__pycache__
*.pyc
.DS_Store
uploads
scripts
README.md
LICENSE
data/training_data.json
data/sample_documents

.gitattributes
ADDED
@@ -0,0 +1,10 @@
data/training_data.json filter=lfs diff=lfs merge=lfs -text
models/umap_data.json filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
data/training_data.csv filter=lfs diff=lfs merge=lfs -text
data/training_data_indexed.csv filter=lfs diff=lfs merge=lfs -text
data/cargo_descriptions.csv filter=lfs diff=lfs merge=lfs -text
data/hts/us_hts_lookup.json filter=lfs diff=lfs merge=lfs -text
data/hts/*.csv filter=lfs diff=lfs merge=lfs -text
data/*.json filter=lfs diff=lfs merge=lfs -text

.github/workflows/sync_to_hf_space.yml
ADDED
@@ -0,0 +1,90 @@
name: Sync GitHub to Hugging Face Space

on:
  push:
    branches:
      - main
  workflow_dispatch:

jobs:
  sync-to-hf:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          lfs: true

      - name: Pull LFS files
        run: git lfs pull

      - name: Push to Hugging Face Space
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          if [ -z "${HF_TOKEN}" ]; then
            echo "HF_TOKEN secret is not set."
            exit 1
          fi

          DEPLOY_DIR="/tmp/hf-deploy"
          rm -rf "${DEPLOY_DIR}"
          mkdir -p "${DEPLOY_DIR}"

          # Export the current tree (with real LFS file contents, not pointers)
          tar --exclude=.git -cf - . | (cd "${DEPLOY_DIR}" && tar -xf -)
          cd "${DEPLOY_DIR}"

          # Keep GitHub README clean; inject Space front matter only for HF deploy.
          if [ -f README.md ]; then
            awk '
              NR == 1 && $0 == "---" {in_yaml=1; next}
              in_yaml && $0 == "---" {in_yaml=0; next}
              !in_yaml {print}
            ' README.md > README.clean.md

            printf '%s\n' \
              '---' \
              'title: HS Code Classifier Micro' \
              'emoji: ⚡' \
              'colorFrom: pink' \
              'colorTo: blue' \
              'sdk: docker' \
              'app_port: 7860' \
              '---' \
              > README.frontmatter.md

            cat README.frontmatter.md README.clean.md > README.md
            rm -f README.frontmatter.md README.clean.md
          fi

          # HF rejects files >10MB without Git LFS.
          git lfs install
          git init
          git checkout -b main
          git config user.name "github-actions[bot]"
          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"

          # Track large files with LFS for HF (only files still bundled)
          git lfs track "data/*.json" "models/umap_data.json"
          git add .gitattributes

          # Remove files not needed at runtime to stay under HF Space 1GB limit.
          # Large artifacts (sentence model, embeddings, classifier, training data)
          # are hosted on HF Hub at $SENTENCE_MODEL_NAME and downloaded at startup.
          rm -rf scripts/
          rm -rf models/sentence_model/
          rm -f models/embeddings.npy models/knn_classifier.pkl models/label_encoder.pkl models/metadata.json models/umap_data.json
          touch models/.gitkeep
          rm -f data/training_data.csv data/training_data_indexed.csv
          rm -f data/hts/hts_*.csv
          rm -f data/cargo_descriptions.csv
          rm -f data/training_data.json
          rm -f data/hf_real_data.csv

          git add -A
          git commit -m "Sync from GitHub ${GITHUB_SHA}"

          git remote add hf "https://oauth2:${HF_TOKEN}@huggingface.co/spaces/Mead0w1ark/MicroHS"
          git push --force hf main

.gitignore
ADDED
@@ -0,0 +1,34 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.egg-info/
dist/
build/
*.egg
venv/
.venv/
env/

# Sentence model weights (downloaded from HF Hub at startup)
models/sentence_model/model.safetensors
models/sentence_model/tokenizer.json

# IDE
.vscode/
.idea/
*.swp
*.swo

# OS
.DS_Store
Thumbs.db

# Jupyter
.ipynb_checkpoints/

# Local publish staging
.hf_dataset_release/

# Benchmark output
benchmark_results.json

Dockerfile
ADDED
@@ -0,0 +1,35 @@
# syntax=docker/dockerfile:1
FROM python:3.11-slim

WORKDIR /app
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    SENTENCE_MODEL_NAME=intfloat/multilingual-e5-small \
    HF_ARTIFACT_REPO=Mead0w1ark/multilingual-e5-small-hs-codes

# System deps for OCR endpoints:
# - tesseract for image OCR
# - poppler-utils for pdf2image PDF conversion
RUN apt-get update && apt-get install -y --no-install-recommends \
    tesseract-ocr \
    poppler-utils \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements.txt and install dependencies
COPY requirements.txt .
RUN pip install --upgrade pip && pip install -r requirements.txt

# Copy only runtime files (keeps build context and cache churn smaller)
COPY app.py field_extractor.py hs_dataset.py ./
COPY templates ./templates
COPY static ./static
COPY data ./data
COPY models ./models
RUN mkdir -p uploads

# Expose the port FastAPI will run on
EXPOSE 7860

# Command to run the FastAPI application
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 James Ball

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md
ADDED
@@ -0,0 +1,158 @@
---
title: HS Code Classifier Micro
emoji: ⚡
colorFrom: pink
colorTo: blue
sdk: docker
app_port: 7860
---

# HSClassify_micro 🔍

[License: MIT](https://opensource.org/licenses/MIT)
[Python 3.11+](https://www.python.org/downloads/)

**Machine learning model for multilingual HS/HTS classification** for trade finance and customs workflows, built with FastAPI + OCR.

Classifies product descriptions into [Harmonized System (HS) codes](https://en.wikipedia.org/wiki/Harmonized_System) using sentence embeddings and k-NN search, with an interactive latent space visualization.

## Live Demo

- Hugging Face Space: [https://huggingface.co/spaces/Mead0w1ark/MicroHS](https://huggingface.co/spaces/Mead0w1ark/MicroHS)

## Features

- 🌍 **Multilingual** — supports English, Thai, Vietnamese, and Chinese product descriptions
- ⚡ **Real-time classification** — top-3 HS code predictions with confidence scores
- 📊 **Latent space visualization** — interactive UMAP plot showing embedding clusters
- 🎯 **KNN-based** — simple, interpretable nearest-neighbor approach using fine-tuned `multilingual-e5-small`
- 🧾 **Official HS coverage** — training generation incorporates the [datasets/harmonized-system](https://github.com/datasets/harmonized-system) 6-digit nomenclature

## Dataset Attribution

This project includes HS nomenclature content sourced from:

- [datasets/harmonized-system](https://github.com/datasets/harmonized-system)
- Upstream references listed by that dataset:
  - WCO HS nomenclature documentation
  - UN Comtrade data extraction API

Related datasets (evaluated during development):

- [Customs-Declaration-Datasets](https://github.com/Seondong/Customs-Declaration-Datasets) — 54,000 synthetic customs declaration records derived from 24.7M real Korean customs entries. Provides structured trade metadata (HS codes, country of origin, price, weight, fraud labels) but does not include free-text product descriptions. Cited as a reference for customs data research. See: *S. Kim et al., "DATE: Dual Attentive Tree-aware Embedding for Customs Fraud Detection," KDD 2020.*

Licensing:

- Upstream HS source data: **ODC Public Domain Dedication and License (PDDL) v1.0**
- Project-added synthetic multilingual examples and labels: **MIT** (this repo)

## Quick Start

```bash
# Clone
git clone https://github.com/JamesEBall/HSClassify_micro.git
cd HSClassify_micro

# Install dependencies
python -m venv venv
source venv/bin/activate
pip install -r requirements.txt

# Generate training data & train model
python scripts/generate_training_data.py
python scripts/train_model.py

# Run the web app
uvicorn app:app --reload --port 8000
```

Open [http://localhost:8000](http://localhost:8000) to classify products.

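The running app also exposes the classifier as a JSON API. A minimal sketch of calling the `/predict` endpoint from Python (endpoint and payload fields as defined in `app.py`; the product and values below are made-up examples, and a local server on port 8000 is assumed):

```python
# Sketch only: query the local /predict endpoint and print the top suggestions.
import requests

resp = requests.post(
    "http://localhost:8000/predict",
    json={
        "text": "frozen shrimp 500g bag",   # product description (required)
        "made_in": "Vietnam",                # optional structured context
        "currency": "USD",
        "item_price": 4.20,
    },
    timeout=30,
)
resp.raise_for_status()
for p in resp.json()["predictions"]:
    print(p["hs_code"], round(p["confidence"], 3), p["description"])
```
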
## Deployment

- The Space runs in Docker (`sdk: docker`, `app_port: 7860`).
- OCR endpoints require OS packages; `Dockerfile` installs:
  - `tesseract-ocr`
  - `poppler-utils` (for PDF conversion via `pdf2image`)
- Model and data loading is resilient in hosted environments:
  - Large artifacts (model weights, embeddings, classifier, training data) are hosted on [HF Hub](https://huggingface.co/Mead0w1ark/multilingual-e5-small-hs-codes) and downloaded automatically at startup if not present locally
  - Set `HF_ARTIFACT_REPO` to override the artifact repo (default: `Mead0w1ark/multilingual-e5-small-hs-codes`) and `SENTENCE_MODEL_NAME` to override the fallback sentence model (default: `intfloat/multilingual-e5-small`)

### Auto Sync (GitHub -> Hugging Face Space)

This repo includes a GitHub Action at `.github/workflows/sync_to_hf_space.yml` that syncs `main` to:

- `spaces/Mead0w1ark/MicroHS`

Required GitHub secret:

- `HF_TOKEN`: Hugging Face token with write access to the Space

## Publish Dataset to Hugging Face Datasets

Use the included publish helper:

```bash
bash scripts/publish_dataset_to_hf.sh <namespace>/<dataset-repo>
# Example:
bash scripts/publish_dataset_to_hf.sh Troglobyte/hsclassify-micro-dataset
```

The script creates/updates a Dataset repo and uploads:

- `training_data_indexed.csv`
- `harmonized-system.csv` (attributed source snapshot)
- `hs_codes_reference.json`
- Dataset card + attribution notes

## Model

The classifier uses [`multilingual-e5-small`](https://huggingface.co/intfloat/multilingual-e5-small) fine-tuned with contrastive learning (MultipleNegativesRankingLoss) on 9,829 curated HS-coded training pairs. Fine-tuned weights are hosted on HF Hub at [`Mead0w1ark/multilingual-e5-small-hs-codes`](https://huggingface.co/Mead0w1ark/multilingual-e5-small-hs-codes).

| Metric | Before Fine-Tuning | After Fine-Tuning |
|---|---|---|
| Training accuracy (80/20 split) | 77.2% | **87.0%** |
| Benchmark Top-1 (in-label-space) | 88.6% | **92.9%** |
| Benchmark Top-3 (in-label-space) | — | **97.1%** |

To fine-tune from scratch:

```bash
python scripts/train_model.py --finetune
```

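The project's actual fine-tuning lives in `scripts/train_model.py` (not bundled in this Space sync). As a rough illustration of the contrastive setup named above, a MultipleNegativesRankingLoss run with `sentence-transformers` looks roughly like this; the pair construction, example texts, and hyperparameters are assumptions, not the repo's real settings:

```python
# Hedged sketch of contrastive fine-tuning with MultipleNegativesRankingLoss.
# Pairing scheme and hyperparameters are illustrative, not the project's own.
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("intfloat/multilingual-e5-small")

# Each positive pair: a product description and the text of its HS code.
train_examples = [
    InputExample(texts=["query: frozen shrimp 500g bag",
                        "passage: Frozen shrimps and prawns (HS 030617)"]),
    InputExample(texts=["query: white rice 25kg bag",
                        "passage: Semi-milled or wholly milled rice (HS 100630)"]),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
train_loss = losses.MultipleNegativesRankingLoss(model)

# In-batch negatives: every other pair in the batch acts as a negative.
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=10)
model.save("models/sentence_model")
```
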
## How It Works

1. **Embedding**: Product descriptions are encoded using fine-tuned `multilingual-e5-small` (384-dim sentence embeddings)
2. **Classification**: K-nearest neighbors (k=5) over pre-computed embeddings of HS-coded training examples
3. **Visualization**: UMAP reduction to 2D for interactive cluster exploration via Plotly

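A condensed sketch of steps 1-2, mirroring the runtime logic in `app.py` (artifact paths assume the files produced by `scripts/train_model.py`; the query string is just an example):

```python
# Sketch: encode a query with the e5 "query:" prefix and rank HS codes by k-NN.
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("intfloat/multilingual-e5-small")          # or the fine-tuned weights
classifier = pickle.load(open("models/knn_classifier.pkl", "rb"))      # cosine k-NN, k=5
label_encoder = pickle.load(open("models/label_encoder.pkl", "rb"))

query_emb = model.encode(["query: lithium ion battery pack 48V"],
                         normalize_embeddings=True, convert_to_numpy=True)
probs = classifier.predict_proba(query_emb)[0]
for i in np.argsort(probs)[-3:][::-1]:                                  # top-3 codes
    print(str(label_encoder.classes_[i]).zfill(6), round(float(probs[i]), 3))
```
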
## Project Structure

```
├── app.py                          # FastAPI web application
├── dataset/
│   ├── README.md                   # HF dataset card (attribution + schema)
│   └── ATTRIBUTION.md              # Source and license attribution details
├── requirements.txt                # Python dependencies
├── scripts/
│   ├── generate_training_data.py   # Synthetic training data generator
│   ├── train_model.py              # Model training (embeddings + KNN)
│   └── publish_dataset_to_hf.sh    # Publish dataset artifacts to HF Datasets
├── data/
│   ├── hs_codes_reference.json     # HS code definitions
│   ├── harmonized-system/harmonized-system.csv  # Upstream HS source snapshot
│   ├── training_data.csv           # Generated training examples
│   └── training_data_indexed.csv   # App/latent-ready training examples
├── models/                         # Trained artifacts (generated)
│   ├── sentence_model/             # Cached sentence transformer
│   ├── embeddings.npy              # Pre-computed embeddings
│   ├── knn_classifier.pkl          # Trained KNN model
│   └── label_encoder.pkl           # Label encoder
└── templates/
    └── index.html                  # Web UI
```

## Context

Built as a rapid POC exploring whether multilingual sentence embeddings can simplify HS code classification for customs authorities.

## License

MIT — see [LICENSE](LICENSE)

app.py
ADDED
@@ -0,0 +1,754 @@
"""
HS Code Classifier Web App

FastAPI backend with:
- Real-time HS code prediction from text input
- Document upload with OCR (Tesseract) support
- Structured field extraction from trade documents
- HS (6-digit) and HTS (7-10 digit) code support
- Top-5 suggestions with confidence scores
- Latent space visualization with UMAP
- Multilingual support (EN, TH, VI, ZH)
"""

import json
import os
import re
import shutil
import tempfile
import threading
import time
import pickle
import uuid
from pathlib import Path

import numpy as np
import pandas as pd
from fastapi import FastAPI, Request, UploadFile, File, Form
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

from field_extractor import extract_fields, get_all_countries, get_all_currencies
from hs_dataset import get_dataset, get_hts_extensions, get_available_hts_countries

# Paths
PROJECT_DIR = Path(__file__).parent
MODEL_DIR = PROJECT_DIR / "models"
DATA_DIR = PROJECT_DIR / "data"
UPLOAD_DIR = PROJECT_DIR / "uploads"
UPLOAD_DIR.mkdir(exist_ok=True)

# Upload config
MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
ALLOWED_EXTENSIONS = {".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".pdf"}

# Initialize FastAPI
from starlette.middleware.gzip import GZipMiddleware

app = FastAPI(title="HS Code Classifier", version="2.0.0")
app.add_middleware(GZipMiddleware, minimum_size=1000)
app.mount("/static", StaticFiles(directory=str(PROJECT_DIR / "static")), name="static")
templates = Jinja2Templates(directory=str(PROJECT_DIR / "templates"))

# Global model state
model = None
classifier = None
label_encoder = None
hs_reference = None
training_data = None
embeddings = None
umap_data = None
umap_ready = False
hs_dataset = None
classifier_training_indices = None


def _download_hf_artifacts():
    """Download large artifacts from HF Hub if not present locally."""
    from huggingface_hub import hf_hub_download
    repo_id = os.getenv("HF_ARTIFACT_REPO", "Mead0w1ark/multilingual-e5-small-hs-codes")

    file_map = {
        MODEL_DIR / "embeddings.npy": "embeddings.npy",
        MODEL_DIR / "knn_classifier.pkl": "knn_classifier.pkl",
        MODEL_DIR / "label_encoder.pkl": "label_encoder.pkl",
        MODEL_DIR / "metadata.json": "metadata.json",
        MODEL_DIR / "umap_data.json": "umap_data.json",
        DATA_DIR / "training_data.csv": "training_data.csv",
    }
    for local_path, repo_filename in file_map.items():
        if not local_path.exists():
            print(f"Downloading {repo_filename} from {repo_id}...")
            try:
                downloaded = hf_hub_download(
                    repo_id=repo_id, filename=repo_filename,
                )
                local_path.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(downloaded, local_path)
                print(f"  -> {local_path}")
            except Exception as e:
                print(f"  Warning: could not download {repo_filename}: {e}")


def load_models():
    """Load all model artifacts on startup."""
    global model, classifier, label_encoder, hs_reference, training_data, embeddings, umap_data, hs_dataset, classifier_training_indices

    print("Loading models...")
    start = time.time()

    # Download large artifacts from HF Hub if missing locally.
    _download_hf_artifacts()

    # Load sentence transformer:
    # prefer local bundled model, fall back to Hub model when large files are not in repo.
    local_model_dir = MODEL_DIR / "sentence_model"
    has_local_weights = (
        (local_model_dir / "model.safetensors").exists()
        or (local_model_dir / "pytorch_model.bin").exists()
    )
    has_local_tokenizer = (local_model_dir / "tokenizer.json").exists()

    if local_model_dir.exists() and has_local_weights and has_local_tokenizer:
        model = SentenceTransformer(str(local_model_dir))
        print("Loaded local sentence model from models/sentence_model")
    else:
        fallback_model = os.getenv(
            "SENTENCE_MODEL_NAME",
            "intfloat/multilingual-e5-small",
        )
        model = SentenceTransformer(fallback_model)
        print(f"Loaded sentence model from Hugging Face Hub: {fallback_model}")

    # Load HS code reference
    with open(DATA_DIR / "hs_codes_reference.json") as f:
        hs_reference = json.load(f)

    # Load training data
    training_data_path = DATA_DIR / "training_data_indexed.csv"
    if not training_data_path.exists():
        training_data_path = DATA_DIR / "training_data.csv"
    training_data = pd.read_csv(training_data_path)
    training_data["hs_code"] = training_data["hs_code"].astype(str).str.zfill(6)

    classifier_path = MODEL_DIR / "knn_classifier.pkl"
    label_encoder_path = MODEL_DIR / "label_encoder.pkl"
    embeddings_path = MODEL_DIR / "embeddings.npy"
    embeddings_part_paths = sorted(MODEL_DIR.glob("embeddings_part*.npy"))
    core_codes = {str(k).zfill(6) for k in hs_reference.keys()}
    artifacts_exist = (
        classifier_path.exists()
        and label_encoder_path.exists()
        and (embeddings_path.exists() or len(embeddings_part_paths) > 0)
    )

    def load_cached_embeddings():
        if embeddings_path.exists():
            return np.load(embeddings_path)
        part_paths = sorted(MODEL_DIR.glob("embeddings_part*.npy"))
        if part_paths:
            parts = [np.load(p) for p in part_paths]
            return np.concatenate(parts, axis=0)
        return None

    def compute_full_embeddings():
        texts = training_data["text"].fillna("").astype(str).tolist()
        if not texts:
            raise RuntimeError("No training rows available to rebuild classifier.")
        return model.encode(
            [f"passage: {text}" for text in texts],
            normalize_embeddings=True,
            convert_to_numpy=True,
        )

    def rebuild_classifier_on_curated_codes():
        global classifier, label_encoder, classifier_training_indices
        classifier_df = training_data[training_data["hs_code"].isin(core_codes)].copy()
        if classifier_df.empty:
            classifier_df = training_data

        clf_indices = classifier_df.index.to_numpy()
        clf_embeddings = embeddings[clf_indices]
        hs_labels = classifier_df["hs_code"].tolist()
        label_encoder = LabelEncoder()
        y = label_encoder.fit_transform(hs_labels)

        classifier = KNeighborsClassifier(
            n_neighbors=min(5, len(classifier_df)),
            metric="cosine",
            weights="distance",
        )
        classifier.fit(clf_embeddings, y)
        classifier_training_indices = clf_indices
        print(
            f"Rebuilt classifier on {len(classifier_df)} rows "
            f"across {len(set(hs_labels))} curated HS codes"
        )

        try:
            np.save(embeddings_path, embeddings)
            with open(classifier_path, "wb") as f:
                pickle.dump(classifier, f)
            with open(label_encoder_path, "wb") as f:
                pickle.dump(label_encoder, f)
            print("Saved rebuilt classifier artifacts to models/")
        except Exception as e:
            print(f"Warning: could not cache rebuilt artifacts: {e}")

    if artifacts_exist:
        with open(classifier_path, "rb") as f:
            classifier = pickle.load(f)
        with open(label_encoder_path, "rb") as f:
            label_encoder = pickle.load(f)
        embeddings = load_cached_embeddings()
        print("Loaded classifier artifacts from models/")

        if embeddings is None or len(embeddings) != len(training_data):
            print(
                f"Embeddings size mismatch (embeddings={len(embeddings) if embeddings is not None else 0}, "
                f"data={len(training_data)}). "
                "Recomputing embeddings..."
            )
            embeddings = compute_full_embeddings()

        artifact_codes = {str(c).zfill(6) for c in getattr(label_encoder, "classes_", [])}
        invalid_artifacts = (
            not artifact_codes
            or not artifact_codes.issubset(core_codes)
            or len(artifact_codes) > len(core_codes)
        )
        if invalid_artifacts:
            print("Classifier artifacts not aligned with curated HS set; rebuilding classifier...")
            rebuild_classifier_on_curated_codes()
        else:
            # Map KNN fit row indices back to full training_data row indices for latent neighbors.
            classifier_df = training_data[training_data["hs_code"].isin(artifact_codes)].copy()
            classifier_training_indices = classifier_df.index.to_numpy()
            n_fit = int(getattr(classifier, "n_samples_fit_", 0))
            if n_fit <= 0:
                fit_x = getattr(classifier, "_fit_X", None)
                n_fit = int(fit_x.shape[0]) if fit_x is not None else 0

            if n_fit > 0 and len(classifier_training_indices) == n_fit:
                print(f"Mapped classifier indices to {len(classifier_training_indices)} training rows")
            else:
                print(
                    "Classifier index mapping mismatch "
                    f"(mapped={len(classifier_training_indices)}, fit={n_fit}); rebuilding classifier..."
                )
                rebuild_classifier_on_curated_codes()
    else:
        print("Classifier artifacts missing; rebuilding from training data...")
        embeddings = compute_full_embeddings()
        rebuild_classifier_on_curated_codes()

    # Load HS dataset (official harmonized-system data)
    hs_dataset = get_dataset()

    # UMAP data is loaded/computed in a background thread so the server
    # can start immediately and pass the HF Space health check.
    umap_data = []

    elapsed = time.time() - start
    print(f"All models loaded in {elapsed:.1f}s")


def _compute_umap_background():
    """Load UMAP data from cache or compute in background.

    Sets the global ``umap_data`` list and ``umap_ready`` flag when done.
    """
    global umap_data, umap_ready

    cache_path = MODEL_DIR / "umap_data.json"
    if cache_path.exists():
        try:
            with open(cache_path, encoding="utf-8") as f:
                cached = json.load(f)
            has_category_fields = (
                isinstance(cached, list)
                and len(cached) > 0
                and "chapter_name" in cached[0]
            )
            if isinstance(cached, list) and len(cached) == len(training_data) and has_category_fields:
                umap_data = cached
                umap_ready = True
                print(f"Loaded cached UMAP data: {len(umap_data)} points")
                return
            else:
                print(
                    f"Cached UMAP size mismatch (cache={len(cached)}, data={len(training_data)}). "
                    "Recomputing UMAP projection..."
                )
        except Exception as e:
            print(f"Warning: could not read UMAP cache: {e}")

    print("Computing UMAP projection (background)...")
    try:
        import umap
        reducer = umap.UMAP(
            n_neighbors=30,
            min_dist=0.0,
            n_components=2,
            metric='cosine',
            random_state=42,
        )
        umap_coords = reducer.fit_transform(embeddings)

        points = []
        for i, row in training_data.iterrows():
            hs_code = str(row["hs_code"]).zfill(6)
            chapter = row["hs_chapter"]
            chapter_name = str(row.get("hs_chapter_name", "")).strip()
            if not chapter_name or re.match(r"^HS\s\d{2}$", chapter_name):
                chapter_name = str(chapter).split(";")[0].strip()
            desc = hs_reference.get(hs_code, {}).get("desc", "Unknown")
            points.append({
                "x": float(umap_coords[i, 0]),
                "y": float(umap_coords[i, 1]),
                "text": row["text"][:80],
                "hs_code": hs_code,
                "chapter": chapter,
                "chapter_name": chapter_name,
                "hs_desc": desc,
                "language": row["language"],
            })

        with open(cache_path, "w", encoding="utf-8") as f:
            json.dump(points, f, ensure_ascii=False)

        umap_data = points
        umap_ready = True
        print(f"UMAP projection computed for {len(umap_data)} points")
    except Exception as e:
        print(f"UMAP computation failed: {e}")
        umap_ready = True  # mark ready so endpoints stop saying "computing"


@app.on_event("startup")
async def startup():
    load_models()
    threading.Thread(target=_compute_umap_background, daemon=True).start()


@app.get("/", response_class=HTMLResponse)
async def index(request: Request):
    """Main page."""
    metadata = {}
    try:
        with open(MODEL_DIR / "metadata.json") as f:
            metadata = json.load(f)
    except:
        pass

    countries = get_all_countries()
    currencies = get_all_currencies()
    hts_countries = get_available_hts_countries()

    return templates.TemplateResponse("index.html", {
        "request": request,
        "metadata": metadata,
        "countries": countries,
        "currencies": currencies,
        "hts_countries": hts_countries,
    })


@app.post("/predict")
async def predict(request: Request):
    """Predict HS code for a product description with optional structured context."""
    body = await request.json()
    query_text = body.get("text", "").strip()
    made_in = body.get("made_in", "")
    ship_to = body.get("ship_to", "")
    item_price = body.get("item_price", None)
    currency = body.get("currency", "")

    if not query_text:
        return JSONResponse({"error": "No text provided"}, status_code=400)

    start = time.time()

    # Build enriched query using structured fields
    enriched_query = query_text
    context_parts = []
    if made_in:
        context_parts.append(f"origin: {made_in}")
    if ship_to:
        context_parts.append(f"destination: {ship_to}")
    if item_price and currency:
        context_parts.append(f"value: {currency} {item_price}")

    if context_parts:
        enriched_query = f"{query_text} ({', '.join(context_parts)})"

    # Encode query with e5 prefix
    query_emb = model.encode(
        [f"query: {enriched_query}"],
        normalize_embeddings=True,
        convert_to_numpy=True
    )

    # Get predictions with probabilities
    probs = classifier.predict_proba(query_emb)[0]
    top_k = 5
    top_indices = np.argsort(probs)[-top_k:][::-1]

    predictions = []
    for idx in top_indices:
        hs_code = label_encoder.classes_[idx]
        hs_code_padded = str(hs_code).zfill(6)
        confidence = float(probs[idx])
        if confidence < 0.01:
            continue

        info = hs_reference.get(hs_code_padded, {})
        chapter_code = hs_code_padded[:2]
        heading_code = hs_code_padded[:4]

        # Get official description from HS dataset if available
        official = hs_dataset.lookup(hs_code_padded) if hs_dataset else None
        official_desc = official['description'] if official else None

        # Validate against official dataset
        validation = hs_dataset.validate_hs_code(hs_code_padded) if hs_dataset else None

        predictions.append({
            "hs_code": hs_code_padded,
            "confidence": confidence,
            "description": info.get("desc", official_desc or "No description available"),
            "official_description": official_desc,
            "chapter": info.get("chapter", "Unknown"),
            "chapter_code": chapter_code,
            "heading_code": heading_code,
            "validated": validation['valid'] if validation else None,
        })

    # Find nearest training examples
    sims = embeddings @ query_emb.T
    top_sim_idx = np.argsort(sims.flatten())[-3:][::-1]
    similar_examples = []
    for idx in top_sim_idx:
        if idx < len(training_data):
            similar_examples.append({
                "text": training_data.iloc[idx]["text"],
                "hs_code": str(training_data.iloc[idx]["hs_code"]).zfill(6),
                "similarity": float(sims[idx][0]),
            })

    elapsed = time.time() - start

    return JSONResponse({
        "query": query_text,
        "enriched_query": enriched_query,
        "predictions": predictions,
        "similar_examples": similar_examples,
        "inference_time_ms": round(elapsed * 1000, 1),
    })


@app.post("/upload-document")
async def upload_document(file: UploadFile = File(...)):
    """Upload a document (image/PDF) and extract text via OCR + structured fields."""
    # Validate file
    if not file.filename:
        return JSONResponse({"error": "No file provided"}, status_code=400)

    ext = Path(file.filename).suffix.lower()
    if ext not in ALLOWED_EXTENSIONS:
        return JSONResponse(
            {"error": f"Unsupported file type: {ext}. Allowed: {', '.join(ALLOWED_EXTENSIONS)}"},
            status_code=400
        )

    # Read file content
    content = await file.read()
    if len(content) > MAX_FILE_SIZE:
        return JSONResponse(
            {"error": f"File too large. Maximum: {MAX_FILE_SIZE // (1024*1024)}MB"},
            status_code=400
        )

    # Save to temp file
    file_id = str(uuid.uuid4())[:8]
    temp_path = UPLOAD_DIR / f"{file_id}{ext}"
    with open(temp_path, "wb") as f:
        f.write(content)

    try:
        import pytesseract
        from PIL import Image

        ocr_text = ""

        if ext == ".pdf":
            # Convert PDF to images, then OCR
            try:
                from pdf2image import convert_from_path
                images = convert_from_path(str(temp_path), dpi=300)
                texts = []
                for img in images:
                    texts.append(pytesseract.image_to_string(img))
                ocr_text = "\n\n".join(texts)
            except ImportError:
                return JSONResponse(
                    {"error": "PDF support requires pdf2image and poppler. Install with: pip install pdf2image"},
                    status_code=500
                )
            except Exception as e:
                return JSONResponse(
                    {"error": f"PDF processing error: {str(e)}"},
                    status_code=500
                )
        else:
            # Image OCR
            img = Image.open(temp_path)
            ocr_text = pytesseract.image_to_string(img)

        if not ocr_text.strip():
            return JSONResponse({
                "error": "OCR could not extract any text from this document. Please try a clearer image.",
                "raw_text": "",
                "fields": {},
            })

        # Extract structured fields
        fields = extract_fields(ocr_text)

        return JSONResponse({
            "success": True,
            "file_id": file_id,
            "filename": file.filename,
            "raw_text": ocr_text.strip(),
            "fields": fields,
        })

    except Exception as e:
        return JSONResponse(
            {"error": f"OCR processing failed: {str(e)}"},
            status_code=500
        )
    finally:
        # Clean up temp file
        if temp_path.exists():
            temp_path.unlink()


@app.post("/extract-fields")
async def extract_fields_endpoint(request: Request):
    """Extract structured fields from arbitrary text (no OCR needed)."""
    body = await request.json()
    text = body.get("text", "").strip()

    if not text:
        return JSONResponse({"error": "No text provided"}, status_code=400)

    fields = extract_fields(text)
    return JSONResponse({"fields": fields})


@app.get("/hts-extensions/{hs_code}")
async def get_hts(hs_code: str, country: str = "US"):
    """Get HTS (country-specific) extensions for a 6-digit HS code."""
    result = get_hts_extensions(hs_code, country)
    return JSONResponse(result)


@app.get("/hs-lookup/{hs_code}")
async def hs_lookup(hs_code: str):
    """Look up an HS code in the official dataset."""
    if not hs_dataset:
        return JSONResponse({"error": "HS dataset not loaded"}, status_code=500)

    result = hs_dataset.lookup(hs_code)
    if not result:
        # Try search instead
        search_results = hs_dataset.search(hs_code, max_results=5)
        return JSONResponse({
            "found": False,
            "message": f"Code {hs_code} not found. Did you mean one of these?",
            "suggestions": search_results,
        })

    return JSONResponse({"found": True, **result})


@app.get("/hs-search")
async def hs_search(q: str = "", limit: int = 20):
    """Search HS codes by description."""
    if not q:
        return JSONResponse({"error": "No query provided"}, status_code=400)

    results = hs_dataset.search(q, max_results=limit)
    return JSONResponse({"results": results, "query": q})


@app.get("/hs-validate/{hs_code}")
async def hs_validate(hs_code: str):
    """Validate whether an HS code exists."""
    result = hs_dataset.validate_hs_code(hs_code)
    return JSONResponse(result)


@app.get("/hts-countries")
async def hts_countries():
    """Get list of countries with HTS extensions available."""
    return JSONResponse({"countries": get_available_hts_countries()})


@app.get("/visualization-data")
async def get_visualization_data(request: Request):
    """Return UMAP projection data for visualization.

    Supports ``?max_points=N`` to subsample for faster initial load.
    The subsample is stratified by chapter so every category is represented.
    """
    max_points = int(request.query_params.get("max_points", "0"))

    points = umap_data
    if not points:
        cache_path = MODEL_DIR / "umap_data.json"
        if cache_path.exists():
            with open(cache_path, encoding="utf-8") as f:
                points = json.load(f)

    if not points:
        if not umap_ready:
            return JSONResponse({"points": [], "computing": True})
        return JSONResponse({"points": [], "error": "No UMAP data available"})

    total = len(points)
    if 0 < max_points < total:
        # Stratified subsample: keep proportional representation per chapter
        import random as _rng
        _rng.seed(42)
        by_chapter: dict[str, list] = {}
        for p in points:
            by_chapter.setdefault(p.get("chapter_name", "Other"), []).append(p)
        sampled: list = []
        for ch, ch_pts in by_chapter.items():
            n = max(1, round(len(ch_pts) / total * max_points))
            sampled.extend(_rng.sample(ch_pts, min(n, len(ch_pts))))
        _rng.shuffle(sampled)
        return JSONResponse({"points": sampled, "total": total, "sampled": True})

    return JSONResponse({"points": points, "total": total})


@app.get("/visualization-density")
async def get_visualization_density():
    """All UMAP points in compact columnar format for density/labels."""
    points = umap_data or []
    if not points:
        cache_path = MODEL_DIR / "umap_data.json"
        if cache_path.exists():
            with open(cache_path, encoding="utf-8") as f:
                points = json.load(f)
    if not points:
        if not umap_ready:
            return JSONResponse({"chapters": {}, "computing": True})
        return JSONResponse({"error": "No data"})

    by_chapter: dict[str, dict[str, list]] = {}
    for p in points:
        ch = p.get("chapter_name", "Unknown")
        if ch not in by_chapter:
            by_chapter[ch] = {"x": [], "y": []}
        by_chapter[ch]["x"].append(round(p["x"], 3))
        by_chapter[ch]["y"].append(round(p["y"], 3))

    return JSONResponse({"chapters": by_chapter})


@app.post("/embed-query")
async def embed_query(request: Request):
    """Get UMAP coordinates for a query."""
    body = await request.json()
    query_text = body.get("text", "").strip()

    if not query_text:
        return JSONResponse({"error": "No text provided"}, status_code=400)

    query_emb = model.encode(
        [f"query: {query_text}"],
        normalize_embeddings=True,
        convert_to_numpy=True
    )

    n_fit = int(getattr(classifier, "n_samples_fit_", 0))
    if n_fit <= 0:
        fit_x = getattr(classifier, "_fit_X", None)
        n_fit = int(fit_x.shape[0]) if fit_x is not None else 0
    if n_fit <= 0:
        return JSONResponse({"error": "Classifier has no fitted rows"}, status_code=500)

    n_neighbors = min(5, n_fit)
    distances, indices = classifier.kneighbors(query_emb, n_neighbors=n_neighbors)

    if umap_data and len(umap_data) > 0:
        weights = 1.0 / (distances[0] + 1e-6)
        weights = weights / weights.sum()

        mapped_indices = []
        for idx in indices[0]:
            mapped_idx = int(idx)
            if (
                classifier_training_indices is not None
                and mapped_idx < len(classifier_training_indices)
            ):
                mapped_idx = int(classifier_training_indices[mapped_idx])
            mapped_indices.append(mapped_idx)

        x = sum(
            umap_data[idx]["x"] * w
            for idx, w in zip(mapped_indices, weights)
            if 0 <= idx < len(umap_data)
        )
        y = sum(
            umap_data[idx]["y"] * w
            for idx, w in zip(mapped_indices, weights)
            if 0 <= idx < len(umap_data)
        )

        neighbors = []
        for idx, dist in zip(mapped_indices, distances[0]):
            if idx < len(umap_data):
                point = umap_data[idx]
                # cosine distance in [0, 2] for normalized vectors; lower is closer
                similarity = max(0.0, min(1.0, 1.0 - float(dist)))
                neighbors.append({
                    **point,
                    "distance": float(dist),
                    "similarity": similarity,
                })

        return JSONResponse({
            "x": float(x),
            "y": float(y),
            "neighbors": neighbors,
        })

    if not umap_ready:
        return JSONResponse({"error": "UMAP data is still computing", "computing": True})
    return JSONResponse({"error": "No UMAP data for projection"})


@app.get("/health")
async def health():
    """Health check."""
    return {
        "status": "ok",
        "model_loaded": model is not None,
        "hs_dataset_loaded": hs_dataset._loaded if hs_dataset else False,
        "hs_codes_count": len(hs_dataset.subheadings) if hs_dataset else 0,
        "umap_ready": umap_ready,
    }


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

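For reference, a minimal client call for the `/upload-document` OCR endpoint defined in `app.py` above might look like this (a sketch only; it assumes a locally running server and uses the bundled sample image):

```python
# Sketch: send a sample invoice image to /upload-document and print extracted fields.
import requests

with open("data/sample_documents/invoice_en.png", "rb") as fh:
    resp = requests.post(
        "http://localhost:8000/upload-document",
        files={"file": ("invoice_en.png", fh, "image/png")},  # field name matches UploadFile param
        timeout=60,
    )
print(resp.json().get("fields"))
```
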
data/benchmark_cases.csv
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
text,expected_hs_code,category,language,notes
fresh boneless beef,020130,easy,en,common meat product
frozen boneless bovine meat for export,020230,easy,en,frozen meat variant
frozen shrimp 500g bag,030617,easy,en,common seafood
whole milk 3.5% fat,040120,easy,en,standard dairy
cheddar cheese block,040690,easy,en,common cheese
fresh tomatoes,070200,easy,en,basic vegetable
fresh red apples,080810,easy,en,common fruit
bananas fresh,080300,easy,en,top traded fruit
raw coffee beans unroasted,090111,easy,en,major commodity
white rice 25kg bag,100630,easy,en,staple grain
palm oil refined,151190,easy,en,major edible oil
cane sugar raw,170199,easy,en,basic commodity
sweet biscuits assorted,190531,easy,en,packaged food
bottled sparkling water flavored,220210,easy,en,common beverage
beer lager 330ml bottles,220300,easy,en,common alcohol
scotch whisky 700ml,220830,easy,en,spirits
crude petroleum oil,270900,easy,en,major commodity
polyethylene pellets LDPE,390110,easy,en,common plastic resin
car tyre 205/55R16 new,401110,easy,en,auto consumable
cotton t-shirts mens,610910,easy,en,basic garment
hot rolled steel coil 600mm,720839,easy,en,industrial steel
copper cathodes 99.99% purity,740311,easy,en,refined metal
laptop computer 14 inch,847130,easy,en,common electronics
smartphone Samsung Galaxy,851712,easy,en,ubiquitous device
lithium ion battery pack 48V,850760,easy,en,EV battery
sedan car 2000cc petrol engine,870323,easy,en,standard vehicle
wooden bedroom wardrobe,940350,easy,en,common furniture
tea,090210,edge_case,en,very short query - ambiguous
car parts,870899,edge_case,en,vague automotive
medicine,300490,edge_case,en,extremely generic
chips,854231,edge_case,en,ambiguous - food or electronics
oil,270900,edge_case,en,highly ambiguous
shoes,640399,edge_case,en,generic footwear
paper,480256,edge_case,en,very generic
plastic bags for groceries,392321,edge_case,en,everyday item
Galaxy S24 Ultra,851712,edge_case,en,brand name only
Nespresso coffee capsules,210111,edge_case,en,branded coffee product
Goodyear truck tyre 315/80R22.5,401120,edge_case,en,brand + specs
Jack Daniels Tennessee whiskey 750ml,220830,edge_case,en,brand name spirits
Nintendo Switch gaming console,950490,edge_case,en,brand - games vs electronics
surgical masks disposable,901890,edge_case,en,medical supply
USB-C charging cable,854239,edge_case,en,tech accessory
frozen cod fish fillet,030389,edge_case,en,specific fish species
stainless steel bolts M10,730890,edge_case,en,metal hardware
yoga pants women polyester,620462,edge_case,en,modern clothing description
aspirin tablets 500mg retail,300490,edge_case,en,OTC pharma
insecticide spray for mosquitoes,380891,edge_case,en,household chemical
PET bottles preform,390760,edge_case,en,industrial plastic
ข้าวหอมมะลิ,100630,multilingual,th,jasmine rice
กุ้งแช่แข็ง,030617,multilingual,th,frozen shrimp
รถยนต์ไฟฟ้า,870380,multilingual,th,electric car
โทรศัพท์มือถือ,851712,multilingual,th,mobile phone
ยางรถยนต์,401110,multilingual,th,car tyre
น้ำตาลทราย,170199,multilingual,th,granulated sugar
เสื้อยืดผ้าฝ้าย,610910,multilingual,th,cotton t-shirt
gạo trắng,100630,multilingual,vi,white rice
tôm đông lạnh,030617,multilingual,vi,frozen shrimp
cà phê nhân,090111,multilingual,vi,raw coffee beans
thép cuộn cán nóng,720839,multilingual,vi,hot rolled steel coil
điện thoại thông minh,851712,multilingual,vi,smartphone
xe ô tô điện,870380,multilingual,vi,electric car
dầu thô,270900,multilingual,vi,crude oil
笔记本电脑,847130,multilingual,zh,laptop computer
冷冻虾,030617,multilingual,zh,frozen shrimp
大米,100630,multilingual,zh,rice
锂电池,850760,multilingual,zh,lithium battery
棉质T恤,610910,multilingual,zh,cotton t-shirt
原油,270900,multilingual,zh,crude oil
English breakfast tea,090240,known_failure,en,black tea - code 090240 not in label space
matcha green tea powder,090210,known_failure,en,tea variant - model often confuses with other categories
oolong tea leaves 100g,090230,known_failure,en,semi-fermented tea - code 090230 not in label space
chamomile herbal tea bags,121190,known_failure,en,herbal infusion - not tea chapter - not in label space
fresh avocado,080440,known_failure,en,avocado code 080440 not in label space
quinoa grain organic,100850,known_failure,en,quinoa code 100850 not in label space
soy sauce 500ml bottle,210390,known_failure,en,soy sauce code 210390 not in label space
hand sanitizer gel 70% alcohol,380894,known_failure,en,sanitizer code 380894 not in label space
drone with 4K camera,880211,known_failure,en,UAV code not in label space
solar panel 400W monocrystalline,854140,known_failure,en,maps to photosensitive devices - often misclassified
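The benchmark is bucketed into `easy`, `edge_case`, `multilingual`, and `known_failure` cases, so per-bucket accuracy is the natural way to read it. A minimal scoring sketch, assuming pandas (already pinned in `requirements.txt`) and a hypothetical `classify_fn` standing in for the project's actual prediction call:

```python
# Illustrative scoring sketch for data/benchmark_cases.csv.
# `classify_fn` is a hypothetical stand-in: any callable that maps a product
# description string to a 6-digit HS code string.
import pandas as pd

def evaluate(classify_fn, path: str = "data/benchmark_cases.csv") -> pd.Series:
    # Keep HS codes as strings so leading zeros (e.g. 020130) survive.
    df = pd.read_csv(path, dtype={"expected_hs_code": str})
    df["predicted"] = df["text"].apply(classify_fn)
    df["correct"] = df["predicted"] == df["expected_hs_code"]
    # Mean accuracy per bucket: easy / edge_case / multilingual / known_failure
    return df.groupby("category")["correct"].mean()
```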
data/harmonized-system/harmonized-system.csv
ADDED
The diff for this file is too large to render.
data/hs_codes_reference.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0f917717590bf6f457bbefb65b2827aff3a11d029988dae3e1d315a4dbb54134
size 10516
data/hts/us_hts_lookup.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6f3fa20f7b7a0573536ad33fb7ba795b43ecc4e5d5ef544b2d605716291a44ad
size 6988357
data/sample_documents/customs_zh.png
ADDED
data/sample_documents/invoice_en.png
ADDED
data/sample_documents/invoice_vi.png
ADDED
data/sample_documents/packing_list_th.png
ADDED
dataset/ATTRIBUTION.md
ADDED
@@ -0,0 +1,19 @@
# Attribution

This dataset includes and builds on HS nomenclature content from:

- `datasets/harmonized-system`:
  - <https://github.com/datasets/harmonized-system>
  - Upstream references: WCO HS nomenclature materials and UN Comtrade API

## Licensing

- Upstream HS source data: **ODC Public Domain Dedication and License (PDDL) v1.0**
  - <https://opendatacommons.org/licenses/pddl/1-0/>
- Project-generated synthetic text examples and label normalization:
  - **MIT License** (see this repository's `LICENSE`)

## Notes

- HS codes are international nomenclature identifiers and may vary by country-level tariff schedules/extensions.
- Use this dataset for prototyping and research; verify classifications in official customs workflows.
dataset/README.md
ADDED
@@ -0,0 +1,66 @@
---
pretty_name: HSClassify Micro Training Dataset
license: pddl
language:
- en
- th
- vi
- zh
task_categories:
- text-classification
task_ids:
- multi-class-classification
size_categories:
- 10K<n<100K
configs:
- config_name: default
  data_files:
  - split: train
    path: training_data_indexed.csv
---

# Dataset Card for HSClassify Micro Training Dataset

## Dataset Summary

This dataset supports multilingual HS code classification for customs and trade workflows.
It combines:

- HS nomenclature records (6-digit level and hierarchy context)
- Synthetic product descriptions mapped to HS codes
- Human-readable chapter/category labels for UI and latent-space analysis

## Included Files

- `training_data_indexed.csv`: training rows with text, HS code, chapter metadata, and language.
- `harmonized-system.csv`: source HS table snapshot used for data generation and indexing.
- `hs_codes_reference.json`: curated HS reference used by the app and training pipeline.
- `ATTRIBUTION.md`: explicit source and license attribution.

## Data Fields (`training_data_indexed.csv`)

- `text`: product description text used for embedding/classification.
- `hs_code`: 6-digit HS code target.
- `hs_chapter`: chapter description text.
- `hs_chapter_code`: chapter ID (e.g., `HS 08`).
- `hs_chapter_name`: normalized human-readable category label.
- `hs_desc`: HS description aligned to `hs_code`.
- `language`: language code (`en`, `th`, `vi`, `zh`).

## Source Attribution

Core HS nomenclature content is sourced from the `datasets/harmonized-system` project:

- Repository: <https://github.com/datasets/harmonized-system>
- Declared source chain in upstream metadata:
  - WCO HS nomenclature documentation
  - UN Comtrade data extraction API
- Upstream data license: ODC Public Domain Dedication and License (PDDL) v1.0

Project-added synthetic texts and normalized labels are released under this project's MIT license.

## Limitations

- Language balance is intentionally skewed toward English in the current snapshot.
- Synthetic text patterns may not cover all commercial phrasing edge cases.
- This dataset is for research/prototyping and is not legal customs advice.
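For quick inspection, a minimal sketch of reading the train split declared in the card's YAML header, assuming the `datasets` package from `requirements-dev.txt` and a local copy of `training_data_indexed.csv` (the published Hub repo id is not assumed here):

```python
# Sketch only: load the CSV split locally with the `datasets` library.
# Adjust the path to wherever training_data_indexed.csv lives in your checkout.
from datasets import load_dataset

ds = load_dataset(
    "csv",
    data_files={"train": "training_data_indexed.csv"},
)["train"]

print(ds.column_names)                  # text, hs_code, hs_chapter, hs_chapter_code, ...
print(ds[0]["text"], ds[0]["hs_code"])  # one training row
```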
field_extractor.py
ADDED
@@ -0,0 +1,358 @@
"""
Structured field extraction from OCR text of trade documents.

Extracts:
- Made in (country of origin)
- Ship to (destination country)
- Item price (numeric value)
- Currency (USD, EUR, etc.)
- Product description
- Email addresses
"""

import re
from typing import Optional

# --- Country Matching ---

COUNTRIES = {
    # Major trading nations
    "china": "CN", "peoples republic of china": "CN", "prc": "CN", "中国": "CN",
    "united states": "US", "usa": "US", "u.s.a.": "US", "united states of america": "US",
    "japan": "JP", "日本": "JP",
    "germany": "DE", "deutschland": "DE",
    "united kingdom": "GB", "uk": "GB", "great britain": "GB", "england": "GB",
    "france": "FR",
    "italy": "IT", "italia": "IT",
    "south korea": "KR", "korea": "KR", "republic of korea": "KR", "한국": "KR",
    "india": "IN",
    "canada": "CA",
    "australia": "AU",
    "brazil": "BR",
    "mexico": "MX",
    "indonesia": "ID",
    "thailand": "TH", "ไทย": "TH",
    "vietnam": "VN", "viet nam": "VN", "việt nam": "VN",
    "malaysia": "MY",
    "singapore": "SG",
    "taiwan": "TW", "chinese taipei": "TW",
    "netherlands": "NL", "holland": "NL",
    "spain": "ES", "españa": "ES",
    "turkey": "TR", "türkiye": "TR",
    "switzerland": "CH",
    "saudi arabia": "SA",
    "united arab emirates": "AE", "uae": "AE",
    "poland": "PL",
    "sweden": "SE",
    "belgium": "BE",
    "argentina": "AR",
    "austria": "AT",
    "norway": "NO",
    "ireland": "IE",
    "israel": "IL",
    "denmark": "DK",
    "philippines": "PH",
    "colombia": "CO",
    "pakistan": "PK",
    "chile": "CL",
    "finland": "FI",
    "bangladesh": "BD",
    "egypt": "EG",
    "czech republic": "CZ", "czechia": "CZ",
    "portugal": "PT",
    "romania": "RO",
    "new zealand": "NZ",
    "greece": "GR",
    "peru": "PE",
    "south africa": "ZA",
    "hungary": "HU",
    "sri lanka": "LK",
    "cambodia": "KH",
    "myanmar": "MM", "burma": "MM",
    "nigeria": "NG",
    "kenya": "KE",
    "ghana": "GH",
    "ethiopia": "ET",
    "tanzania": "TZ",
    "morocco": "MA",
    "hong kong": "HK",
}

# Reverse map: code -> name
COUNTRY_CODE_TO_NAME = {}
for name, code in COUNTRIES.items():
    if code not in COUNTRY_CODE_TO_NAME:
        COUNTRY_CODE_TO_NAME[code] = name.title()

# Fix some names
COUNTRY_CODE_TO_NAME["US"] = "United States"
COUNTRY_CODE_TO_NAME["GB"] = "United Kingdom"
COUNTRY_CODE_TO_NAME["CN"] = "China"
COUNTRY_CODE_TO_NAME["KR"] = "South Korea"
COUNTRY_CODE_TO_NAME["AE"] = "United Arab Emirates"
COUNTRY_CODE_TO_NAME["NZ"] = "New Zealand"
COUNTRY_CODE_TO_NAME["ZA"] = "South Africa"
COUNTRY_CODE_TO_NAME["CZ"] = "Czech Republic"
COUNTRY_CODE_TO_NAME["HK"] = "Hong Kong"
COUNTRY_CODE_TO_NAME["TW"] = "Taiwan"
COUNTRY_CODE_TO_NAME["SA"] = "Saudi Arabia"
COUNTRY_CODE_TO_NAME["NL"] = "Netherlands"

# All country names for dropdown
ALL_COUNTRIES = sorted(set(COUNTRY_CODE_TO_NAME.values()))

# --- Currency Matching ---

CURRENCIES = {
    "USD": "US Dollar",
    "EUR": "Euro",
    "GBP": "British Pound",
    "JPY": "Japanese Yen",
    "CNY": "Chinese Yuan",
    "RMB": "Chinese Yuan",
    "KRW": "Korean Won",
    "THB": "Thai Baht",
    "VND": "Vietnamese Dong",
    "INR": "Indian Rupee",
    "CAD": "Canadian Dollar",
    "AUD": "Australian Dollar",
    "SGD": "Singapore Dollar",
    "MYR": "Malaysian Ringgit",
    "IDR": "Indonesian Rupiah",
    "PHP": "Philippine Peso",
    "BRL": "Brazilian Real",
    "MXN": "Mexican Peso",
    "CHF": "Swiss Franc",
    "SEK": "Swedish Krona",
    "NOK": "Norwegian Krone",
    "DKK": "Danish Krone",
    "HKD": "Hong Kong Dollar",
    "TWD": "Taiwan Dollar",
    "AED": "UAE Dirham",
    "SAR": "Saudi Riyal",
    "ZAR": "South African Rand",
    "NZD": "New Zealand Dollar",
    "TRY": "Turkish Lira",
    "PLN": "Polish Zloty",
}

CURRENCY_SYMBOLS = {
    "$": "USD",
    "€": "EUR",
    "£": "GBP",
    "¥": "JPY",
    "₹": "INR",
    "฿": "THB",
    "₫": "VND",
    "₩": "KRW",
    "R$": "BRL",
}


def find_country(text: str, context_keywords: list[str]) -> Optional[str]:
    """Find a country name near context keywords in the text."""
    text_lower = text.lower()

    # Try to find country near context keywords
    for keyword in context_keywords:
        # Search for keyword in text
        pattern = re.compile(
            rf'{keyword}\s*[:\-]?\s*(.{{2,50}})',
            re.IGNORECASE
        )
        match = pattern.search(text)
        if match:
            fragment = match.group(1).strip().lower()
            # Check if any country name is in the fragment
            for country_name, code in sorted(COUNTRIES.items(), key=lambda x: -len(x[0])):
                if country_name in fragment:
                    return code
            # Also check for ISO country codes (2 letters)
            code_match = re.match(r'^([A-Z]{2})\b', match.group(1).strip())
            if code_match:
                c = code_match.group(1)
                if c in COUNTRY_CODE_TO_NAME:
                    return c

    return None


def extract_fields(ocr_text: str) -> dict:
    """
    Extract structured fields from OCR text of a trade document.

    Returns dict with:
    - email: str or None
    - made_in: country code or None
    - ship_to: country code or None
    - item_price: float or None
    - currency: currency code or None
    - product_description: str or None
    - raw_text: the original OCR text
    - confidence: dict with confidence scores for each field
    """
    result = {
        "email": None,
        "made_in": None,
        "made_in_name": None,
        "ship_to": None,
        "ship_to_name": None,
        "item_price": None,
        "currency": None,
        "product_description": None,
        "raw_text": ocr_text,
        "confidence": {},
    }

    if not ocr_text or not ocr_text.strip():
        return result

    text = ocr_text.strip()

    # --- Extract Email ---
    email_pattern = re.compile(
        r'[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}',
        re.IGNORECASE
    )
    email_match = email_pattern.search(text)
    if email_match:
        result["email"] = email_match.group(0)
        result["confidence"]["email"] = 0.95

    # --- Extract Country of Origin (Made in) ---
    origin_keywords = [
        "made in", "manufactured in", "produced in", "origin",
        "country of origin", "country of manufacture",
        "mfg country", "mfg. country", "fabricated in",
        "assembled in", "place of origin", "product of",
        "sourced from", "shipped from", "exporting country",
        "from"
    ]

    origin_code = find_country(text, origin_keywords)
    if origin_code:
        result["made_in"] = origin_code
        result["made_in_name"] = COUNTRY_CODE_TO_NAME.get(origin_code, origin_code)
        result["confidence"]["made_in"] = 0.85

    # --- Extract Destination (Ship to) ---
    dest_keywords = [
        "ship to", "shipped to", "deliver to", "delivery to",
        "destination", "consignee", "import to", "importing country",
        "port of discharge", "port of destination", "final destination",
        "to country", "dest", "buyer country",
        "bill to", "sold to"
    ]

    dest_code = find_country(text, dest_keywords)
    if dest_code:
        result["ship_to"] = dest_code
        result["ship_to_name"] = COUNTRY_CODE_TO_NAME.get(dest_code, dest_code)
        result["confidence"]["ship_to"] = 0.80

    # --- Extract Currency ---
    # First check for currency symbols
    for symbol, curr_code in sorted(CURRENCY_SYMBOLS.items(), key=lambda x: -len(x[0])):
        if symbol in text:
            result["currency"] = curr_code
            result["confidence"]["currency"] = 0.90
            break

    # Then check for explicit currency codes
    if not result["currency"]:
        for curr_code in CURRENCIES:
            pattern = re.compile(rf'\b{curr_code}\b', re.IGNORECASE)
            if pattern.search(text):
                result["currency"] = curr_code
                result["confidence"]["currency"] = 0.95
                break

    # --- Extract Price ---
    price_patterns = [
        # "price: $123.45" or "amount: 123.45 USD"
        re.compile(
            r'(?:price|amount|total|value|unit price|item price|cost|fob value|cif value|invoice value)\s*[:\-]?\s*'
            r'(?:[A-Z]{3}\s*)?'
            r'[\$€£¥₹฿₫₩]?\s*'
            r'([\d,]+\.?\d*)',
            re.IGNORECASE
        ),
        # "$123.45" or "€99.99"
        re.compile(
            r'[\$€£¥₹฿₫₩]\s*([\d,]+\.?\d*)'
        ),
        # "123.45 USD" or "99.99 EUR"
        re.compile(
            r'([\d,]+\.?\d*)\s*(?:USD|EUR|GBP|JPY|CNY|RMB|THB|VND|INR|CAD|AUD|SGD|MYR)',
            re.IGNORECASE
        ),
    ]

    for pattern in price_patterns:
        match = pattern.search(text)
        if match:
            price_str = match.group(1).replace(",", "")
            try:
                price = float(price_str)
                if 0 < price < 1e12:  # Sanity check
                    result["item_price"] = price
                    result["confidence"]["item_price"] = 0.80
                    break
            except ValueError:
                continue

    # --- Extract Product Description ---
    desc_keywords = [
        "description", "product description", "item description",
        "goods description", "description of goods",
        "commodity", "product name", "item name",
        "goods", "merchandise", "articles"
    ]

    for keyword in desc_keywords:
        pattern = re.compile(
            rf'{keyword}\s*[:\-]?\s*(.{{10,300}}?)(?:\n|$)',
            re.IGNORECASE
        )
        match = pattern.search(text)
        if match:
            desc = match.group(1).strip()
            # Clean up
            desc = re.sub(r'\s+', ' ', desc)
            if len(desc) > 10:
                result["product_description"] = desc
                result["confidence"]["product_description"] = 0.75
                break

    # If no structured description found, use the longest non-header line
    if not result["product_description"]:
        lines = [l.strip() for l in text.split('\n') if l.strip() and len(l.strip()) > 15]
        # Filter out lines that look like headers/labels
        content_lines = [
            l for l in lines
            if not re.match(r'^(invoice|bill|date|ref|no\.|number|email|phone|fax|tel|address)', l, re.IGNORECASE)
            and not re.match(r'^[A-Z\s]{2,20}:$', l)
        ]
        if content_lines:
            # Pick the longest line as likely description
            best = max(content_lines, key=len)
            result["product_description"] = best[:300]
            result["confidence"]["product_description"] = 0.40

    return result


def get_all_countries() -> list[dict]:
    """Return list of all countries for dropdowns."""
    return [
        {"code": code, "name": name}
        for code, name in sorted(COUNTRY_CODE_TO_NAME.items(), key=lambda x: x[1])
    ]


def get_all_currencies() -> list[dict]:
    """Return list of all currencies for dropdowns."""
    return [
        {"code": code, "name": name}
        for code, name in sorted(CURRENCIES.items(), key=lambda x: x[1])
    ]
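A usage sketch for `extract_fields()`; the invoice text below is invented for illustration, and which values come back depends on which regex patterns fire first.

```python
# Illustrative call into field_extractor; the OCR text is made up.
from field_extractor import extract_fields

sample_ocr = """COMMERCIAL INVOICE
Description of goods: cotton t-shirts mens, 500 cartons
Country of origin: Vietnam
Ship to: United States
Unit price: $2.45
Contact: export@example.com"""

fields = extract_fields(sample_ocr)
print(fields["made_in"], fields["ship_to"])      # expected: VN US
print(fields["currency"], fields["item_price"])  # expected: USD 2.45
print(fields["product_description"])
print(fields["confidence"])
```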
hs_dataset.py
ADDED
@@ -0,0 +1,341 @@
"""
Harmonized System dataset integration.

Loads the official HS code dataset from:
https://github.com/datasets/harmonized-system

Provides:
- Full HS code lookup (2, 4, 6 digit)
- Section/chapter/heading/subheading hierarchy
- HTS extension support (country-specific 7-10 digit codes)
- Search by description
"""

import csv
import json
import os
import re
from pathlib import Path
from typing import Optional

PROJECT_DIR = Path(__file__).parent
HS_DATA_PATH = PROJECT_DIR / "data" / "harmonized-system" / "harmonized-system.csv"
US_HTS_LOOKUP_PATH = PROJECT_DIR / "data" / "hts" / "us_hts_lookup.json"


class HSDataset:
    """Harmonized System code dataset."""

    def __init__(self):
        self.codes = {}        # hscode -> {section, description, parent, level}
        self.sections = {}     # section number -> section name
        self.chapters = {}     # 2-digit -> description
        self.headings = {}     # 4-digit -> description
        self.subheadings = {}  # 6-digit -> description
        self._loaded = False

    def load(self) -> bool:
        """Load the HS dataset from CSV."""
        if self._loaded:
            return True

        if not HS_DATA_PATH.exists():
            print(f"HS dataset not found at {HS_DATA_PATH}")
            return False

        with open(HS_DATA_PATH, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                hscode = row['hscode'].strip()
                desc = row['description'].strip()
                section = row['section'].strip()
                parent = row['parent'].strip()
                level = int(row['level'])

                self.codes[hscode] = {
                    'section': section,
                    'description': desc,
                    'parent': parent,
                    'level': level,
                }

                if level == 2:
                    self.chapters[hscode] = desc
                elif level == 4:
                    self.headings[hscode] = desc
                elif level == 6:
                    self.subheadings[hscode] = desc

        self._loaded = True
        print(f"Loaded HS dataset: {len(self.chapters)} chapters, "
              f"{len(self.headings)} headings, {len(self.subheadings)} subheadings")
        return True

    def lookup(self, hscode: str) -> Optional[dict]:
        """Look up an HS code and return full hierarchy."""
        hscode = hscode.strip().replace('.', '').replace(' ', '')

        if hscode not in self.codes:
            return None

        entry = self.codes[hscode].copy()

        # Build hierarchy
        hierarchy = []
        current = hscode
        while current and current in self.codes and current != 'TOTAL':
            hierarchy.insert(0, {
                'code': current,
                'description': self.codes[current]['description'],
                'level': self.codes[current]['level'],
            })
            current = self.codes[current]['parent']

        entry['hierarchy'] = hierarchy
        entry['hscode'] = hscode

        # Get chapter and heading descriptions
        if len(hscode) >= 2:
            ch = hscode[:2]
            entry['chapter'] = self.chapters.get(ch, '')
            entry['chapter_code'] = ch
        if len(hscode) >= 4:
            hd = hscode[:4]
            entry['heading'] = self.headings.get(hd, '')
            entry['heading_code'] = hd
        if len(hscode) == 6:
            entry['subheading'] = self.subheadings.get(hscode, '')

        return entry

    def search(self, query: str, max_results: int = 20) -> list[dict]:
        """Search HS codes by description text."""
        query_lower = query.lower()
        query_words = set(query_lower.split())

        results = []
        for hscode, info in self.codes.items():
            if info['level'] != 6:
                continue

            desc_lower = info['description'].lower()

            # Score by word overlap
            desc_words = set(desc_lower.split())
            overlap = query_words & desc_words

            if overlap:
                score = len(overlap) / len(query_words)
                # Bonus for exact substring match
                if query_lower in desc_lower:
                    score += 1.0

                results.append({
                    'hscode': hscode,
                    'description': info['description'],
                    'section': info['section'],
                    'score': score,
                })

        results.sort(key=lambda x: -x['score'])
        return results[:max_results]

    def get_chapter_name(self, chapter_code: str) -> str:
        """Get chapter description from 2-digit code."""
        return self.chapters.get(chapter_code.zfill(2), 'Unknown')

    def validate_hs_code(self, hscode: str) -> dict:
        """Validate an HS code and return info about its validity."""
        hscode = hscode.strip().replace('.', '').replace(' ', '')

        result = {
            'valid': False,
            'code': hscode,
            'level': None,
            'description': None,
            'message': '',
        }

        if not re.match(r'^\d{2,6}$', hscode):
            result['message'] = 'HS code must be 2-6 digits'
            return result

        if hscode in self.codes:
            info = self.codes[hscode]
            result['valid'] = True
            result['level'] = info['level']
            result['description'] = info['description']
            result['message'] = f'Valid {info["level"]}-digit HS code'
        else:
            # Check if partial code is valid
            if len(hscode) == 6:
                heading = hscode[:4]
                chapter = hscode[:2]
                if heading in self.codes:
                    result['message'] = f'Heading {heading} exists but subheading {hscode} not found'
                elif chapter in self.codes:
                    result['message'] = f'Chapter {chapter} exists but code {hscode} not found'
                else:
                    result['message'] = f'Code {hscode} not found in HS nomenclature'

        return result

    def get_all_6digit_codes(self) -> list[dict]:
        """Return all 6-digit HS codes with descriptions."""
        return [
            {'hscode': code, 'description': info['description'], 'section': info['section']}
            for code, info in self.codes.items()
            if info['level'] == 6
        ]


# --- HTS Extensions ---
# HTS (Harmonized Tariff Schedule) adds country-specific digits (7-10) after the 6-digit HS code.
# This is a simplified reference for major trading partners.

def _load_us_hts_extensions() -> dict:
    """Load US HTS extensions from the pre-built JSON lookup table."""
    if not US_HTS_LOOKUP_PATH.exists():
        return {}
    with open(US_HTS_LOOKUP_PATH, "r", encoding="utf-8") as f:
        raw = json.load(f)
    # Convert from build_hts_lookup format to API format
    extensions = {}
    for hs6, entries in raw.items():
        extensions[hs6] = [
            {"hts": e["hts_code"], "description": e["description"],
             "general_duty": e.get("general_duty", ""),
             "special_duty": e.get("special_duty", ""),
             "unit": e.get("unit", "")}
            for e in entries
        ]
    return extensions


# Lazy-loaded cache for US HTS data
_us_hts_cache = None


def _get_us_hts_extensions() -> dict:
    global _us_hts_cache
    if _us_hts_cache is None:
        _us_hts_cache = _load_us_hts_extensions()
    return _us_hts_cache


HTS_EXTENSIONS = {
    "US": {
        "name": "United States HTS",
        "digits": 10,
        "format": "XXXX.XX.XXXX",
        # Extensions loaded lazily from us_hts_lookup.json
        "extensions": None,  # Sentinel, resolved in get_hts_extensions()
    },
    "EU": {
        "name": "EU Combined Nomenclature (CN)",
        "digits": 8,
        "format": "XXXX.XX.XX",
        "extensions": {
            "851712": [
                {"hts": "85171200", "description": "Telephones for cellular networks; smartphones"},
            ],
            "847130": [
                {"hts": "84713000", "description": "Portable digital automatic data-processing machines, ≤ 10 kg"},
            ],
            "870380": [
                {"hts": "87038000", "description": "Other vehicles, with electric motor for propulsion"},
            ],
        }
    },
    "CN": {
        "name": "China Customs Tariff",
        "digits": 10,
        "format": "XXXX.XXXX.XX",
        "extensions": {
            "851712": [
                {"hts": "8517120010", "description": "Smartphones, 5G capable"},
                {"hts": "8517120090", "description": "Other mobile phones"},
            ],
            "847130": [
                {"hts": "8471300000", "description": "Portable digital data processing machines"},
            ],
        }
    },
    "JP": {
        "name": "Japan HS Tariff",
        "digits": 9,
        "format": "XXXX.XX.XXX",
        "extensions": {
            "851712": [
                {"hts": "851712000", "description": "Telephones for cellular networks or wireless"},
            ],
            "870380": [
                {"hts": "870380000", "description": "Electric motor vehicles for passenger transport"},
            ],
        }
    },
}


def get_hts_extensions(hs_code: str, country_code: str) -> Optional[dict]:
    """
    Get HTS (country-specific) extensions for a 6-digit HS code.

    Args:
        hs_code: 6-digit HS code
        country_code: 2-letter country code (US, EU, CN, JP, etc.)

    Returns:
        Dict with country HTS info and available extensions, or None.
    """
    hs_code = hs_code.strip().replace('.', '').replace(' ', '')
    country_code = country_code.upper().strip()

    if country_code not in HTS_EXTENSIONS:
        return {
            "available": False,
            "country": country_code,
            "message": f"HTS extensions not available for {country_code}. "
                       f"Available: {', '.join(HTS_EXTENSIONS.keys())}",
            "extensions": [],
        }

    tariff = HTS_EXTENSIONS[country_code]
    # US extensions are lazy-loaded from JSON
    if country_code == "US":
        ext_dict = _get_us_hts_extensions()
    else:
        ext_dict = tariff["extensions"]
    extensions = ext_dict.get(hs_code, [])

    return {
        "available": True,
        "country": country_code,
        "tariff_name": tariff["name"],
        "total_digits": tariff["digits"],
        "format": tariff["format"],
        "extensions": extensions,
        "hs_code": hs_code,
        "message": f"Found {len(extensions)} HTS extension(s)" if extensions else
                   f"No specific extensions found for {hs_code} in {tariff['name']}. "
                   f"The base HS code {hs_code} applies.",
    }


def get_available_hts_countries() -> list[dict]:
    """Return list of countries with HTS extensions available."""
    return [
        {"code": code, "name": info["name"], "digits": info["digits"]}
        for code, info in HTS_EXTENSIONS.items()
    ]


# Singleton instance
_dataset = HSDataset()


def get_dataset() -> HSDataset:
    """Get the singleton HSDataset instance, loading if necessary."""
    if not _dataset._loaded:
        _dataset.load()
    return _dataset
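A usage sketch for the helpers above; the exact output depends on the `harmonized-system.csv` and `us_hts_lookup.json` snapshots shipped under `data/`.

```python
# Illustrative calls into hs_dataset; results depend on the shipped data snapshots.
from hs_dataset import get_dataset, get_hts_extensions

hs = get_dataset()                        # loads the CSV once (singleton)

entry = hs.lookup("851712")               # smartphones: full chapter/heading hierarchy
if entry:
    print(entry["chapter_code"], entry["description"])

print(hs.validate_hs_code("090240"))      # reports whether a code exists in the nomenclature
print(hs.search("frozen shrimp")[:3])     # keyword-overlap search over 6-digit descriptions

us = get_hts_extensions("851712", "US")   # country-specific 10-digit HTS lines, if any
print(us["tariff_name"], len(us["extensions"]))
```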
models/.gitkeep
ADDED
File without changes
requirements-dev.txt
ADDED
@@ -0,0 +1,2 @@
-r requirements.txt
datasets>=3.0
requirements.txt
ADDED
@@ -0,0 +1,15 @@
fastapi==0.129.0
uvicorn[standard]==0.41.0
sentence-transformers==5.2.3
transformers==5.2.0
torch==2.10.0
scikit-learn==1.8.0
pandas==3.0.1
numpy==2.3.5
plotly==6.5.2
umap-learn==0.5.11
jinja2==3.1.6
pytesseract==0.3.13
pdf2image==1.17.0
pillow==12.1.1
python-multipart==0.0.22
static/.gitkeep
ADDED
File without changes
templates/index.html
ADDED
The diff for this file is too large to render.