Kuangdai committed
Commit 6fb6c07 · 0 parent(s)
Initial release of SoilFormer
Browse files
- .gitattributes +9 -0
- .gitignore +181 -0
- LICENSE +21 -0
- README.md +168 -0
- config/column_rules_numeric.json +30 -0
- config/config_data.json +15 -0
- config/config_model.json +25 -0
- config/config_train.json +51 -0
- data/cat_vocab.json +3 -0
- data/numeric_vocab.json +3 -0
- data/photo_map.json +3 -0
- data/tabular_data.csv +3 -0
- data/tabular_meta.json +3 -0
- data/tabular_meta_numeric_stats.csv +3 -0
- example/input_card.json +3 -0
- example/input_card__masked.json +3 -0
- example/input_card__unmasked.json +3 -0
- example/output_card.json +3 -0
- example/output_card__acc.json +3 -0
- inference_create_input_card.py +318 -0
- inference_predict_output_card.py +545 -0
- model_weights/gemma3n_E2B_vision_only/config.json +3 -0
- model_weights/gemma3n_E2B_vision_only/model.safetensors +3 -0
- model_weights/gemma3n_E2B_vision_only/modeling_gemma3n.py +3 -0
- model_weights/gemma3n_E2B_vision_only/processor_config.json +3 -0
- model_weights/gemma3n_E2B_vision_only/tokenizer.json +3 -0
- model_weights/gemma3n_E2B_vision_only/tokenizer_config.json +3 -0
- model_weights/gemma3n_E2B_vision_only/vision_extractor_config.json +3 -0
- model_weights/soilformer_pretrain/hetero_epoch_200.pt +3 -0
- modelling/__init__.py +0 -0
- modelling/decode_categorical.py +423 -0
- modelling/decode_numeric.py +238 -0
- modelling/embed_categorical.py +322 -0
- modelling/embed_numeric.py +547 -0
- modelling/embed_vision_gemma3n.py +552 -0
- modelling/layer.py +353 -0
- modelling/loader.py +1025 -0
- modelling/soilformer.py +696 -0
- modelling/train.py +552 -0
- modelling/utils.py +132 -0
- requirements.txt +10 -0
- resources/arch.png +3 -0
.gitattributes
ADDED
@@ -0,0 +1,9 @@
+# Auto detect text files and perform LF normalization
+* text=auto
+model_weights/** filter=lfs diff=lfs merge=lfs -text
+data/*.csv filter=lfs diff=lfs merge=lfs -text
+data/*.json filter=lfs diff=lfs merge=lfs -text
+example/*.json filter=lfs diff=lfs merge=lfs -text
+resources/*.png filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
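Because of the filters above, the heavyweight assets in this commit (model weights, data tables, example cards) are checked in as small Git LFS pointer stubs rather than real payloads; the `data/` and `example/` entries later in this diff show the stub format. A minimal sketch for spotting such stubs in a local checkout (the helper name is ours, not the repository's):

```python
from pathlib import Path


def is_lfs_pointer(path: str) -> bool:
    """Return True if `path` holds a Git LFS pointer stub, not real data."""
    # Pointer stubs are tiny text files whose first line names the LFS spec,
    # e.g. "version https://git-lfs.github.com/spec/v1" (see data/*.json below).
    try:
        first_line = Path(path).read_text(errors="ignore").splitlines()[:1]
    except OSError:
        return False
    return bool(first_line) and first_line[0].startswith(
        "version https://git-lfs.github.com/spec/"
    )

# In a fresh clone without `git lfs pull`, is_lfs_pointer("data/cat_vocab.json")
# would return True.
```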
.gitignore
ADDED
@@ -0,0 +1,181 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# UV
+# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+#uv.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# Cursor
+# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+# refer to https://docs.cursor.com/context/ignore-files
+.cursorignore
+.cursorindexingignore
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 Kuangdai
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
ADDED
@@ -0,0 +1,168 @@
+---
+license: mit
+library_name: pytorch
+language:
+- en
+tags:
+- soil
+- soil-science
+- earth-science
+- environmental-science
+- multimodal
+- tabular
+- transformer
+- representation-learning
+- masked-feature-modeling
+- remote-sensing
+- europe
+datasets:
+- earthroverprogram/lucas-mega
+---
+
+# SoilFormer
+
+A multimodal tabular transformer trained on [LUCAS-MEGA](https://huggingface.co/datasets/earthroverprogram/lucas-mega).
+
+[Manuscript](https://huggingface.co/datasets/earthroverprogram/lucas-mega/manuscript.pdf)
+
+## Introduction
+
+SoilFormer is a multimodal transformer for representation learning in soil–environment systems. It is trained on
+LUCAS-MEGA, a large-scale dataset built from European soil and environmental observations, with the LUCAS soil survey as
+its backbone. LUCAS-MEGA integrates heterogeneous sources into a machine-learning-ready sample–feature table, covering
+numerical, categorical, textual, and visual modalities across soil physical, chemical, hydrological, environmental, and
+site-related properties.
+
+SoilFormer learns from partially observed multimodal samples using masked feature modeling. During training, a subset of
+observed categorical and numerical features is masked, and the model reconstructs them from the remaining tabular and
+visual context. The architecture combines grouped categorical embedding, grouped numerical encoding/decoding, vision
+feature extraction and compression, transformer layers, and heteroscedastic prediction heads for uncertainty-aware
+reconstruction.
+
+<img src="resources/arch.png" alt="SoilFormer architecture" width="70%">
+
+## Training
+
+Train SoilFormer with:
+
+```bash
+python modelling/train.py
+```
+
+Main configuration files:
+
+* `config/config_model.json`: model architecture parameters, including embedding sizes, transformer layer settings,
+  decoder settings, dtype, and vision model configuration.
+* `config/config_data.json`: data parameters, including CSV path, vocab paths, numeric statistics, photo mapping, image
+  root, train/eval split, batch size, and masking ratios.
+* `config/config_train.json`: training hyperparameters, including runtime device, seed, optimizer settings, scheduler
+  settings, checkpoint behavior, loss options, logging, and output paths.
+
+## Inference
+
+Inference uses readable JSON input cards. The workflow is:
+
+1. Create input cards from one dataset row.
+2. Edit the masked card manually if desired.
+3. Run model prediction from the edited card.
+4. Optionally compare predictions against the unmasked answer card.
+
+### 1. Create input cards
+
+```bash
+python inference_create_input_card.py \
+    --row_index 10 \
+    --output example/input_card.json
+```
+
+This writes two files:
+
+```text
+example/input_card__unmasked.json
+example/input_card__masked.json
+```
+
+The unmasked card contains the raw readable values from the CSV row. The masked card randomly replaces a fraction of
+categorical and numeric values with `null`. Natural missing values remain as empty strings `""`, while active masks are
+represented as `null`.
+
+Default masking ratios are 0.15 for both categorical and numeric features:
+
+```bash
+python inference_create_input_card.py \
+    --row_index 10 \
+    --output example/input_card.json \
+    --cat_mask_ratio 0.15 \
+    --num_mask_ratio 0.15 \
+    --seed 42
+```
+
+The card format is intentionally simple and user-editable. Users can copy this card as a template, replace the values
+with their own soil sample information, and set fields to `null` to indicate which ones should be predicted during
+inference:
+
+```json
+{
+  "categorical": {
+    "land_site:land_cover_primary": "B16: Cropland => Cereals => Maize",
+    "land_site:land_use_primary": null,
+    "soil_type:WRB_soil_group": "Cambisol",
+    "texture:ISSS_class": "silty clay",
+    "...": "..."
+  },
+  "numeric": {
+    "carbon:CaCO3_content (g/kg)": 7.0,
+    "carbon:SOC_saturation_ratio": 0.3647958934307098,
+    "geographic:latitude (deg)": 38.8513900000485,
+    "geographic:longitude (deg)": -9.29050000007487,
+    "mass_density:bulk_density (g/cm³)": null,
+    "...": "..."
+  },
+  "vision": {
+    "image_path_suffix": "relative/path/to/photo.jpg"
+  }
+}
+```
+
+### 2. Run prediction
+
+```bash
+python inference_predict_output_card.py \
+    --checkpoint model_weights/soilformer_pretrain/hetero_epoch_200.pt \
+    --input_card example/input_card__masked.json \
+    --output example/output_card.json
+```
+
+This writes:
+
+```text
+example/output_card.json
+```
+
+`output_card.json` contains readable predictions:
+
+* categorical outputs are decoded back to raw category labels;
+* numeric outputs are converted from z-score space back to the original physical units;
+* vision input is read from `vision.image_path_suffix` together with `photo_root` in `config/config_data.json`.
+
+### 3. Evaluation with an answer card
+
+```bash
+python inference_predict_output_card.py \
+    --checkpoint model_weights/soilformer_pretrain/hetero_epoch_200.pt \
+    --input_card example/input_card__masked.json \
+    --answer_card example/input_card__unmasked.json \
+    --output example/output_card.json
+```
+
+This additionally writes:
+
+```text
+example/output_card__acc.json
+```
+
+When `--answer_card` is provided, `output_card__acc.json` reports reconstruction metrics over fields that are `null` in
+the masked input card:
+
+* categorical accuracy for masked categorical fields;
+* numeric MAE for masked numeric fields, measured in the original feature units.
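The README's mention of heteroscedastic prediction heads pairs naturally with the `cat_s_bound`/`num_s_bound` options in `config/config_train.json` below. Here is a minimal sketch of an uncertainty-aware Gaussian-NLL regression loss of that flavor, assuming `s` is a predicted log-variance clamped to a bound; this is an illustration, not the code in `modelling/train.py`:

```python
import torch


def hetero_gaussian_nll(mean: torch.Tensor, s: torch.Tensor,
                        target: torch.Tensor, s_bound: float = 4.0) -> torch.Tensor:
    """Heteroscedastic regression loss: the head predicts a mean plus a
    log-variance term s; clamping s stabilizes early training (assumed to
    mirror the role of `num_s_bound`)."""
    s = s.clamp(-s_bound, s_bound)
    return (0.5 * torch.exp(-s) * (target - mean) ** 2 + 0.5 * s).mean()


# Toy usage on a batch of 8 samples with 5 masked numeric targets each:
mean, s, target = torch.randn(8, 5), torch.randn(8, 5), torch.randn(8, 5)
loss = hetero_gaussian_nll(mean, s, target)
```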
config/column_rules_numeric.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "texture:silt_percentage (%)": ">=0",
+  "chemical:pH_in_H2O": ">0",
+  "chemical:pH_in_CaCl2": ">0",
+  "carbon:organic_carbon_content (g/kg)": ">0",
+  "carbon:CaCO3_content (g/kg)": ">0",
+  "carbon:observed_vs_typical_soc_index_confidence_zone": "exclude",
+  "carbon:observed_vs_typical_soc_index": "exclude",
+  "fertility:N_extractable (g/kg)": ">0",
+  "fertility:K_extractable (mg/kg)": ">0",
+  "fertility:P_extractable (mg/kg)": ">0",
+  "fertility:P_available_stock (kg ha⁻¹)": ">0",
+  "land_degradation:soil_erosion_exceeding_10Mg_ha_yr (t ha⁻¹ yr⁻¹)": "exclude",
+  "crop_plant:cover_crop_fraction_5th_percentile (‱)": "exclude",
+  "crop_plant:cover_crop_fraction_95th_percentile (‱)": "exclude",
+  "mass_density:bulk_density_0_10cm (g/cm³)": ">0",
+  "mass_density:bulk_density_10_20cm (g/cm³)": ">0",
+  "mass_density:bulk_density (g/cm³)": ">0",
+  "biodiversity:land_use_change_pressure_index": "exclude",
+  "biodiversity:genetically_modified_organism_use_pressure_index": "exclude",
+  "trace_elements:Zn_concentration_5th_percentile (mg/kg)": "exclude",
+  "trace_elements:Zn_concentration_95th_percentile (mg/kg)": "exclude",
+  "trace_elements:As_concentration_std (log10 mg/kg)": "exclude",
+  "trace_elements:As_concentration_skewness": "exclude",
+  "trace_elements:As_concentration_kurtosis": "exclude",
+  "trace_elements:Hg_residual (µg/kg)": "exclude",
+  "climate:monthly_temperature_JAN_to_DEC (°C)": ">-100",
+  "climate:monthly_precipitation_JAN_to_DEC (mm)": ">-100",
+  "topography_geology:elevation (m)": "<4000"
+}
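These rule strings constrain or drop numeric columns. The repository's loader is the actual consumer, but a plausible reading is a comparison operator plus a threshold, with `"exclude"` removing the column entirely. A hedged sketch of that interpretation:

```python
import operator
import re

_OPS = {">=": operator.ge, "<=": operator.le, ">": operator.gt, "<": operator.lt}


def rule_accepts(rule: str, value: float) -> bool:
    """One possible semantics for column_rules_numeric.json entries."""
    if rule == "exclude":
        return False  # column is dropped regardless of value
    m = re.fullmatch(r"(>=|<=|>|<)\s*(-?\d+(?:\.\d+)?)", rule)
    if m is None:
        raise ValueError(f"Unrecognized rule: {rule!r}")
    op, threshold = _OPS[m.group(1)], float(m.group(2))
    return op(value, threshold)


assert rule_accepts(">0", 7.0)          # e.g. CaCO3 content must be positive
assert not rule_accepts("<4000", 5123)  # elevation above the cap is rejected
```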
config/config_data.json
ADDED
@@ -0,0 +1,15 @@
+{
+  "data_csv_path": "data/tabular_data.csv",
+  "photo_map_path": "data/photo_map.json",
+  "cat_vocab_path": "data/cat_vocab.json",
+  "numeric_vocab_path": "data/numeric_vocab.json",
+  "numeric_stats_path": "data/tabular_meta_numeric_stats.csv",
+  "photo_root": "",
+  "image_size": 512,
+  "train_ratio": 0.8,
+  "train_eval_split_seed": 42,
+  "batch_size": 64,
+  "cat_mask_ratio": 0.15,
+  "num_mask_ratio": 0.15,
+  "active_mask_seed": 42
+}
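For orientation, here is one way the `train_ratio`/`train_eval_split_seed` fields could drive a deterministic train/eval split; the authoritative logic lives in `modelling/loader.py`, so treat this as a sketch:

```python
import json
import random

with open("config/config_data.json", "r", encoding="utf-8") as f:
    cfg = json.load(f)

num_rows = 1000  # stand-in for the row count of data/tabular_data.csv
indices = list(range(num_rows))
random.Random(cfg["train_eval_split_seed"]).shuffle(indices)  # seed 42 above
cut = int(num_rows * cfg["train_ratio"])                      # ratio 0.8 above
train_idx, eval_idx = indices[:cut], indices[cut:]
```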
config/config_model.json
ADDED
@@ -0,0 +1,25 @@
+{
+  "dtype": "float32",
+  "tabular_meta": "data/tabular_meta.json",
+  "vision_model_dir": "./model_weights/gemma3n_E2B_vision_only",
+  "vision_num_output_tokens_reduced": 32,
+  "vision_num_heads_for_token_reduction": 4,
+  "vision_reducer_bottleneck_dim": 768,
+  "vision_reducer_project_back": false,
+  "cat_vocab_json": "data/cat_vocab.json",
+  "cat_hidden_size": 768,
+  "cat_decode_middle_size": null,
+  "numeric_vocab_json": "data/numeric_vocab.json",
+  "numeric_hidden_size": 768,
+  "numeric_encode_middle_size": null,
+  "numeric_decode_middle_size": null,
+  "layer_num_query_heads": 8,
+  "layer_num_kv_heads": 2,
+  "layer_head_dim": 128,
+  "layer_mlp_ratio": 1.5,
+  "layer_dropout": 0.1,
+  "layer_num_layers": 4,
+  "disable_tabular_attention_mask": true,
+  "cat_homoscedastic": false,
+  "num_homoscedastic": false
+}
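The `layer_*` keys imply 8 query heads of dimension 128 attending over only 2 key/value heads, i.e. grouped-query attention with a 4:1 sharing ratio. A shape-level sketch (illustrative only; the real layer is in `modelling/layer.py`):

```python
import torch
import torch.nn.functional as F

B, T = 2, 16                                     # batch size, token count
num_q_heads, num_kv_heads, head_dim = 8, 2, 128  # from config_model.json

q = torch.randn(B, num_q_heads, T, head_dim)
k = torch.randn(B, num_kv_heads, T, head_dim)
v = torch.randn(B, num_kv_heads, T, head_dim)

# Each key/value head serves num_q_heads // num_kv_heads = 4 query heads.
k = k.repeat_interleave(num_q_heads // num_kv_heads, dim=1)
v = v.repeat_interleave(num_q_heads // num_kv_heads, dim=1)

out = F.scaled_dot_product_attention(q, k, v)  # -> [2, 8, 16, 128]
```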
config/config_train.json
ADDED
@@ -0,0 +1,51 @@
+{
+  "paths": {
+    "config_data_path": "config/config_data.json",
+    "config_model_path": "config/config_model.json",
+    "output_dir": "runs/soilformer_hetero"
+  },
+  "seed": {
+    "seed": 42,
+    "deterministic": true
+  },
+  "runtime": {
+    "device": "cuda",
+    "num_epochs": 500,
+    "init_weight_std": 0.02
+  },
+  "optimization": {
+    "lr": 1e-4,
+    "beta1": 0.9,
+    "beta2": 0.999,
+    "eps": 1e-8,
+    "weight_decay": 0.02,
+    "max_grad_norm": 1.0,
+    "scheduler": {
+      "type": "cosine",
+      "total_epochs": 500,
+      "eta_min": 2e-5,
+      "warmup_epochs": 5,
+      "warmup_start_factor": 0.1
+    }
+  },
+  "loss": {
+    "cat_s_bound": 2,
+    "num_s_bound": 4
+  },
+  "checkpoint": {
+    "resume_checkpoint_path": null,
+    "epochs_per_save": 100,
+    "max_saved_checkpoints": 5
+  },
+  "logging": {
+    "tqdm": true,
+    "wandb": {
+      "enabled": true,
+      "project": "soilformer",
+      "entity": "kuangdai-leng",
+      "run_name": "train-hetero",
+      "mode": "online",
+      "dir": null
+    }
+  }
+}
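The scheduler block reads as 5 warmup epochs from a 0.1 start factor into a cosine decay toward `eta_min`. One way to realize that with stock PyTorch schedulers (a sketch; `modelling/train.py` may wire it differently):

```python
import torch
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR

model = torch.nn.Linear(4, 4)  # placeholder module
optimizer = torch.optim.AdamW(
    model.parameters(), lr=1e-4, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.02
)
warmup = LinearLR(optimizer, start_factor=0.1, total_iters=5)   # warmup_epochs
cosine = CosineAnnealingLR(optimizer, T_max=495, eta_min=2e-5)  # 500 - 5 epochs
scheduler = SequentialLR(optimizer, [warmup, cosine], milestones=[5])

for epoch in range(500):
    # ... one training epoch ...
    scheduler.step()
```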
data/cat_vocab.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da160e500f0bf01207642f39b666d84d2787fae0f8ec21bb630e10e079780843
+size 14934
data/numeric_vocab.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ddcfc729da9e6f5830d58f6b53928a6fa6dcd108a0ddac3eb7fe67abed3dcadc
+size 17492
data/photo_map.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:018b7f5baaa58e8e3e5e2c6cf98d02aa547a13c6de55f1628984010fc331235c
+size 4651435
data/tabular_data.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e87d387791bb95e3accab9afee8bda1e7e8722bad6e75d04c47a56787b24608
+size 103677102
data/tabular_meta.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de393215e2bbe46b111cc5b604b7fd04c14d28a634a52b06dcb94fd7073200eb
+size 84654
data/tabular_meta_numeric_stats.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:73ef56458b2d7d6cbb730b153a7fd9f445dba4a96d6f29483364e38a9102c150
+size 7714
example/input_card.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82977f50ba8a6a7d7542a4098434d232ad0feff5b1797b088a99b78504604420
+size 6114
example/input_card__masked.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fadbf9afee9bd61fb87c4fd174306bcf6cae441e973b50afdbebd4bd433cb0be
+size 5902
example/input_card__unmasked.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82977f50ba8a6a7d7542a4098434d232ad0feff5b1797b088a99b78504604420
+size 6114
example/output_card.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c11d1ae2c9ada543038ca38dcb5a2a496a8392ca52b07cea215cfd46f0172af0
+size 7261
example/output_card__acc.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d76e4284a3c9e5be054cceb8f96bb5c20434dc1ea11f1904a2d1663d910efd4e
+size 3388
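This accuracy card summarizes reconstruction quality over actively masked fields. A simplified sketch of the comparison the README describes (scalars only; the real `inference_predict_output_card.py` also handles vector-valued numeric features):

```python
from typing import Any, Dict


def masked_metrics(masked: Dict[str, Any], answer: Dict[str, Any],
                   output: Dict[str, Any]) -> Dict[str, float]:
    # Fields set to null (None) in the masked card were actively masked;
    # compare the model's predictions against the unmasked answer card.
    cat_keys = [k for k, v in masked["categorical"].items() if v is None]
    num_keys = [k for k, v in masked["numeric"].items() if v is None]
    acc = sum(output["categorical"][k] == answer["categorical"][k]
              for k in cat_keys) / max(len(cat_keys), 1)
    mae = sum(abs(float(output["numeric"][k]) - float(answer["numeric"][k]))
              for k in num_keys) / max(len(num_keys), 1)
    return {"categorical_accuracy": acc, "numeric_mae": mae}
```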
inference_create_input_card.py
ADDED
@@ -0,0 +1,318 @@
+import argparse
+import ast
+import json
+import random
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import numpy as np
+import pandas as pd
+
+from modelling.utils import load_json
+
+
+def to_jsonable(value: Any) -> Any:
+    if value is None:
+        return None
+    if isinstance(value, float) and pd.isna(value):
+        return None
+    if isinstance(value, np.generic):
+        return value.item()
+    return value
+
+
+def parse_optional_int(value: Optional[str]) -> Optional[int]:
+    if value is None:
+        return None
+    value = str(value).strip().lower()
+    if value in {"", "none", "null", "random"}:
+        return None
+    return int(value)
+
+
+def choose_row_index(num_rows: int, row_index: Optional[int], seed: int) -> int:
+    if num_rows <= 0:
+        raise RuntimeError("CSV has no rows")
+    if row_index is None:
+        return random.Random(seed).randrange(num_rows)
+    if row_index < 0 or row_index >= num_rows:
+        raise IndexError(f"row_index out of range: {row_index}; num_rows={num_rows}")
+    return row_index
+
+
+def validate_ratio(name: str, value: float) -> float:
+    value = float(value)
+    if not 0.0 <= value <= 1.0:
+        raise ValueError(f"{name} must be in [0, 1], got {value}")
+    return value
+
+
+def load_json_if_exists(path: Optional[str]) -> Optional[Dict[str, Any]]:
+    if not path:
+        return None
+    p = Path(path)
+    if not p.exists() or not p.is_file():
+        return None
+    return load_json(str(p))
+
+
+def get_categorical_columns(config_data: Dict[str, Any]) -> list[str]:
+    cat_vocab = load_json_if_exists(config_data.get("cat_vocab_path"))
+    if not isinstance(cat_vocab, dict):
+        return []
+    return list(cat_vocab.keys())
+
+
+def get_numeric_columns(config_data: Dict[str, Any]) -> list[str]:
+    numeric_vocab = load_json_if_exists(config_data.get("numeric_vocab_path"))
+    if not isinstance(numeric_vocab, dict):
+        return []
+
+    columns: list[str] = []
+    for group in numeric_vocab.get("groups", []):
+        for name in group.get("feature_names", []):
+            columns.append(str(name))
+    return columns
+
+
+def get_vision_input(config_data: Dict[str, Any], row: Dict[str, Any]) -> Dict[str, Any]:
+    photo_map = load_json_if_exists(config_data.get("photo_map_path"))
+    id_column = str(config_data.get("id_column", "id"))
+    sample_id = row.get(id_column)
+
+    if not isinstance(photo_map, dict) or sample_id is None:
+        return {"image_path_suffix": ""}
+
+    relative_path = photo_map.get(sample_id)
+    if relative_path is None:
+        relative_path = photo_map.get(str(sample_id))
+
+    if relative_path is None or relative_path == "":
+        return {"image_path_suffix": ""}
+
+    return {"image_path_suffix": str(relative_path)}
+
+
+def parse_numeric_value(value: Any) -> Any:
+    """
+    Convert known numeric CSV cells into readable JSON numbers.
+
+    Loader convention:
+    - missing numeric cell is ""
+    - scalar numeric cell is something like "12.3"
+    - vector numeric cell is something like "[1.2, 3.4]"
+    """
+    value = to_jsonable(value)
+
+    if value == "" or value is None:
+        return ""
+
+    if isinstance(value, (int, float)) and not isinstance(value, bool):
+        return value
+
+    if isinstance(value, str):
+        s = value.strip()
+        if s == "":
+            return ""
+
+        if s.startswith("[") and s.endswith("]"):
+            parsed = ast.literal_eval(s)
+            if not isinstance(parsed, (list, tuple)):
+                raise ValueError(f"Expected numeric vector list, got: {value!r}")
+            return [float(x) for x in parsed]
+
+        return float(s)
+
+    return value
+
+
+def create_unmasked_card(
+    row: Dict[str, Any],
+    cat_columns: list[str],
+    numeric_columns: list[str],
+    vision: Dict[str, Any],
+) -> Dict[str, Any]:
+    categorical = {col: row.get(col, "") for col in cat_columns if col in row}
+    numeric = {
+        col: parse_numeric_value(row.get(col, ""))
+        for col in numeric_columns
+        if col in row
+    }
+
+    return {
+        "categorical": categorical,
+        "numeric": numeric,
+        "vision": vision,
+    }
+
+
+def choose_mask_keys(values: Dict[str, Any], ratio: float, rng: random.Random) -> list[str]:
+    valid_keys = [k for k, v in values.items() if v not in ("", None)]
+    if ratio <= 0.0 or not valid_keys:
+        return []
+
+    k = int(round(len(valid_keys) * ratio))
+    k = max(0, min(k, len(valid_keys)))
+    if k == 0:
+        return []
+
+    return rng.sample(valid_keys, k)
+
+
+def create_masked_card(
+    unmasked_card: Dict[str, Any],
+    cat_mask_ratio: float,
+    num_mask_ratio: float,
+    seed: int,
+) -> Dict[str, Any]:
+    rng = random.Random(seed)
+    masked = json.loads(json.dumps(unmasked_card, ensure_ascii=False))
+
+    cat_keys = choose_mask_keys(masked["categorical"], cat_mask_ratio, rng)
+    num_keys = choose_mask_keys(masked["numeric"], num_mask_ratio, rng)
+
+    for key in cat_keys:
+        masked["categorical"][key] = None
+
+    for key in num_keys:
+        masked["numeric"][key] = None
+
+    return masked
+
+
+def output_paths_from_given_name(given_name: str) -> tuple[Path, Path]:
+    path = Path(given_name)
+    base = path.with_suffix("") if path.suffix == ".json" else path
+
+    unmasked_path = base.with_name(base.name + "__unmasked.json")
+    masked_path = base.with_name(base.name + "__masked.json")
+    return unmasked_path, masked_path
+
+
+def create_cards(
+    config_data_path: str,
+    row_index: Optional[int],
+    seed: int,
+    cat_mask_ratio: float,
+    num_mask_ratio: float,
+) -> tuple[Dict[str, Any], Dict[str, Any]]:
+    config_data = load_json(config_data_path)
+    csv_path = config_data["data_csv_path"]
+
+    # Match loader.py: empty cells remain "" instead of becoming NaN.
+    df = pd.read_csv(
+        csv_path,
+        keep_default_na=False,
+        na_filter=False,
+        low_memory=False,
+    )
+
+    chosen_row_index = choose_row_index(
+        num_rows=len(df),
+        row_index=row_index,
+        seed=seed,
+    )
+
+    row = {
+        str(k): to_jsonable(v)
+        for k, v in df.iloc[chosen_row_index].to_dict().items()
+    }
+
+    cat_columns = get_categorical_columns(config_data)
+    numeric_columns = get_numeric_columns(config_data)
+    vision = get_vision_input(config_data, row)
+
+    unmasked_card = create_unmasked_card(
+        row=row,
+        cat_columns=cat_columns,
+        numeric_columns=numeric_columns,
+        vision=vision,
+    )
+    masked_card = create_masked_card(
+        unmasked_card=unmasked_card,
+        cat_mask_ratio=cat_mask_ratio,
+        num_mask_ratio=num_mask_ratio,
+        seed=seed,
+    )
+
+    return unmasked_card, masked_card
+
+
+def save_json_pretty(obj: Dict[str, Any], path: Path) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as f:
+        json.dump(obj, f, ensure_ascii=False, indent=2)
+        f.write("\n")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Create readable/editable SoilFormer input cards from one CSV row."
+    )
+    parser.add_argument(
+        "--config_data",
+        type=str,
+        default="config/config_data.json",
+        help="Path to config_data.json. Default: config/config_data.json",
+    )
+    parser.add_argument(
+        "--row_index",
+        type=str,
+        default=None,
+        help="CSV row index. Use None/null/random or omit for a random row.",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        required=True,
+        help="Given output name. Writes given_name__unmasked.json and given_name__masked.json.",
+    )
+    parser.add_argument(
+        "--cat_mask_ratio",
+        type=float,
+        default=0.15,
+        help="Ratio of non-missing categorical features to mask. Default: 0.15",
+    )
+    parser.add_argument(
+        "--num_mask_ratio",
+        type=float,
+        default=0.15,
+        help="Ratio of non-missing numeric features to mask. Default: 0.15",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=0,
+        help="Seed for random row selection and feature masking. Default: 0",
+    )
+    args = parser.parse_args()
+
+    cat_mask_ratio = validate_ratio("cat_mask_ratio", args.cat_mask_ratio)
+    num_mask_ratio = validate_ratio("num_mask_ratio", args.num_mask_ratio)
+
+    unmasked_card, masked_card = create_cards(
+        config_data_path=args.config_data,
+        row_index=parse_optional_int(args.row_index),
+        seed=args.seed,
+        cat_mask_ratio=cat_mask_ratio,
+        num_mask_ratio=num_mask_ratio,
+    )
+
+    unmasked_path, masked_path = output_paths_from_given_name(args.output)
+    save_json_pretty(unmasked_card, unmasked_path)
+    save_json_pretty(masked_card, masked_path)
+
+    print(
+        json.dumps(
+            {
+                "status": "ok",
+                "unmasked_output": str(unmasked_path),
+                "masked_output": str(masked_path),
+            },
+            ensure_ascii=False,
+        )
+    )
+
+
+if __name__ == "__main__":
+    main()
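A quick interactive check of the naming convention `output_paths_from_given_name` implements above (run from the repository root so the import resolves):

```python
from inference_create_input_card import output_paths_from_given_name

# The given name "example/input_card.json" expands to the two sibling files
# the README documents:
unmasked, masked = output_paths_from_given_name("example/input_card.json")
print(unmasked)  # example/input_card__unmasked.json
print(masked)    # example/input_card__masked.json
```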
inference_predict_output_card.py
ADDED
|
@@ -0,0 +1,545 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import ast
|
| 3 |
+
import json
|
| 4 |
+
import sys
|
| 5 |
+
from io import BytesIO
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any, Dict, Optional, Tuple
|
| 8 |
+
from urllib.parse import urljoin
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
import pandas as pd
|
| 12 |
+
import requests
|
| 13 |
+
import torch
|
| 14 |
+
from PIL import Image
|
| 15 |
+
from torchvision import transforms
|
| 16 |
+
|
| 17 |
+
# The script is intended to live one level above ./modelling.
|
| 18 |
+
# modelling/ modules still contain some legacy absolute imports, so expose the
|
| 19 |
+
# modelling directory on sys.path as well.
|
| 20 |
+
PROJECT_ROOT = Path(__file__).resolve().parent
|
| 21 |
+
MODELLING_DIR = PROJECT_ROOT / "modelling"
|
| 22 |
+
if str(MODELLING_DIR) not in sys.path:
|
| 23 |
+
sys.path.insert(0, str(MODELLING_DIR))
|
| 24 |
+
|
| 25 |
+
from modelling.soilformer import SoilFormer # noqa: E402
|
| 26 |
+
from modelling.utils import get_dtype, load_json # noqa: E402
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# -----------------------------------------------------------------------------
|
| 30 |
+
# JSON helpers
|
| 31 |
+
# -----------------------------------------------------------------------------
|
| 32 |
+
|
| 33 |
+
def load_card(path: str) -> Dict[str, Any]:
|
| 34 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 35 |
+
obj = json.load(f)
|
| 36 |
+
if not isinstance(obj, dict):
|
| 37 |
+
raise ValueError(f"Card must be a JSON object: {path}")
|
| 38 |
+
return obj
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def save_json_pretty(obj: Dict[str, Any], path: Path) -> None:
|
| 42 |
+
path.parent.mkdir(parents=True, exist_ok=True)
|
| 43 |
+
with path.open("w", encoding="utf-8") as f:
|
| 44 |
+
json.dump(obj, f, ensure_ascii=False, indent=2)
|
| 45 |
+
f.write("\n")
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def to_jsonable(x: Any) -> Any:
|
| 49 |
+
if isinstance(x, np.generic):
|
| 50 |
+
return x.item()
|
| 51 |
+
if isinstance(x, np.ndarray):
|
| 52 |
+
return x.tolist()
|
| 53 |
+
if isinstance(x, torch.Tensor):
|
| 54 |
+
x = x.detach().cpu()
|
| 55 |
+
if x.ndim == 0:
|
| 56 |
+
return x.item()
|
| 57 |
+
return x.tolist()
|
| 58 |
+
if isinstance(x, dict):
|
| 59 |
+
return {str(k): to_jsonable(v) for k, v in x.items()}
|
| 60 |
+
if isinstance(x, (list, tuple)):
|
| 61 |
+
return [to_jsonable(v) for v in x]
|
| 62 |
+
return x
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# -----------------------------------------------------------------------------
|
| 66 |
+
# Runtime / model loading
|
| 67 |
+
# -----------------------------------------------------------------------------
|
| 68 |
+
|
| 69 |
+
def resolve_device(device_str: str) -> torch.device:
|
| 70 |
+
device_str = str(device_str).lower()
|
| 71 |
+
if device_str == "auto":
|
| 72 |
+
if torch.cuda.is_available():
|
| 73 |
+
return torch.device("cuda")
|
| 74 |
+
if torch.backends.mps.is_available():
|
| 75 |
+
return torch.device("mps")
|
| 76 |
+
return torch.device("cpu")
|
| 77 |
+
if device_str == "cuda":
|
| 78 |
+
if not torch.cuda.is_available():
|
| 79 |
+
raise RuntimeError("--device cuda requested, but CUDA is not available")
|
| 80 |
+
return torch.device("cuda")
|
| 81 |
+
if device_str == "mps":
|
| 82 |
+
if not torch.backends.mps.is_available():
|
| 83 |
+
raise RuntimeError("--device mps requested, but MPS is not available")
|
| 84 |
+
return torch.device("mps")
|
| 85 |
+
if device_str == "cpu":
|
| 86 |
+
return torch.device("cpu")
|
| 87 |
+
raise ValueError(f"Unsupported device: {device_str}")
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def load_model(args: argparse.Namespace, config_model: Dict[str, Any], device: torch.device, dtype: torch.dtype) -> SoilFormer:
|
| 91 |
+
print("[INFO] Initializing model...")
|
| 92 |
+
model = SoilFormer(config=config_model, device=str(device))
|
| 93 |
+
|
| 94 |
+
print("[INFO] Loading checkpoint...")
|
| 95 |
+
checkpoint = torch.load(args.checkpoint, map_location="cpu")
|
| 96 |
+
missing, unexpected = model.load_state_dict(
|
| 97 |
+
checkpoint["model_state_dict"], strict=False
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
+
non_vision_missing = [k for k in missing if not k.startswith("vision_extractor.")]
|
| 101 |
+
if len(non_vision_missing) > 0:
|
| 102 |
+
raise RuntimeError(
|
| 103 |
+
f"[ERROR] Missing non-vision keys detected: {non_vision_missing[:10]}"
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
print(f"[INFO] Missing keys (vision only): {len(missing)}")
|
| 107 |
+
print(f"[INFO] Unexpected keys: {len(unexpected)}")
|
| 108 |
+
|
| 109 |
+
model.to(device=device, dtype=dtype)
|
| 110 |
+
model.eval()
|
| 111 |
+
return model
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
# -----------------------------------------------------------------------------
|
| 115 |
+
# Metadata loading
|
| 116 |
+
# -----------------------------------------------------------------------------
|
| 117 |
+
|
| 118 |
+
def load_metadata(config_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 119 |
+
cat_vocab = load_json(config_data["cat_vocab_path"])
|
| 120 |
+
numeric_vocab = load_json(config_data["numeric_vocab_path"])
|
| 121 |
+
|
| 122 |
+
stats_df = pd.read_csv(config_data["numeric_stats_path"])
|
| 123 |
+
numeric_stats = {}
|
| 124 |
+
for _, row in stats_df.iterrows():
|
| 125 |
+
col = row["column"]
|
| 126 |
+
mean = float(row["mean"])
|
| 127 |
+
std = float(row["std"])
|
| 128 |
+
if std == 0.0:
|
| 129 |
+
std = 1.0
|
| 130 |
+
numeric_stats[str(col)] = (mean, std)
|
| 131 |
+
|
| 132 |
+
cat_columns = list(cat_vocab.keys())
|
| 133 |
+
cat_mask_local_ids = [int(cat_vocab[col]["mask_local_id"]) for col in cat_columns]
|
| 134 |
+
|
| 135 |
+
id_to_label_by_col = {}
|
| 136 |
+
for col in cat_columns:
|
| 137 |
+
label2id = cat_vocab[col]["label2id"]
|
| 138 |
+
id_to_label_by_col[col] = {int(v): str(k) for k, v in label2id.items()}
|
| 139 |
+
|
| 140 |
+
return {
|
| 141 |
+
"cat_vocab": cat_vocab,
|
| 142 |
+
"numeric_vocab": numeric_vocab,
|
| 143 |
+
"numeric_stats": numeric_stats,
|
| 144 |
+
"cat_columns": cat_columns,
|
| 145 |
+
"cat_mask_local_ids": cat_mask_local_ids,
|
| 146 |
+
"id_to_label_by_col": id_to_label_by_col,
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
# -----------------------------------------------------------------------------
|
| 151 |
+
# Image handling, matching loader.py behavior
|
| 152 |
+
# -----------------------------------------------------------------------------
|
| 153 |
+
|
| 154 |
+
class CenterSquareCrop:
|
| 155 |
+
def __call__(self, img: Image.Image) -> Image.Image:
|
| 156 |
+
w, h = img.size
|
| 157 |
+
if w == h:
|
| 158 |
+
return img
|
| 159 |
+
if w > h:
|
| 160 |
+
left = (w - h) // 2
|
| 161 |
+
return img.crop((left, 0, left + h, h))
|
| 162 |
+
top = (h - w) // 2
|
| 163 |
+
return img.crop((0, top, w, top + w))
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def build_image_transform(image_size: int):
|
| 167 |
+
return transforms.Compose([
|
| 168 |
+
CenterSquareCrop(),
|
| 169 |
+
transforms.Resize((image_size, image_size)),
|
| 170 |
+
transforms.ToTensor(),
|
| 171 |
+
])
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def join_photo_root(photo_root: str, relative_path: str) -> str:
|
| 175 |
+
if photo_root.startswith("http://") or photo_root.startswith("https://"):
|
| 176 |
+
return urljoin(photo_root.rstrip("/") + "/", relative_path)
|
| 177 |
+
return photo_root.rstrip("/") + "/" + relative_path.lstrip("/")
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def load_image_tensor(image_path: str, image_size: int) -> torch.Tensor:
|
| 181 |
+
if image_path.startswith("http://") or image_path.startswith("https://"):
|
| 182 |
+
resp = requests.get(image_path, timeout=(3, 10))
|
| 183 |
+
resp.raise_for_status()
|
| 184 |
+
img = Image.open(BytesIO(resp.content)).convert("RGB")
|
| 185 |
+
else:
|
| 186 |
+
img = Image.open(image_path).convert("RGB")
|
| 187 |
+
return build_image_transform(image_size)(img)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
# -----------------------------------------------------------------------------
|
| 191 |
+
# Tensorization from readable input card
|
| 192 |
+
# -----------------------------------------------------------------------------
|
| 193 |
+
|
| 194 |
+
def is_masked_or_missing(value: Any) -> bool:
|
| 195 |
+
return value is None or value == ""
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def parse_numeric_card_value(value: Any, n_in: int) -> Tuple[list[float], bool]:
|
| 199 |
+
if value is None or value == "":
|
| 200 |
+
return [0.0] * n_in, False
|
| 201 |
+
|
| 202 |
+
if n_in == 1:
|
| 203 |
+
if isinstance(value, list):
|
| 204 |
+
if len(value) != 1:
|
| 205 |
+
raise ValueError(f"Expected scalar or length-1 list for n_in=1, got {value!r}")
|
| 206 |
+
return [float(value[0])], True
|
| 207 |
+
return [float(value)], True
|
| 208 |
+
|
| 209 |
+
if isinstance(value, str):
|
| 210 |
+
parsed = ast.literal_eval(value)
|
| 211 |
+
else:
|
| 212 |
+
parsed = value
|
| 213 |
+
|
| 214 |
+
if not isinstance(parsed, (list, tuple)):
|
| 215 |
+
raise ValueError(f"Expected list-like numeric vector for n_in={n_in}, got {value!r}")
|
| 216 |
+
if len(parsed) != n_in:
|
| 217 |
+
raise ValueError(f"Numeric vector length mismatch: expected {n_in}, got {len(parsed)}")
|
| 218 |
+
return [float(v) for v in parsed], True
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def tensorize_card(
|
| 222 |
+
input_card: Dict[str, Any],
|
| 223 |
+
config_data: Dict[str, Any],
|
| 224 |
+
meta: Dict[str, Any],
|
| 225 |
+
) -> Dict[str, Any]:
|
| 226 |
+
categorical = input_card.get("categorical", {})
|
    numeric = input_card.get("numeric", {})
    vision = input_card.get("vision", {})

    if not isinstance(categorical, dict):
        raise ValueError("input_card['categorical'] must be an object")
    if not isinstance(numeric, dict):
        raise ValueError("input_card['numeric'] must be an object")
    if not isinstance(vision, dict):
        vision = {}

    # Categorical: raw label -> local id, null/"" -> mask id and invalid.
    cat_ids = []
    cat_valids = []
    for col, mask_id in zip(meta["cat_columns"], meta["cat_mask_local_ids"]):
        value = categorical.get(col, "")
        if is_masked_or_missing(value):
            cat_ids.append(mask_id)
            cat_valids.append(False)
        else:
            label2id = meta["cat_vocab"][col]["label2id"]
            if value not in label2id:
                raise KeyError(f"Unknown categorical value: column={col}, value={value!r}")
            cat_ids.append(int(label2id[value]))
            cat_valids.append(True)

    cat_local_ids = torch.tensor([cat_ids], dtype=torch.long)
    cat_valid_positions = torch.tensor([cat_valids], dtype=torch.bool)

    # Numeric: raw actual units -> z-score grouped tensors.
    numeric_values_by_nin = {}
    numeric_valid_positions_by_nin = {}

    for group in meta["numeric_vocab"]["groups"]:
        n_in = int(group["n_in"])
        values = []
        valids = []
        for feat in group["feature_names"]:
            feat = str(feat)
            raw_value = numeric.get(feat, "")
            parsed, is_valid = parse_numeric_card_value(raw_value, n_in)
            if is_valid:
                mean, std = meta["numeric_stats"][feat]
                parsed = [(v - mean) / std for v in parsed]
            values.append(parsed)
            valids.append(is_valid)

        numeric_values_by_nin[n_in] = torch.tensor([values], dtype=torch.float32)
        numeric_valid_positions_by_nin[n_in] = torch.tensor([valids], dtype=torch.bool)

    # Vision: readable card stores suffix only. Load/transform here.
    image_size = int(config_data["image_size"])
    image_path_suffix = vision.get("image_path_suffix", "")
    if image_path_suffix is None or image_path_suffix == "":
        pixel_values = torch.zeros(1, 3, image_size, image_size, dtype=torch.float32)
        vision_valid_positions = torch.tensor([False], dtype=torch.bool)
    else:
        image_path = join_photo_root(str(config_data["photo_root"]), str(image_path_suffix))
        try:
            image = load_image_tensor(image_path, image_size=image_size)
            pixel_values = image.unsqueeze(0)
            vision_valid_positions = torch.tensor([True], dtype=torch.bool)
        except Exception as exc:
            print(f"[WARN] Could not load image; using zero vision input: {exc}")
            pixel_values = torch.zeros(1, 3, image_size, image_size, dtype=torch.float32)
            vision_valid_positions = torch.tensor([False], dtype=torch.bool)

    return {
        "cat_local_ids": cat_local_ids,
        "cat_valid_positions": cat_valid_positions,
        "numeric_values_by_nin": numeric_values_by_nin,
        "numeric_valid_positions_by_nin": numeric_valid_positions_by_nin,
        "pixel_values": pixel_values,
        "vision_valid_positions": vision_valid_positions,
    }

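
# Illustrative sketch only (not part of the original pipeline): how the card
# convention above maps to tensors. The column names and vocab below are made
# up; the real ones come from data/cat_vocab.json via load_metadata().
def _demo_card_convention() -> None:
    label2id = {"clay": 0, "loam": 1, "sand": 2, "__MASK__": 3}
    card = {"texture": "loam", "color": None, "structure": ""}  # null = predict, "" = missing
    ids, valids = [], []
    for col in ("texture", "color", "structure"):
        value = card[col]
        if value is None or value == "":
            ids.append(label2id["__MASK__"])  # both map to the mask id ...
            valids.append(False)              # ... and are flagged invalid
        else:
            ids.append(label2id[value])
            valids.append(True)
    print(torch.tensor([ids]))     # tensor([[1, 3, 3]])
    print(torch.tensor([valids]))  # tensor([[ True, False, False]])
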
def move_batch_to_device(batch: Dict[str, Any], device: torch.device, dtype: torch.dtype) -> Dict[str, Any]:
    out = {}
    for key, value in batch.items():
        if isinstance(value, torch.Tensor):
            if value.dtype.is_floating_point:
                out[key] = value.to(device=device, dtype=dtype)
            else:
                out[key] = value.to(device=device)
        elif isinstance(value, dict):
            sub = {}
            for k, v in value.items():
                if isinstance(v, torch.Tensor):
                    if v.dtype.is_floating_point:
                        sub[k] = v.to(device=device, dtype=dtype)
                    else:
                        sub[k] = v.to(device=device)
                else:
                    sub[k] = v
            out[key] = sub
        else:
            out[key] = value
    return out


# -----------------------------------------------------------------------------
# Decoding model outputs to readable card
# -----------------------------------------------------------------------------

def denormalize_numeric(values_z: list[float], mean: float, std: float) -> list[float]:
    return [float(v) * float(std) + float(mean) for v in values_z]

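# e.g., with hypothetical stats mean=6.5, std=0.8:
#   denormalize_numeric([0.0, 1.0], 6.5, 0.8) -> [6.5, 7.3]
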
def decode_outputs(
    cat_logits_padded: torch.Tensor,
    valid_class_mask: torch.Tensor,
    value_by_nin: Dict[int, torch.Tensor],
    meta: Dict[str, Any],
) -> Dict[str, Any]:
    cat_logits = cat_logits_padded.detach().float().cpu()
    valid_class_mask = valid_class_mask.detach().cpu().bool()

    categorical_out = {}
    for m, col in enumerate(meta["cat_columns"]):
        cm = int(valid_class_mask[m].sum().item())
        logits = cat_logits[0, m, :cm]
        probs = torch.softmax(logits, dim=-1)
        pred_id = int(torch.argmax(probs).item())
        pred_label = meta["id_to_label_by_col"][col].get(pred_id, str(pred_id))
        categorical_out[col] = pred_label

    numeric_out = {}
    for group in meta["numeric_vocab"]["groups"]:
        n_in = int(group["n_in"])
        preds_z = value_by_nin[n_in].detach().float().cpu()[0]  # [V, n_in]
        for v_idx, feat in enumerate(group["feature_names"]):
            feat = str(feat)
            mean, std = meta["numeric_stats"][feat]
            raw_pred_values = denormalize_numeric(preds_z[v_idx].tolist(), mean, std)
            if n_in == 1:
                numeric_out[feat] = raw_pred_values[0]
            else:
                numeric_out[feat] = raw_pred_values

    return {
        "categorical": categorical_out,
        "numeric": numeric_out,
    }

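# Note on the slicing above: logits_padded fills invalid class positions with
# the dtype minimum (see modelling/decode_categorical.py), so restricting the
# softmax/argmax to the first cm entries and letting the padded tail carry
# ~zero probability are equivalent. A tiny self-contained check:
#
#   logits = torch.tensor([1.0, 2.0, torch.finfo(torch.float32).min])  # C_m=2, Cmax=3
#   torch.argmax(torch.softmax(logits[:2], -1))  # tensor(1)
#   torch.argmax(torch.softmax(logits, -1))      # tensor(1) as well
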
# -----------------------------------------------------------------------------
# Accuracy / MAE analysis
# -----------------------------------------------------------------------------

def masked_feature_names(input_card: Dict[str, Any], section: str) -> list[str]:
    values = input_card.get(section, {})
    if not isinstance(values, dict):
        return []
    return [k for k, v in values.items() if v is None]

def numeric_abs_errors(pred_value: Any, answer_value: Any) -> list[float]:
    if answer_value is None or answer_value == "":
        return []
    if pred_value is None or pred_value == "":
        return []

    if isinstance(answer_value, str):
        s = answer_value.strip()
        if s == "":
            return []
        if s.startswith("[") and s.endswith("]"):
            answer_value = [float(x) for x in ast.literal_eval(s)]
        else:
            answer_value = float(s)

    if isinstance(pred_value, str):
        s = pred_value.strip()
        if s.startswith("[") and s.endswith("]"):
            pred_value = [float(x) for x in ast.literal_eval(s)]
        else:
            pred_value = float(s)

    if isinstance(answer_value, (list, tuple)):
        if not isinstance(pred_value, (list, tuple)):
            return []
        if len(pred_value) != len(answer_value):
            return []
        return [abs(float(p) - float(a)) for p, a in zip(pred_value, answer_value)]

    return [abs(float(pred_value) - float(answer_value))]

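# Worked examples of the parsing rules above (values are hypothetical):
#   numeric_abs_errors("3.5", 3.0)               -> [0.5]
#   numeric_abs_errors([1.0, 2.0], "[1.5, 1.0]") -> [0.5, 1.0]
#   numeric_abs_errors(None, 3.0)                -> []  (no prediction, skipped)
#   numeric_abs_errors([1.0], [1.0, 2.0])        -> []  (length mismatch, skipped)
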
def evaluate_against_answer(
    input_card: Dict[str, Any],
    output_card: Dict[str, Any],
    answer_card: Dict[str, Any],
) -> Dict[str, Any]:
    cat_masked = masked_feature_names(input_card, "categorical")
    num_masked = masked_feature_names(input_card, "numeric")

    cat_details = {}
    correct = 0
    total = 0
    for feat in cat_masked:
        answer = answer_card.get("categorical", {}).get(feat)
        pred = output_card.get("categorical", {}).get(feat)
        if answer is None or answer == "":
            continue
        is_correct = pred == answer
        cat_details[feat] = {
            "predicted": pred,
            "answer": answer,
            "correct": bool(is_correct),
        }
        correct += int(is_correct)
        total += 1

    num_details = {}
    abs_errors_all = []
    for feat in num_masked:
        answer = answer_card.get("numeric", {}).get(feat)
        pred = output_card.get("numeric", {}).get(feat)
        errors = numeric_abs_errors(pred, answer)
        if not errors:
            continue
        mae = sum(errors) / len(errors)
        num_details[feat] = {
            "predicted": pred,
            "answer": answer,
            "absolute_error": errors[0] if len(errors) == 1 else errors,
            "mae": mae,
        }
        abs_errors_all.extend(errors)

    return {
        "categorical": {
            "accuracy": None if total == 0 else correct / total,
            "correct": correct,
            "total": total,
            "details": cat_details,
        },
        "numeric": {
            "mae": None if len(abs_errors_all) == 0 else sum(abs_errors_all) / len(abs_errors_all),
            "count": len(abs_errors_all),
            "details": num_details,
        },
        "note": "Metrics are computed only on fields that are null in input_card. Natural missing values \"\" are ignored.",
    }

def acc_path_from_output(output: str) -> Path:
    path = Path(output)
    if path.suffix == ".json":
        base = path.with_suffix("")
    else:
        base = path
    return base.with_name(base.name + "__acc.json")

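# e.g., acc_path_from_output("example/output_card.json")
#       -> Path("example/output_card__acc.json")  (matches the shipped example files)
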
# -----------------------------------------------------------------------------
# CLI
# -----------------------------------------------------------------------------

def main() -> None:
    parser = argparse.ArgumentParser(description="Run SoilFormer inference from a readable input card.")
    parser.add_argument("--input_card", type=str, required=True)
    parser.add_argument("--output", type=str, required=True)
    parser.add_argument("--answer_card", type=str, default=None)
    parser.add_argument("--checkpoint", type=str, required=True)
    parser.add_argument("--config_data", type=str, default="config/config_data.json")
    parser.add_argument("--config_model", type=str, default="config/config_model.json")
    parser.add_argument("--device", type=str, default="auto", choices=["auto", "cuda", "mps", "cpu"])
    args = parser.parse_args()

    config_data = load_json(args.config_data)
    config_model = load_json(args.config_model)
    dtype = get_dtype(config_model.get("dtype", "bfloat16"))
    device = resolve_device(args.device)

    meta = load_metadata(config_data)
    input_card = load_card(args.input_card)
    batch = tensorize_card(input_card=input_card, config_data=config_data, meta=meta)
    batch = move_batch_to_device(batch, device=device, dtype=dtype)

    model = load_model(args=args, config_model=config_model, device=device, dtype=dtype)

    with torch.no_grad():
        cat_logits_padded, cat_s, valid_class_mask, value_by_nin, s_by_nin, _ = model(
            cat_local_ids=batch["cat_local_ids"],
            numeric_values_by_nin=batch["numeric_values_by_nin"],
            cat_valid_positions=batch["cat_valid_positions"],
            numeric_valid_positions_by_nin=batch["numeric_valid_positions_by_nin"],
            pixel_values=batch["pixel_values"],
            vision_valid_positions=batch["vision_valid_positions"],
        )

    output_card = decode_outputs(
        cat_logits_padded=cat_logits_padded,
        valid_class_mask=valid_class_mask,
        value_by_nin=value_by_nin,
        meta=meta,
    )

    save_json_pretty(to_jsonable(output_card), Path(args.output))

    result = {"status": "ok", "output": args.output}

    if args.answer_card:
        answer_card = load_card(args.answer_card)
        acc_card = evaluate_against_answer(
            input_card=input_card,
            output_card=output_card,
            answer_card=answer_card,
        )
        acc_path = acc_path_from_output(args.output)
        save_json_pretty(to_jsonable(acc_card), acc_path)
        result["acc_output"] = str(acc_path)

    print(json.dumps(result, ensure_ascii=False))


if __name__ == "__main__":
    main()
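
A plausible end-to-end invocation against the shipped example files (the flags are exactly those registered in main() above; pairing --answer_card with example/input_card__unmasked.json is an assumption about how the masked/unmasked pair is meant to be used):

    python inference_predict_output_card.py \
        --input_card example/input_card__masked.json \
        --output example/output_card.json \
        --answer_card example/input_card__unmasked.json \
        --checkpoint model_weights/soilformer_pretrain/hetero_epoch_200.pt
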
model_weights/gemma3n_E2B_vision_only/config.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:df49c2835315d4de6753bea989198e66157d84aa831738227f3bc705eab2d746
size 4455

model_weights/gemma3n_E2B_vision_only/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:eed8742f2e68b0d28bac29ee591a97e6738b6d040e0a5b69d270fca1d1453e20
size 597245920

model_weights/gemma3n_E2B_vision_only/modeling_gemma3n.py
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:78b0b5d14177913d7956279f7a08b62f45f5b0ca6ab1993507fc653ad9579b0c
size 114392

model_weights/gemma3n_E2B_vision_only/processor_config.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a3f52ae9fb2eeed632fc99f14fa8b4405b17cd4b760a369cddf366f9ccf6855b
size 2262

model_weights/gemma3n_E2B_vision_only/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7fad9b5f6f930b43d292eb3c56c176a69292850ddd0abc02d9ea1dac3292c87a
size 33442428

model_weights/gemma3n_E2B_vision_only/tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:10c688d1767007b8614f275427198205507d941aefa6ae63c3e429ef87de7999
size 936

model_weights/gemma3n_E2B_vision_only/vision_extractor_config.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ea31eaf2aec2df075d62a4bca2209763e97a0141122257b07e62fe79e3cf4564
size 156

model_weights/soilformer_pretrain/hetero_epoch_200.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:057cd623e72bbf477bd46f346506acfd5741c2b57326d2bc73e723ac3ea949fc
size 276126967
modelling/__init__.py
ADDED
File without changes
modelling/decode_categorical.py
ADDED
@@ -0,0 +1,423 @@
# decode_categorical.py
# -*- coding: utf-8 -*-

"""
Categorical decoder for tabular transformer.

Design (column-wise heads):
- Each categorical column corresponds to exactly 1 token.
- Each column has its own classifier head:
      hidden_size -> num_classes[col]
  Optionally with a small MLP:
      hidden_size -> middle_size -> num_classes[col]

No loss is included here (caller will apply CrossEntropyLoss).
"""

from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn

from utils import load_json, GroupedMLP


# ============================================================
# Small head builder
# ============================================================

def _make_head(
    hidden_size: int,
    num_classes: int,
    middle_size: Optional[int],
    bias: bool = True,
) -> nn.Module:
    """
    Build a lightweight per-column classifier head.
    """
    if middle_size is None:
        return nn.Linear(hidden_size, num_classes, bias=bias)

    return nn.Sequential(
        nn.Linear(hidden_size, middle_size, bias=bias),
        nn.GELU(),
        nn.Linear(middle_size, num_classes, bias=bias),
    )


# ============================================================
# Decoder
# ============================================================

class CategoricalDecoder(nn.Module):
    """
    Column-wise categorical decoder.

    Design:
    - Each categorical column corresponds to exactly one token.
    - Each column has its own classifier head:
          hidden_size -> num_classes[col]
      Optionally with a small MLP:
          hidden_size -> middle_size -> num_classes[col]

    - In addition, the decoder predicts a per-sample, per-column
      log-variance term `s` used for heteroscedastic loss weighting.

    Input:
        x_cat_tokens: [B, M, H]
            B = batch size
            M = number of categorical columns (ordered by col_id)
            H = hidden size

    Outputs:

        Case 1 (return_padded=False):
            logits_list: List[Tensor] length M
                logits_list[m]: [B, num_classes[m]]

            s: [B, M]
                Predicted log-variance per sample and column:
                    s[b, m] = log sigma^2_{b,m}
                Intended for heteroscedastic loss weighting.

        Case 2 (return_padded=True):
            logits_padded: [B, M, Cmax]
                Logits padded to the maximum class count across columns.

            s: [B, M]
                Same uncertainty prediction as above.

            valid_mask: [M, Cmax]
                True for valid class indices for each column.
    """

    def __init__(
        self,
        hidden_size: int,
        cat_vocab_json: str,
        middle_size: Optional[int] = None,
        bias: bool = True,
        homoscedastic: bool = True,
    ):
        super().__init__()

        spec = load_json(cat_vocab_json)
        items = sorted(spec.items(), key=lambda x: x[1]["col_id"])

        col_ids: List[int] = []
        num_classes: List[int] = []

        for _, val in items:
            col_ids.append(int(val["col_id"]))
            num_classes.append(int(val["num_classes"]))

        self.hidden_size = int(hidden_size)
        self.num_cols = len(num_classes)
        self.middle_size = middle_size
        self.homoscedastic = bool(homoscedastic)

        # Buffers for debugging / validation / optional padded output
        self.register_buffer("cat_col_ids", torch.tensor(col_ids, dtype=torch.long), persistent=True)  # [M]
        self.register_buffer("num_classes", torch.tensor(num_classes, dtype=torch.long), persistent=True)  # [M]

        # Build per-column heads
        heads = []
        for c in num_classes:
            head = _make_head(self.hidden_size, c, middle_size, bias=bias)
            heads.append(head)

        self.heads = nn.ModuleList(heads)

        if self.homoscedastic:
            self.s_param = nn.Parameter(torch.zeros(self.num_cols))
            self.s_head = None
        else:
            self.s_head = GroupedMLP(
                n_var=self.num_cols,
                n_in=self.hidden_size,
                n_out=1,
                middle_size=self.middle_size,
            )
            self.s_param = None

    def init_weights(self, std: float = 0.02):
        for head in self.heads:
            for module in head.modules():
                if isinstance(module, nn.Linear):
                    nn.init.normal_(module.weight, std=std)
                    if module.bias is not None:
                        nn.init.zeros_(module.bias)

        if self.homoscedastic:
            nn.init.zeros_(self.s_param)
        else:
            self.s_head.init_weights(std=0.0)

    def _check_input(self, x_cat_tokens: torch.Tensor) -> Tuple[int, int, int]:
        if x_cat_tokens.dim() != 3:
            raise ValueError(f"x_cat_tokens must be [B,M,H], got {tuple(x_cat_tokens.shape)}")
        B, M, H = x_cat_tokens.shape
        if H != self.hidden_size:
            raise ValueError(f"hidden_size mismatch: got {H}, expected {self.hidden_size}")
        if M != self.num_cols:
            raise ValueError(f"categorical token count mismatch: got M={M}, expected {self.num_cols}")
        return B, M, H

    @torch.no_grad()
    def _build_valid_mask(self, device: torch.device) -> torch.Tensor:
        """
        valid_mask[m, j] = True iff j < num_classes[m]
        """
        M = self.num_cols
        cmax = int(self.num_classes.max().item())
        ar = torch.arange(cmax, device=device).view(1, cmax).expand(M, cmax)
        nc = self.num_classes.view(M, 1).expand(M, cmax)
        return ar < nc

    def forward(
        self,
        x_cat_tokens: torch.Tensor,
        return_padded: bool = False,
        pad_value: Optional[float] = None,
    ) -> Union[
        Tuple[List[torch.Tensor], torch.Tensor],
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
    ]:
        """
        Args:
            x_cat_tokens: [B, M, H]
                B = batch size
                M = number of categorical columns
                H = hidden size (per-column token embedding dim)

            return_padded:
                False:
                    return (logits_list, s)
                True:
                    return (logits_padded, s, valid_mask)

            pad_value:
                Value used to fill invalid class positions in padded logits.

        Returns:

            Case 1 (return_padded=False):
                logits_list: List length M
                    logits_list[m]: [B, C_m]
                s: [B, M]
                    s[b, m] = log sigma^2 for sample b, column m

            Case 2 (return_padded=True):
                logits_padded: [B, M, Cmax]
                s: [B, M]
                valid_mask: [M, Cmax]
        """

        # --------------------------------------------------------
        # 1) Basic shape validation
        # --------------------------------------------------------
        # Ensures x_cat_tokens is [B,M,H] and matches decoder config
        B, M, _ = self._check_input(x_cat_tokens)

        # --------------------------------------------------------
        # 2) Per-column categorical logits
        # --------------------------------------------------------
        # We still use per-column heads because each column
        # can have a different number of classes C_m.
        #
        # logits_list[m] shape: [B, C_m]
        logits_list: List[torch.Tensor] = []
        for m in range(M):
            # x_cat_tokens[:, m, :] -> [B,H]
            # heads[m] maps H -> C_m
            logits_m = self.heads[m](x_cat_tokens[:, m, :])
            logits_list.append(logits_m)

        # --------------------------------------------------------
        # 3) Sample-wise & column-wise uncertainty (log-variance)
        # --------------------------------------------------------
        # s_head processes all columns at once (grouped, no loop)
        #
        # Input:  [B,M,H]
        # Output: [B,M]
        #
        # s[b,m] = log(sigma_{b,m}^2)
        if self.homoscedastic:
            s = self.s_param.unsqueeze(0).expand(B, -1)
        else:
            s = self.s_head(x_cat_tokens).squeeze(-1)

        # --------------------------------------------------------
        # 4) If no padded output requested
        # --------------------------------------------------------
        if not return_padded:
            # Return:
            #   logits_list: List of length M
            #   s: [B,M]
            return logits_list, s

        # --------------------------------------------------------
        # 5) Build padded logits tensor
        # --------------------------------------------------------
        # We unify different C_m into a common Cmax.
        #
        # logits_padded shape: [B,M,Cmax]
        cmax = int(self.num_classes.max().item())

        if pad_value is None:
            pad_value = torch.finfo(x_cat_tokens.dtype).min
        logits_padded = torch.full(
            (B, M, cmax),
            pad_value,
            device=x_cat_tokens.device,
            dtype=x_cat_tokens.dtype,
        )

        # Fill valid class positions per column
        for m in range(M):
            cm = logits_list[m].size(-1)  # C_m
            logits_padded[:, m, :cm] = logits_list[m]

        # --------------------------------------------------------
        # 6) Build validity mask
        # --------------------------------------------------------
        # valid_mask[m,j] = True  if j < C_m
        #                 = False otherwise
        #
        # Shape: [M, Cmax]
        valid_class_mask = self._build_valid_mask(device=x_cat_tokens.device)

        # --------------------------------------------------------
        # 7) Return padded outputs
        # --------------------------------------------------------
        return logits_padded, s, valid_class_mask


# ============================================================
# DEMO
# ============================================================

def _demo_main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--cat_vocab_json", type=str, default="data/cat_vocab.json")
    parser.add_argument("--hidden_size", type=int, default=768)
    parser.add_argument("--middle_size", type=int, default=None)
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--device", type=str, default=None)
    parser.add_argument("--dtype", type=str, default="float32", choices=["float16", "bfloat16", "float32"])
    args = parser.parse_args()

    device = torch.device(args.device or ("cuda" if torch.cuda.is_available() else "cpu"))
    dtype_map = {
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "float32": torch.float32,
    }
    dtype = dtype_map[args.dtype]

    # --------------------------------------------------------
    # Load vocab spec
    # --------------------------------------------------------
    spec = load_json(args.cat_vocab_json)
    items = sorted(spec.items(), key=lambda x_: x_[1]["col_id"])

    M = len(items)
    B = args.batch_size
    H = args.hidden_size

    num_classes = [int(s["num_classes"]) for _, s in items]

    print("===== Categorical Columns =====")
    for i, (name, s) in enumerate(items):
        print(f"{i:03d} {name:20s} classes={s['num_classes']}")
    print()

    # --------------------------------------------------------
    # Build model
    # --------------------------------------------------------
    model = CategoricalDecoder(
        hidden_size=args.hidden_size,
        cat_vocab_json=args.cat_vocab_json,
        middle_size=args.middle_size,
    ).to(device=device, dtype=dtype)

    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f"Model parameters: {total_params:,} (trainable: {trainable_params:,})")
    print()

    # --------------------------------------------------------
    # Fake input tokens
    # --------------------------------------------------------
    x = torch.randn(B, M, H, device=device, dtype=dtype)

    print("Input tokens shape:", tuple(x.shape))
    print()

    # --------------------------------------------------------
    # Case 1: logits_list
    # --------------------------------------------------------
    print("===== Forward: logits_list mode =====")

    with torch.no_grad():
        logits_list, s = model(x, return_padded=False)

    for m, (name, spec_item) in enumerate(items):
        C = spec_item["num_classes"]
        print(f"{m:03d} {name:20s} logits:", tuple(logits_list[m].shape), f"(expected {(B, C)})")

    print("s shape:", tuple(s.shape))
    print()

    # --------------------------------------------------------
    # Case 2: padded logits
    # --------------------------------------------------------
    print("===== Forward: padded mode =====")

    with torch.no_grad():
        logits_padded, s2, valid_mask = model(x, return_padded=True)

    print("logits_padded:", tuple(logits_padded.shape))
    print("s:", tuple(s2.shape))
    print("valid_mask:", tuple(valid_mask.shape))
    print()

    # --------------------------------------------------------
    # Visualize valid mask
    # --------------------------------------------------------
    print("===== Valid class mask (first 10 columns) =====")

    cols_to_show = min(10, M)
    for m in range(cols_to_show):
        cm = num_classes[m]
        valid = valid_mask[m].sum().item()
        print(f"col {m:02d} num_classes={cm} valid_mask_sum={valid}")

    print()

    # --------------------------------------------------------
    # Check padded logits correctness
    # --------------------------------------------------------
    print("===== Padded logits sanity check =====")

    for m in range(cols_to_show):
        cm = num_classes[m]

        valid_region = logits_padded[:, m, :cm]
        padded_region = logits_padded[:, m, cm:]

        print(f"col {m:02d} valid region shape:", tuple(valid_region.shape))

        if padded_region.numel() > 0:
            print(f"col {m:02d} padded region mean:", padded_region.mean().item())

    print()

    print("Demo finished successfully.")


if __name__ == "__main__":
    _demo_main()
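
The `s = log sigma^2` outputs above are only predicted here; the weighting itself happens in modelling/train.py (not shown in this section), and the checkpoint name hetero_epoch_200.pt suggests the heteroscedastic variant. A minimal sketch of the usual form (after Kendall & Gal; the exact loss SoilFormer uses may differ):

import torch

# Hypothetical per-column cross-entropies and predicted log-variances for one sample.
ce = torch.tensor([1.2, 0.3])
s = torch.tensor([0.5, -0.2])

# Columns the model is uncertain about (large s) are down-weighted, with s as a
# regularizer that keeps the predicted variance from growing without bound.
loss = (torch.exp(-s) * ce + s).mean()
print(loss)  # tensor(0.6971)
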
modelling/decode_numeric.py
ADDED
@@ -0,0 +1,238 @@
# decode_numeric.py
# -*- coding: utf-8 -*-

"""
Numeric decoder module for tabular transformer.

Symmetric to embed_numeric.py (bucketed by n_in):
- For each bucket (same n_in), we decode tokens without a Python for-loop over columns.
- Uses a batched per-variable MLP with per-column parameters (NOT shared across V).

Input:
    x_tokens: [B, total_numeric_tokens, H]
        token order must match numeric_vocab.json:
        groups by n_in ascending, within group by feature name,
        and within each feature: n_in tokens.

Output:
    values_by_nin: Dict[int, Tensor]
        n_in -> x_hat [B, V, n_in]

middle_size:
    - None: 1-layer per-variable Linear
    - int : 2-layer per-variable MLP (Linear -> GELU -> Linear)
"""

from typing import Dict, List, Optional

import torch
import torch.nn as nn

from utils import GroupedMLP, load_json


class NumericDecoder(nn.Module):
    """
    Decode numeric tokens back to numeric values, bucketed by n_in.

    Input:
        x_tokens: [B, total_numeric_tokens, H]

    Output:
        values_by_nin:
            n_in -> y_hat [B, V, n_in]

        s_by_nin:
            n_in -> s [B, V]
            where s = log(sigma^2), shared across the n_in dimensions
            of each variable, intended for heteroscedastic loss computation.
    """

    def __init__(
        self,
        hidden_size: int,
        numeric_vocab_json: str,
        middle_size: Optional[int] = None,
        homoscedastic: bool = True,
    ):
        super().__init__()
        self.hidden_size = int(hidden_size)
        self.middle_size = None if middle_size is None else int(middle_size)
        self.homoscedastic = bool(homoscedastic)

        spec = load_json(numeric_vocab_json)
        self.groups: List[Dict] = list(spec["groups"])
        self.total_numeric_tokens = int(spec["total_numeric_tokens"])
        self.group_token_offsets: Dict[str, int] = dict(spec.get("group_token_offsets", {}))

        self.group_v_decoders = nn.ModuleList()
        self.group_s_decoders = nn.ModuleList()
        self.group_nins: List[int] = []
        self.group_Vs: List[int] = []

        for g in self.groups:
            n_in = int(g["n_in"])
            names = list(g["feature_names"])
            V = len(names)

            self.group_nins.append(n_in)  # noqa
            self.group_Vs.append(V)

            # value decoder: [B,V,n_in*H] -> [B,V,n_in]
            self.group_v_decoders.append(
                GroupedMLP(
                    n_var=V,
                    n_in=n_in * self.hidden_size,
                    n_out=n_in,
                    middle_size=self.middle_size,
                )
            )

            # uncertainty decoder: [B,V,H] -> [B,V,1] -> [B,V]
            if not self.homoscedastic:
                self.group_s_decoders.append(
                    GroupedMLP(
                        n_var=V,
                        n_in=self.hidden_size,
                        n_out=1,
                        middle_size=self.middle_size,
                    )
                )

        if self.homoscedastic:
            self.group_s_params = nn.ParameterList(
                [nn.Parameter(torch.zeros(V)) for V in self.group_Vs]
            )
        else:
            self.group_s_params = None

        # spec integrity check
        running = 0
        for g in self.groups:
            n_in = int(g["n_in"])
            V = len(g["feature_names"])
            key = str(n_in)

            if key not in self.group_token_offsets:
                raise ValueError(f"Missing group_token_offsets entry for n_in={n_in}")
            if int(self.group_token_offsets[key]) != running:
                raise ValueError(
                    f"group_token_offsets[{key}]={self.group_token_offsets[key]} does not match expected {running}"
                )

            running += V * n_in

        if running != self.total_numeric_tokens:
            raise ValueError(
                f"total_numeric_tokens={self.total_numeric_tokens} does not match expected {running}"
            )

    def init_weights(self, std: float = 0.02):
        for dec in self.group_v_decoders:
            dec.init_weights(std=std)

        if self.homoscedastic:
            for p in self.group_s_params:
                nn.init.zeros_(p)
        else:
            for dec in self.group_s_decoders:
                dec.init_weights(std=0.0)

    def forward(self, x_tokens: torch.Tensor):
        if x_tokens.dim() != 3:
            raise ValueError(f"x_tokens must be [B,T,H], got {tuple(x_tokens.shape)}")

        B, T, H = x_tokens.shape
        if H != self.hidden_size:
            raise ValueError(f"hidden_size mismatch: got H={H}, expected {self.hidden_size}")
        if T != self.total_numeric_tokens:
            raise ValueError(f"token length mismatch: got T={T}, expected {self.total_numeric_tokens}")

        value_out: Dict[int, torch.Tensor] = {}
        s_out: Dict[int, torch.Tensor] = {}

        for gi, n_in in enumerate(self.group_nins):
            key = str(n_in)
            start = int(self.group_token_offsets[key])

            V = self.group_Vs[gi]
            length = V * n_in

            xg_tok = x_tokens[:, start:start + length, :]  # [B, V*n_in, H]
            xg_tok4 = xg_tok.reshape(B, V, n_in, H)        # [B, V, n_in, H]
            xg_flat = xg_tok4.reshape(B, V, n_in * H)      # [B, V, n_in*H]

            # values: [B, V, n_in]
            y = self.group_v_decoders[gi](xg_flat)

            # s = log sigma^2: [B, V]
            if self.homoscedastic:
                s = self.group_s_params[gi].unsqueeze(0).expand(B, -1)
            else:
                x_var = xg_tok4.mean(dim=2)  # [B, V, H]
                s = self.group_s_decoders[gi](x_var).squeeze(-1)  # [B, V]

            value_out[n_in] = y
            s_out[n_in] = s

        return value_out, s_out


# ============================================================
# DEMO
# ============================================================

def _demo_main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--numeric_vocab_json", type=str, default="data/numeric_vocab.json")
    parser.add_argument("--hidden_size", type=int, default=768)
    parser.add_argument("--middle_size", type=int, default=-1,
                        help="If <0 -> one-layer. If >=0 -> two-layer with this middle size.")
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--device", type=str, default=None)
    parser.add_argument("--dtype", type=str, default="float32", choices=["float16", "bfloat16", "float32"])
    args = parser.parse_args()

    device = torch.device(args.device or ("cuda" if torch.cuda.is_available() else "cpu"))
    dtype_map = {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}
    dtype = dtype_map[args.dtype]

    # Directly load existing numeric vocab spec
    spec = load_json(args.numeric_vocab_json)
    print(f"Loaded numeric vocab spec from: {args.numeric_vocab_json}")
    print("Groups (n_in -> V):", {int(g['n_in']): len(g['feature_names']) for g in spec["groups"]})
    print("total_numeric_tokens:", spec["total_numeric_tokens"])
    print("group_token_offsets:", spec["group_token_offsets"])

    middle_size = None if args.middle_size < 0 else int(args.middle_size)
    model = NumericDecoder(
        hidden_size=args.hidden_size,
        numeric_vocab_json=args.numeric_vocab_json,
        middle_size=middle_size,
    ).to(device=device, dtype=dtype)
    model.eval()

    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters (NumericDecoder): {total_params:,} (trainable: {trainable_params:,})")

    B = args.batch_size
    T = int(spec["total_numeric_tokens"])
    H = args.hidden_size

    x_tokens = torch.randn(B, T, H, device=device, dtype=dtype)

    with torch.no_grad():
        values_by_nin, s_by_nin = model(x_tokens)

    print("Input tokens:", tuple(x_tokens.shape), x_tokens.dtype, x_tokens.device)
    print("Decoded values:", {k: tuple(v.shape) for k, v in values_by_nin.items()})
    print("Decoded s:", {k: tuple(s.shape) for k, s in s_by_nin.items()})
    # values_by_nin[n_in]: [B, V, n_in]
    # s_by_nin[n_in]: [B, V]


if __name__ == "__main__":
    _demo_main()
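
The bucketed token layout that NumericDecoder slices is fixed by numeric_vocab.json. A small sketch of how the offsets compose, mirroring the integrity check in __init__ (feature names here are invented):

# Groups sorted by n_in; each feature owns n_in consecutive tokens.
groups = [
    {"n_in": 1, "feature_names": ["ph", "clay_pct"]},  # tokens 0..1
    {"n_in": 4, "feature_names": ["depth_profile"]},   # tokens 2..5
]

offset = 0
group_token_offsets = {}
for g in groups:
    group_token_offsets[str(g["n_in"])] = offset
    offset += len(g["feature_names"]) * g["n_in"]

print(group_token_offsets)                # {'1': 0, '4': 2}
print("total_numeric_tokens =", offset)   # total_numeric_tokens = 6
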
modelling/embed_categorical.py
ADDED
@@ -0,0 +1,322 @@
| 1 |
+
# embed_categorical.py
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
Categorical embedding module for tabular transformer.
|
| 6 |
+
|
| 7 |
+
Design:
|
| 8 |
+
- Each categorical column = 1 token
|
| 9 |
+
- Value embedding: ONE global lookup table using (offset + local_id)
|
| 10 |
+
- ID embedding: ONE categorical column-ID embedding table
|
| 11 |
+
- Explicit col_id stored in cat_vocab.json (no implicit ordering assumptions)
|
| 12 |
+
|
| 13 |
+
Outputs:
|
| 14 |
+
local_ids [B,M] -> tokens [B,M,H]
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from dataclasses import dataclass
|
| 18 |
+
from typing import Dict, List, Optional, Tuple
|
| 19 |
+
|
| 20 |
+
import torch
|
| 21 |
+
import torch.nn as nn
|
| 22 |
+
|
| 23 |
+
from utils import load_json, save_json
|
| 24 |
+
|
| 25 |
+
SPECIAL_MASK = "__MASK__"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# ============================================================
|
| 29 |
+
# Meta → categorical column list
|
| 30 |
+
# ============================================================
|
| 31 |
+
|
| 32 |
+
def get_categorical_feature_names_from_meta(tabular_meta: Dict) -> List[str]:
|
| 33 |
+
"""
|
| 34 |
+
Deterministic ordering:
|
| 35 |
+
alphabetical by feature name.
|
| 36 |
+
"""
|
| 37 |
+
cols = []
|
| 38 |
+
for k, v in tabular_meta.items():
|
| 39 |
+
if v.get("dataclass") == "categorical" and not v.get("is_array_valued", False):
|
| 40 |
+
cols.append(k)
|
| 41 |
+
return sorted(cols)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# ============================================================
|
| 45 |
+
# Vocab spec
|
| 46 |
+
# ============================================================
|
| 47 |
+
|
| 48 |
+
@dataclass
|
| 49 |
+
class CatColSpec:
|
| 50 |
+
name: str
|
| 51 |
+
col_id: int
|
| 52 |
+
offset: int
|
| 53 |
+
num_classes: int
|
| 54 |
+
mask_local_id: int
|
| 55 |
+
label2id: Dict[str, int]
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def build_cat_vocab_spec_from_meta(
|
| 59 |
+
tabular_meta: Dict,
|
| 60 |
+
categorical_feature_names: List[str],
|
| 61 |
+
label_order: str = "alpha",
|
| 62 |
+
) -> Dict[str, CatColSpec]:
|
| 63 |
+
vocab: Dict[str, CatColSpec] = {}
|
| 64 |
+
|
| 65 |
+
offset = 0
|
| 66 |
+
for j, col in enumerate(categorical_feature_names):
|
| 67 |
+
info = tabular_meta[col]
|
| 68 |
+
class_stats = info.get("class_stats", {}) or {}
|
| 69 |
+
|
| 70 |
+
# deterministic label order
|
| 71 |
+
if label_order == "alpha":
|
| 72 |
+
labels = sorted(class_stats.keys())
|
| 73 |
+
elif label_order == "freq_desc":
|
| 74 |
+
labels = sorted(class_stats.keys(), key=lambda k: (-class_stats[k], k))
|
| 75 |
+
else:
|
| 76 |
+
raise ValueError("label_order must be alpha or freq_desc")
|
| 77 |
+
|
| 78 |
+
label2id = {lab: i for i, lab in enumerate(labels)}
|
| 79 |
+
|
| 80 |
+
mask_local_id = len(labels)
|
| 81 |
+
label2id[SPECIAL_MASK] = mask_local_id
|
| 82 |
+
|
| 83 |
+
spec = CatColSpec(
|
| 84 |
+
name=col,
|
| 85 |
+
col_id=j, # EXPLICIT categorical column id
|
| 86 |
+
offset=offset,
|
| 87 |
+
num_classes=mask_local_id + 1,
|
| 88 |
+
mask_local_id=mask_local_id,
|
| 89 |
+
label2id=label2id,
|
| 90 |
+
)
|
| 91 |
+
vocab[col] = spec
|
| 92 |
+
|
| 93 |
+
offset += spec.num_classes
|
| 94 |
+
|
| 95 |
+
return vocab
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def save_cat_vocab_json(vocab: Dict[str, CatColSpec], path: str) -> None:
|
| 99 |
+
out = {}
|
| 100 |
+
|
| 101 |
+
for col, spec in vocab.items():
|
| 102 |
+
out[col] = {
|
| 103 |
+
"col_id": spec.col_id,
|
| 104 |
+
"offset": spec.offset,
|
| 105 |
+
"num_classes": spec.num_classes,
|
| 106 |
+
"mask_local_id": spec.mask_local_id,
|
| 107 |
+
"global_id_start": spec.offset,
|
| 108 |
+
"global_id_end": spec.offset + spec.num_classes - 1,
|
| 109 |
+
"label2id": spec.label2id,
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
save_json(out, path)
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# ============================================================
|
| 116 |
+
# Embedding modules
|
| 117 |
+
# ============================================================
|
| 118 |
+
|
| 119 |
+
class CategoricalValueEmbedding(nn.Module):
|
| 120 |
+
"""
|
| 121 |
+
Global value embedding using offsets.
|
| 122 |
+
"""
|
| 123 |
+
|
| 124 |
+
def __init__(self, hidden_size: int, cat_vocab_json: str):
|
| 125 |
+
super().__init__()
|
| 126 |
+
|
| 127 |
+
spec = load_json(cat_vocab_json)
|
| 128 |
+
|
| 129 |
+
# sort by col_id to ensure consistent tensor layout
|
| 130 |
+
items = sorted(spec.items(), key=lambda x: x[1]["col_id"])
|
| 131 |
+
|
| 132 |
+
offsets = []
|
| 133 |
+
num_classes = []
|
| 134 |
+
col_ids = []
|
| 135 |
+
|
| 136 |
+
total_vocab = 0
|
| 137 |
+
|
| 138 |
+
for name, s in items:
|
| 139 |
+
offsets.append(int(s["offset"]))
|
| 140 |
+
num_classes.append(int(s["num_classes"]))
|
| 141 |
+
col_ids.append(int(s["col_id"]))
|
| 142 |
+
total_vocab = max(total_vocab, s["offset"] + s["num_classes"])
|
| 143 |
+
|
| 144 |
+
self.hidden_size = int(hidden_size)
|
| 145 |
+
self.total_vocab_size = int(total_vocab)
|
| 146 |
+
# Merge all classes to avoid many small nn.Embedding modules
|
| 147 |
+
self.emb = nn.Embedding(self.total_vocab_size, self.hidden_size)
|
| 148 |
+
|
| 149 |
+
self.register_buffer("offsets", torch.tensor(offsets, dtype=torch.long), persistent=True)
|
| 150 |
+
self.register_buffer("num_classes", torch.tensor(num_classes, dtype=torch.long), persistent=True)
|
| 151 |
+
self.register_buffer("col_ids", torch.tensor(col_ids, dtype=torch.long), persistent=True)
|
| 152 |
+
|
| 153 |
+
def init_weights(self, std=0.02):
|
| 154 |
+
nn.init.normal_(self.emb.weight, std=std)
|
| 155 |
+
|
| 156 |
+
def forward(self, local_ids: torch.LongTensor) -> torch.Tensor:
|
| 157 |
+
"""
|
| 158 |
+
local_ids: [B,M]
|
| 159 |
+
returns: [B,M,H]
|
| 160 |
+
"""
|
| 161 |
+
|
| 162 |
+
if local_ids.dim() != 2:
|
| 163 |
+
raise ValueError("local_ids must be [B,M]")
|
| 164 |
+
|
| 165 |
+
B, M = local_ids.shape
|
| 166 |
+
|
| 167 |
+
if M != self.offsets.numel():
|
| 168 |
+
raise ValueError("Column count mismatch")
|
| 169 |
+
|
| 170 |
+
if torch.any(local_ids < 0):
|
| 171 |
+
raise ValueError("Negative local_id")
|
| 172 |
+
|
| 173 |
+
nc = self.num_classes.view(1, M).expand(B, M)
|
| 174 |
+
if torch.any(local_ids >= nc):
|
| 175 |
+
raise ValueError("local_ids out of range")
|
| 176 |
+
|
| 177 |
+
gid = self.offsets.view(1, M) + local_ids
|
| 178 |
+
return self.emb(gid)
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
class CategoricalIdEmbedding(nn.Module):
|
| 182 |
+
"""
|
| 183 |
+
Explicit categorical column ID embedding.
|
| 184 |
+
"""
|
| 185 |
+
|
| 186 |
+
def __init__(self, hidden_size: int, cat_vocab_json: str):
|
| 187 |
+
super().__init__()
|
| 188 |
+
|
| 189 |
+
spec = load_json(cat_vocab_json)
|
| 190 |
+
items = sorted(spec.items(), key=lambda x: x[1]["col_id"])
|
| 191 |
+
|
| 192 |
+
col_ids = [s["col_id"] for _, s in items]
|
| 193 |
+
max_col_id = max(col_ids)
|
| 194 |
+
|
| 195 |
+
self.emb = nn.Embedding(max_col_id + 1, hidden_size)
|
| 196 |
+
|
| 197 |
+
self.register_buffer(
|
| 198 |
+
"cat_col_ids",
|
| 199 |
+
torch.tensor(col_ids, dtype=torch.long),
|
| 200 |
+
persistent=True,
|
| 201 |
+
)
|
| 202 |
+
|
| 203 |
+
self.hidden_size = hidden_size
|
| 204 |
+
|
| 205 |
+
def init_weights(self, std=0.02):
|
| 206 |
+
nn.init.normal_(self.emb.weight, std=std)
|
| 207 |
+
|
| 208 |
+
def forward(self, batch_size: int) -> torch.Tensor:
|
| 209 |
+
"""
|
| 210 |
+
returns [B,M,H]
|
| 211 |
+
"""
|
| 212 |
+
id_vec = self.emb(self.cat_col_ids) # [M,H]
|
| 213 |
+
return id_vec.view(1, -1, self.hidden_size).expand(batch_size, -1, -1)
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
class CategoricalEmbedding(nn.Module):
|
| 217 |
+
"""
|
| 218 |
+
token = value_embedding + categorical_id_embedding
|
| 219 |
+
"""
|
| 220 |
+
|
| 221 |
+
def __init__(self, hidden_size: int, cat_vocab_json: str):
|
| 222 |
+
super().__init__()
|
| 223 |
+
|
| 224 |
+
self.value_emb = CategoricalValueEmbedding(hidden_size, cat_vocab_json)
|
| 225 |
+
self.id_emb = CategoricalIdEmbedding(hidden_size, cat_vocab_json)
|
| 226 |
+
|
| 227 |
+
def init_weights(self, std=0.02):
|
| 228 |
+
self.value_emb.init_weights(std=std)
|
| 229 |
+
self.id_emb.init_weights(std=std)
|
| 230 |
+
|
| 231 |
+
def forward(
|
| 232 |
+
self,
|
| 233 |
+
local_ids: torch.LongTensor, # [B, M]
|
| 234 |
+
valid_positions: Optional[torch.Tensor] = None, # Bool [B,M] (True=valid) or indices [K,2]
|
| 235 |
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 236 |
+
"""
|
| 237 |
+
Returns:
|
| 238 |
+
tokens: [B, M, H]
|
| 239 |
+
token_mask: [B, M] (1=valid, 0=invalid)
|
| 240 |
+
"""
|
| 241 |
+
if local_ids.dim() != 2:
|
| 242 |
+
raise ValueError(f"local_ids must be [B,M], got {tuple(local_ids.shape)}")
|
| 243 |
+
B, M = local_ids.shape
|
| 244 |
+
|
| 245 |
+
tokens = self.value_emb(local_ids) + self.id_emb(B) # [B,M,H]
|
| 246 |
+
|
| 247 |
+
# Default: all tokens are valid
|
| 248 |
+
valid = torch.ones((B, M), dtype=torch.bool, device=local_ids.device)
|
| 249 |
+
|
| 250 |
+
if valid_positions is not None:
|
| 251 |
+
if valid_positions.dtype == torch.bool:
|
| 252 |
+
if valid_positions.shape != (B, M):
|
| 253 |
+
raise ValueError(
|
| 254 |
+
f"valid_positions (bool) must be [B,M]=({B}, {M}), got {tuple(valid_positions.shape)}")
|
| 255 |
+
valid = valid_positions.to(device=local_ids.device)
|
| 256 |
+
else:
|
| 257 |
+
# Optional: support index pairs [K,2] where each row is (b_idx, m_idx) for valid positions
|
| 258 |
+
if valid_positions.dim() != 2 or valid_positions.size(1) != 2:
|
| 259 |
+
raise ValueError("valid_positions (indices) must be [K,2] with (batch_idx, col_idx)")
|
| 260 |
+
valid = torch.zeros((B, M), dtype=torch.bool, device=local_ids.device)
|
| 261 |
+
b_idx = valid_positions[:, 0].to(device=local_ids.device, dtype=torch.long)
|
| 262 |
+
m_idx = valid_positions[:, 1].to(device=local_ids.device, dtype=torch.long)
|
| 263 |
+
valid[b_idx, m_idx] = True
|
| 264 |
+
|
| 265 |
+
# Token mask: 1=valid, 0=invalid
|
| 266 |
+
token_mask = valid.to(dtype=torch.long) # [B,M]
|
| 267 |
+
|
| 268 |
+
# This is WRONG: we should allow __MASK__ to attend other columns
|
| 269 |
+
# # Invalid tokens must not contribute
|
| 270 |
+
# invalid = ~valid
|
| 271 |
+
# if invalid.any():
|
| 272 |
+
# tokens = tokens.masked_fill(invalid.unsqueeze(-1), 0.0)
|
| 273 |
+
|
| 274 |
+
return tokens, token_mask
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
# ============================================================
|
| 278 |
+
# DEMO
|
| 279 |
+
# ============================================================
|
| 280 |
+
|
| 281 |
+
def _demo_main():
|
| 282 |
+
import argparse
|
| 283 |
+
|
| 284 |
+
parser = argparse.ArgumentParser()
|
| 285 |
+
parser.add_argument("--tabular_meta", type=str, default="data/tabular_meta.json")
|
| 286 |
+
parser.add_argument("--cat_vocab_json", type=str, default="data/cat_vocab.json")
|
| 287 |
+
parser.add_argument("--hidden_size", type=int, default=768)
|
| 288 |
+
parser.add_argument("--batch_size", type=int, default=4)
|
| 289 |
+
args = parser.parse_args()
|
| 290 |
+
|
| 291 |
+
tabular_meta = load_json(args.tabular_meta)
|
| 292 |
+
|
| 293 |
+
cat_names = get_categorical_feature_names_from_meta(tabular_meta)
|
| 294 |
+
print(f"Found {len(cat_names)} categorical columns")
|
| 295 |
+
|
| 296 |
+
vocab = build_cat_vocab_spec_from_meta(tabular_meta, cat_names)
|
| 297 |
+
save_cat_vocab_json(vocab, args.cat_vocab_json)
|
| 298 |
+
print(f"Saved vocab to {args.cat_vocab_json}")
|
| 299 |
+
|
| 300 |
+
model = CategoricalEmbedding(
|
| 301 |
+
hidden_size=args.hidden_size,
|
| 302 |
+
cat_vocab_json=args.cat_vocab_json,
|
| 303 |
+
)
|
| 304 |
+
total_params = sum(p.numel() for p in model.parameters())
|
| 305 |
+
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 306 |
+
print(f"Total parameters (CategoricalEmbedding): {total_params:,} (trainable: {trainable_params:,})")
|
| 307 |
+
|
| 308 |
+
B = args.batch_size
|
| 309 |
+
M = len(cat_names)
|
| 310 |
+
|
| 311 |
+
local_ids = torch.zeros((B, M), dtype=torch.long)
|
| 312 |
+
|
| 313 |
+
with torch.no_grad():
|
| 314 |
+
out, mask = model(local_ids)
|
| 315 |
+
|
| 316 |
+
print("local_ids:", tuple(local_ids.shape))
|
| 317 |
+
print("output:", tuple(out.shape)) # [B,M,H]
|
| 318 |
+
print("mask:", tuple(mask.shape)) # [B,M]
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
if __name__ == "__main__":
|
| 322 |
+
_demo_main()
|
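As a quick illustration of the two `valid_positions` formats accepted by `CategoricalEmbedding.forward` above, here is a minimal sketch (sizes are made up) showing that the `[K,2]` index-pair form is just the `nonzero()` of the boolean form, and that the module's scatter logic round-trips it back:

```python
import torch

B, M = 4, 6  # illustrative batch size and number of categorical columns

# Boolean form: [B, M], True = observed
valid_bool = torch.ones((B, M), dtype=torch.bool)
valid_bool[0, 2] = False  # column 2 of sample 0 is missing

# Index-pair form: [K, 2], rows of (batch_idx, col_idx) for the VALID cells
valid_pairs = valid_bool.nonzero()

# Rebuild the boolean mask the same way the module does internally
rebuilt = torch.zeros((B, M), dtype=torch.bool)
rebuilt[valid_pairs[:, 0], valid_pairs[:, 1]] = True
assert torch.equal(rebuilt, valid_bool)
```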
modelling/embed_numeric.py
ADDED
|
@@ -0,0 +1,547 @@
|
| 1 |
+
# embed_numeric.py
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
Numeric embedding module for tabular transformer.
|
| 6 |
+
|
| 7 |
+
Updates in this version:
|
| 8 |
+
- numeric_vocab.json now includes:
|
| 9 |
+
- total_numeric_tokens
|
| 10 |
+
- group_token_offsets (by n_in)
|
| 11 |
+
- _demo_main prints the total parameter count
|
| 12 |
+
|
| 13 |
+
Design:
|
| 14 |
+
- scalar numeric (n_in=1): 1 token
|
| 15 |
+
- vector numeric (n_in=L): L tokens
|
| 16 |
+
- per bucket (same n_in): GroupedMLP with per-column weights (no for-loop over columns)
|
| 17 |
+
input : [B, V, n_in]
|
| 18 |
+
output : [B, V*n_in, H]
|
| 19 |
+
- middle_size:
|
| 20 |
+
- None: 1-layer
|
| 21 |
+
- int : 2-layer (Linear -> GELU -> Linear)
|
| 22 |
+
- NumericIdEmbedding:
|
| 23 |
+
- per numeric column id embedding [H]
|
| 24 |
+
- broadcast across that column's n_in tokens
|
| 25 |
+
"""
|
| 26 |
+
|
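GroupedMLP is imported from modelling/utils.py and is not part of this hunk, so the stand-in below is only a rough mental model of the one-layer case: a per-column linear applied to all columns in parallel via a single einsum, which is what makes the "no for-loop over columns" claim above possible. The class name, init, and bias handling are illustrative, not the shipped implementation:

```python
import torch
import torch.nn as nn

class GroupedLinearSketch(nn.Module):
    """Illustrative per-column linear: x [B, n_var, n_in] -> y [B, n_var, n_out]."""

    def __init__(self, n_var: int, n_in: int, n_out: int):
        super().__init__()
        # one independent weight matrix (and bias) per column
        self.weight = nn.Parameter(torch.randn(n_var, n_in, n_out) * 0.02)
        self.bias = nn.Parameter(torch.zeros(n_var, n_out))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # batched matmul over the column axis, no Python loop
        return torch.einsum("bvi,vio->bvo", x, self.weight) + self.bias

y = GroupedLinearSketch(n_var=5, n_in=3, n_out=3 * 768)(torch.randn(2, 5, 3))
print(y.shape)  # torch.Size([2, 5, 2304]) -> reshaped downstream to [2, 5*3, 768] tokens
```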
| 27 |
+
from dataclasses import dataclass
|
| 28 |
+
from typing import Dict, List, Optional, Tuple
|
| 29 |
+
|
| 30 |
+
import torch
|
| 31 |
+
import torch.nn as nn
|
| 32 |
+
|
| 33 |
+
from utils import load_json, save_json, GroupedMLP
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# ============================================================
|
| 37 |
+
# Meta parsing
|
| 38 |
+
# ============================================================
|
| 39 |
+
|
| 40 |
+
def infer_n_in_from_meta_item(info: Dict) -> int:
|
| 41 |
+
return int(info["array_length"]) if info["is_array_valued"] else 1
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def get_numeric_feature_names_and_dims_from_meta(tabular_meta: Dict) -> List[Tuple[str, int]]:
|
| 45 |
+
"""
|
| 46 |
+
Return list of (feature_name, n_in) for numeric features.
|
| 47 |
+
|
| 48 |
+
Heuristic:
|
| 49 |
+
- info['dataclass'] == 'numeric' is treated as numeric.
|
| 50 |
+
"""
|
| 51 |
+
out: List[Tuple[str, int]] = []
|
| 52 |
+
for name, info in tabular_meta.items():
|
| 53 |
+
if info.get("dataclass") != "numeric":
|
| 54 |
+
continue
|
| 55 |
+
n_in = infer_n_in_from_meta_item(info)
|
| 56 |
+
out.append((name, n_in))
|
| 57 |
+
# deterministic: group by n_in then name
|
| 58 |
+
out.sort(key=lambda x: (x[1], x[0]))
|
| 59 |
+
return out
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# ============================================================
|
| 63 |
+
# Vocab/spec building
|
| 64 |
+
# ============================================================
|
| 65 |
+
|
| 66 |
+
@dataclass
|
| 67 |
+
class NumColSpec:
|
| 68 |
+
name: str
|
| 69 |
+
col_id: int
|
| 70 |
+
n_in: int
|
| 71 |
+
group_index: int
|
| 72 |
+
index_within_group: int
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def build_numeric_vocab_spec_from_meta(tabular_meta: Dict) -> Dict:
|
| 76 |
+
"""
|
| 77 |
+
Build numeric_vocab.json dict.
|
| 78 |
+
|
| 79 |
+
Output keys:
|
| 80 |
+
- ordered_feature_names
|
| 81 |
+
- features[name] = {col_id, n_in, group_index, index_within_group}
|
| 82 |
+
- groups = [{n_in, feature_names}, ...] sorted by n_in asc
|
| 83 |
+
- total_numeric_tokens
|
| 84 |
+
- group_token_offsets: { "<n_in>": <start_token_index> }
|
| 85 |
+
token order is groups by n_in asc, within group by feature name
|
| 86 |
+
"""
|
| 87 |
+
feats = get_numeric_feature_names_and_dims_from_meta(tabular_meta)
|
| 88 |
+
if not feats:
|
| 89 |
+
raise ValueError("No numeric features found (dataclass=='numeric').")
|
| 90 |
+
|
| 91 |
+
# group by n_in
|
| 92 |
+
groups_map: Dict[int, List[str]] = {}
|
| 93 |
+
for name, n_in in feats:
|
| 94 |
+
groups_map.setdefault(n_in, []).append(name)
|
| 95 |
+
|
| 96 |
+
for n_in in groups_map:
|
| 97 |
+
groups_map[n_in] = sorted(groups_map[n_in])
|
| 98 |
+
|
| 99 |
+
group_nins = sorted(groups_map.keys())
|
| 100 |
+
|
| 101 |
+
groups: List[Dict] = []
|
| 102 |
+
ordered_feature_names: List[str] = []
|
| 103 |
+
|
| 104 |
+
for n_in in group_nins:
|
| 105 |
+
names = groups_map[n_in]
|
| 106 |
+
groups.append({"n_in": int(n_in), "feature_names": names})
|
| 107 |
+
ordered_feature_names.extend(names)
|
| 108 |
+
|
| 109 |
+
# build per-feature mapping
|
| 110 |
+
name_to_group: Dict[str, Tuple[int, int]] = {}
|
| 111 |
+
for gi, g in enumerate(groups):
|
| 112 |
+
for idx, nm in enumerate(g["feature_names"]):
|
| 113 |
+
name_to_group[nm] = (gi, idx)
|
| 114 |
+
|
| 115 |
+
features: Dict[str, Dict] = {}
|
| 116 |
+
for col_id, nm in enumerate(ordered_feature_names):
|
| 117 |
+
gi, idx = name_to_group[nm]
|
| 118 |
+
n_in = int(groups[gi]["n_in"])
|
| 119 |
+
features[nm] = {
|
| 120 |
+
"col_id": int(col_id),
|
| 121 |
+
"n_in": int(n_in),
|
| 122 |
+
"group_index": int(gi),
|
| 123 |
+
"index_within_group": int(idx),
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
# total tokens + group token offsets
|
| 127 |
+
total_numeric_tokens = 0
|
| 128 |
+
group_token_offsets: Dict[str, int] = {}
|
| 129 |
+
running = 0
|
| 130 |
+
for g in groups:
|
| 131 |
+
n_in = int(g["n_in"])
|
| 132 |
+
group_token_offsets[str(n_in)] = int(running)
|
| 133 |
+
V = len(g["feature_names"])
|
| 134 |
+
running += V * n_in
|
| 135 |
+
total_numeric_tokens += V * n_in
|
| 136 |
+
|
| 137 |
+
spec = {
|
| 138 |
+
"ordered_feature_names": ordered_feature_names,
|
| 139 |
+
"features": features,
|
| 140 |
+
"groups": groups,
|
| 141 |
+
"total_numeric_tokens": int(total_numeric_tokens),
|
| 142 |
+
"group_token_offsets": group_token_offsets, # keys are strings to be JSON-friendly
|
| 143 |
+
}
|
| 144 |
+
return spec
|
| 145 |
+
|
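To make the bookkeeping concrete, here is the builder applied to a tiny hand-written meta dict (the column names are invented for illustration):

```python
toy_meta = {
    "ph":        {"dataclass": "numeric", "is_array_valued": False},
    "clay_pct":  {"dataclass": "numeric", "is_array_valued": False},
    "spectrum":  {"dataclass": "numeric", "is_array_valued": True, "array_length": 3},
    "soil_type": {"dataclass": "categorical"},  # skipped: not numeric
}
spec = build_numeric_vocab_spec_from_meta(toy_meta)

# groups: n_in=1 -> ["clay_pct", "ph"]; n_in=3 -> ["spectrum"]
# total_numeric_tokens = 2*1 + 1*3 = 5
# group_token_offsets  = {"1": 0, "3": 2}
print(spec["total_numeric_tokens"], spec["group_token_offsets"])
```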
| 146 |
+
|
| 147 |
+
# ============================================================
|
| 148 |
+
# Core modules
|
| 149 |
+
# ============================================================
|
| 150 |
+
|
| 151 |
+
class NumericIdEmbedding(nn.Module):
|
| 152 |
+
"""
|
| 153 |
+
Per-numeric-column ID embedding in the GLOBAL numeric namespace.
|
| 154 |
+
Broadcast each global column id vector across its n_in tokens.
|
| 155 |
+
"""
|
| 156 |
+
|
| 157 |
+
def __init__(self, num_numeric_cols: int, hidden_size: int):
|
| 158 |
+
super().__init__()
|
| 159 |
+
self.num_numeric_cols = int(num_numeric_cols)
|
| 160 |
+
self.hidden_size = int(hidden_size)
|
| 161 |
+
self.emb = nn.Embedding(self.num_numeric_cols, self.hidden_size)
|
| 162 |
+
|
| 163 |
+
def forward(self, global_col_ids: torch.LongTensor, batch_size: int, n_in: int) -> torch.Tensor:
|
| 164 |
+
"""
|
| 165 |
+
global_col_ids: [V] in global numeric namespace
|
| 166 |
+
returns: [B, V*n_in, H]
|
| 167 |
+
"""
|
| 168 |
+
if global_col_ids.dim() != 1:
|
| 169 |
+
raise ValueError(f"global_col_ids must be [V], got {tuple(global_col_ids.shape)}")
|
| 170 |
+
|
| 171 |
+
V = global_col_ids.numel()
|
| 172 |
+
n_in = int(n_in)
|
| 173 |
+
|
| 174 |
+
id_vec = self.emb(global_col_ids) # [V, H]
|
| 175 |
+
id_vec = id_vec.view(1, V, 1, self.hidden_size).expand(batch_size, V, n_in, self.hidden_size)
|
| 176 |
+
return id_vec.reshape(batch_size, V * n_in, self.hidden_size)
|
| 177 |
+
|
| 178 |
+
def init_weights(self, std: float = 0.02):
|
| 179 |
+
nn.init.normal_(self.emb.weight, std=std)
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
class NumericMaskEmbedding(nn.Module):
|
| 183 |
+
"""
|
| 184 |
+
Per-bucket numeric mask embedding.
|
| 185 |
+
Local to one (n_in) group / bucket.
|
| 186 |
+
|
| 187 |
+
Parameter shape:
|
| 188 |
+
[num_bucket_cols, n_in, H]
|
| 189 |
+
|
| 190 |
+
So missing numeric columns are represented by:
|
| 191 |
+
(bucket-local column index, sub-token index)
|
| 192 |
+
"""
|
| 193 |
+
|
| 194 |
+
def __init__(self, num_bucket_cols: int, n_in: int, hidden_size: int):
|
| 195 |
+
super().__init__()
|
| 196 |
+
self.num_bucket_cols = int(num_bucket_cols)
|
| 197 |
+
self.n_in = int(n_in)
|
| 198 |
+
self.hidden_size = int(hidden_size)
|
| 199 |
+
|
| 200 |
+
self.emb = nn.Parameter(
|
| 201 |
+
torch.empty(self.num_bucket_cols, self.n_in, self.hidden_size)
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
def forward(self, local_col_ids: torch.LongTensor, batch_size: int) -> torch.Tensor:
|
| 205 |
+
"""
|
| 206 |
+
local_col_ids: [V] bucket-local ids, usually 0 to V-1
|
| 207 |
+
returns: [B, V*n_in, H]
|
| 208 |
+
"""
|
| 209 |
+
if local_col_ids.dim() != 1:
|
| 210 |
+
raise ValueError(f"local_col_ids must be [V], got {tuple(local_col_ids.shape)}")
|
| 211 |
+
|
| 212 |
+
V = local_col_ids.numel()
|
| 213 |
+
mask_vec = self.emb[local_col_ids] # [V, n_in, H]
|
| 214 |
+
mask_vec = mask_vec.unsqueeze(0).expand(batch_size, V, self.n_in, self.hidden_size)
|
| 215 |
+
return mask_vec.reshape(batch_size, V * self.n_in, self.hidden_size)
|
| 216 |
+
|
| 217 |
+
def init_weights(self, std: float = 0.02):
|
| 218 |
+
nn.init.normal_(self.emb, std=std)
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
class NumericEmbedding(nn.Module):
|
| 222 |
+
"""
|
| 223 |
+
Full numeric embedding for all numeric columns described by numeric_vocab.json.
|
| 224 |
+
|
| 225 |
+
Forward expects bucketed input:
|
| 226 |
+
values_by_nin: { n_in: x[B, V, n_in] }
|
| 227 |
+
where V must match the feature count and order of that n_in group.
|
| 228 |
+
|
| 229 |
+
Output token ordering:
|
| 230 |
+
groups by n_in ascending (as stored in spec["groups"]),
|
| 231 |
+
within each group by feature_names order.
|
| 232 |
+
"""
|
| 233 |
+
|
| 234 |
+
def __init__(self, hidden_size: int, numeric_vocab_json: str, middle_size: Optional[int] = None):
|
| 235 |
+
super().__init__()
|
| 236 |
+
self.hidden_size = int(hidden_size)
|
| 237 |
+
self.middle_size = None if middle_size is None else int(middle_size)
|
| 238 |
+
|
| 239 |
+
spec = load_json(numeric_vocab_json)
|
| 240 |
+
self.ordered_feature_names: List[str] = list(spec["ordered_feature_names"])
|
| 241 |
+
self.features: Dict[str, Dict] = dict(spec["features"])
|
| 242 |
+
self.groups: List[Dict] = list(spec["groups"])
|
| 243 |
+
self.total_numeric_tokens = int(spec.get("total_numeric_tokens", -1))
|
| 244 |
+
|
| 245 |
+
num_cols = len(self.ordered_feature_names)
|
| 246 |
+
|
| 247 |
+
# Global numeric namespace id embedding
|
| 248 |
+
self.id_emb = NumericIdEmbedding(
|
| 249 |
+
num_numeric_cols=num_cols,
|
| 250 |
+
hidden_size=self.hidden_size,
|
| 251 |
+
)
|
| 252 |
+
|
| 253 |
+
# Per-group mask embedding
|
| 254 |
+
self.mask_emb = nn.ModuleDict()
|
| 255 |
+
|
| 256 |
+
# Per-group value embedding
|
| 257 |
+
self.group_mlps = nn.ModuleList()
|
| 258 |
+
|
| 259 |
+
self.group_nins: List[int] = []
|
| 260 |
+
self._num_groups = len(self.groups)
|
| 261 |
+
|
| 262 |
+
# Optional: useful for debugging / downstream checks
|
| 263 |
+
self.group_sizes: List[int] = []
|
| 264 |
+
|
| 265 |
+
# Build one block per group
|
| 266 |
+
for gi, g in enumerate(self.groups):
|
| 267 |
+
n_in = int(g["n_in"])
|
| 268 |
+
names = list(g["feature_names"])
|
| 269 |
+
V = len(names)
|
| 270 |
+
|
| 271 |
+
self.group_nins.append(n_in)
|
| 272 |
+
self.group_sizes.append(V)
|
| 273 |
+
|
| 274 |
+
# ---- spec consistency check
|
| 275 |
+
# group_index and index_within_group in features must match groups[gi]["feature_names"] order
|
| 276 |
+
local_ids = []
|
| 277 |
+
for local_idx, nm in enumerate(names):
|
| 278 |
+
f = self.features[nm]
|
| 279 |
+
|
| 280 |
+
if int(f["group_index"]) != gi:
|
| 281 |
+
raise ValueError(
|
| 282 |
+
f"Feature {nm} has group_index={f['group_index']}, expected {gi}"
|
| 283 |
+
)
|
| 284 |
+
if int(f["n_in"]) != n_in:
|
| 285 |
+
raise ValueError(
|
| 286 |
+
f"Feature {nm} has n_in={f['n_in']}, expected {n_in}"
|
| 287 |
+
)
|
| 288 |
+
if int(f["index_within_group"]) != local_idx:
|
| 289 |
+
raise ValueError(
|
| 290 |
+
f"Feature {nm} has index_within_group={f['index_within_group']}, expected {local_idx}"
|
| 291 |
+
)
|
| 292 |
+
|
| 293 |
+
local_ids.append(int(f["index_within_group"]))
|
| 294 |
+
|
| 295 |
+
# strict check: local ids must be exactly 0 to V-1 with no gap / no duplicate
|
| 296 |
+
if sorted(local_ids) != list(range(V)):
|
| 297 |
+
raise ValueError(
|
| 298 |
+
f"Group gi={gi}, n_in={n_in} has invalid index_within_group set: "
|
| 299 |
+
f"got {sorted(local_ids)}, expected {list(range(V))}"
|
| 300 |
+
)
|
| 301 |
+
|
| 302 |
+
# ---- observed value path: bucket-local ordering
|
| 303 |
+
self.group_mlps.append(
|
| 304 |
+
GroupedMLP(
|
| 305 |
+
n_var=V,
|
| 306 |
+
n_in=n_in,
|
| 307 |
+
n_out=n_in * self.hidden_size,
|
| 308 |
+
middle_size=self.middle_size,
|
| 309 |
+
)
|
| 310 |
+
)
|
| 311 |
+
|
| 312 |
+
# ---- global ids for NumericIdEmbedding
|
| 313 |
+
global_col_ids = [int(self.features[nm]["col_id"]) for nm in names]
|
| 314 |
+
self.register_buffer(
|
| 315 |
+
f"group_global_col_ids_{gi}",
|
| 316 |
+
torch.tensor(global_col_ids, dtype=torch.long),
|
| 317 |
+
persistent=True,
|
| 318 |
+
)
|
| 319 |
+
|
| 320 |
+
# ---- local ids for NumericMaskEmbedding
|
| 321 |
+
local_col_ids = [int(self.features[nm]["index_within_group"]) for nm in names]
|
| 322 |
+
self.register_buffer(
|
| 323 |
+
f"group_local_col_ids_{gi}",
|
| 324 |
+
torch.tensor(local_col_ids, dtype=torch.long),
|
| 325 |
+
persistent=True,
|
| 326 |
+
)
|
| 327 |
+
|
| 328 |
+
# one mask embedding per bucket
|
| 329 |
+
self.mask_emb[str(n_in)] = NumericMaskEmbedding(
|
| 330 |
+
num_bucket_cols=V,
|
| 331 |
+
n_in=n_in,
|
| 332 |
+
hidden_size=self.hidden_size,
|
| 333 |
+
)
|
| 334 |
+
|
| 335 |
+
if self.total_numeric_tokens < 0:
|
| 336 |
+
self.total_numeric_tokens = sum(
|
| 337 |
+
len(g["feature_names"]) * int(g["n_in"]) for g in self.groups
|
| 338 |
+
)
|
| 339 |
+
|
| 340 |
+
def init_weights(self, std: float = 0.02):
|
| 341 |
+
self.id_emb.init_weights(std=std)
|
| 342 |
+
|
| 343 |
+
for _, mask_mod in self.mask_emb.items():
|
| 344 |
+
mask_mod.init_weights(std=std)
|
| 345 |
+
|
| 346 |
+
for mlp in self.group_mlps:
|
| 347 |
+
mlp.init_weights(std=std)
|
| 348 |
+
|
| 349 |
+
def forward(
|
| 350 |
+
self,
|
| 351 |
+
values_by_nin: Dict[int, torch.Tensor],
|
| 352 |
+
valid_positions_by_nin: Optional[Dict[int, torch.Tensor]] = None,
|
| 353 |
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 354 |
+
"""
|
| 355 |
+
Args:
|
| 356 |
+
values_by_nin:
|
| 357 |
+
{ n_in: x } where x is [B, V, n_in]
|
| 358 |
+
Missing numeric values are assumed to have been filled in x already.
|
| 359 |
+
|
| 360 |
+
valid_positions_by_nin (optional):
|
| 361 |
+
{ n_in: valid_cols } where valid_cols is BoolTensor [B, V]
|
| 362 |
+
True means this COLUMN is observed/valid.
|
| 363 |
+
|
| 364 |
+
Note:
|
| 365 |
+
This is COLUMN-level mask, not token-level.
|
| 366 |
+
It is expanded to token-level by repeating across n_in.
|
| 367 |
+
|
| 368 |
+
Returns:
|
| 369 |
+
tokens: [B, total_numeric_tokens, H]
|
| 370 |
+
token_mask: [B, total_numeric_tokens] (1=valid, 0=missing)
|
| 371 |
+
"""
|
| 372 |
+
outs = []
|
| 373 |
+
masks = []
|
| 374 |
+
batch_size = None
|
| 375 |
+
|
| 376 |
+
for gi, n_in in enumerate(self.group_nins):
|
| 377 |
+
if n_in not in values_by_nin:
|
| 378 |
+
raise KeyError(f"Missing bucket input for n_in={n_in}")
|
| 379 |
+
|
| 380 |
+
x = values_by_nin[n_in] # [B, V, n_in]
|
| 381 |
+
if x.dim() != 3 or x.size(-1) != n_in:
|
| 382 |
+
raise ValueError(f"Bucket n_in={n_in} expects x [B,V,{n_in}], got {tuple(x.shape)}")
|
| 383 |
+
|
| 384 |
+
if batch_size is None:
|
| 385 |
+
batch_size = x.size(0)
|
| 386 |
+
elif x.size(0) != batch_size:
|
| 387 |
+
raise ValueError("All buckets must share the same batch size")
|
| 388 |
+
|
| 389 |
+
B, V, _ = x.shape
|
| 390 |
+
|
| 391 |
+
expected_V = self.group_sizes[gi]
|
| 392 |
+
if V != expected_V:
|
| 393 |
+
raise ValueError(
|
| 394 |
+
f"Bucket n_in={n_in} expects V={expected_V}, got V={V}"
|
| 395 |
+
)
|
| 396 |
+
|
| 397 |
+
# column-level valid mask [B, V]
|
| 398 |
+
if valid_positions_by_nin is None:
|
| 399 |
+
valid_cols = torch.ones((B, V), dtype=torch.bool, device=x.device)
|
| 400 |
+
else:
|
| 401 |
+
if n_in not in valid_positions_by_nin:
|
| 402 |
+
raise KeyError(f"Missing valid mask for bucket n_in={n_in}")
|
| 403 |
+
|
| 404 |
+
valid_cols = valid_positions_by_nin[n_in]
|
| 405 |
+
if valid_cols.dtype != torch.bool:
|
| 406 |
+
raise ValueError(
|
| 407 |
+
f"valid_positions_by_nin[{n_in}] must be bool tensor, got {valid_cols.dtype}"
|
| 408 |
+
)
|
| 409 |
+
if valid_cols.shape != (B, V):
|
| 410 |
+
raise ValueError(
|
| 411 |
+
f"valid_positions_by_nin[{n_in}] must be [B,V]=[{B},{V}], got {tuple(valid_cols.shape)}"
|
| 412 |
+
)
|
| 413 |
+
valid_cols = valid_cols.to(device=x.device)
|
| 414 |
+
|
| 415 |
+
# ---- observed numeric value embedding
|
| 416 |
+
mlp = self.group_mlps[gi]
|
| 417 |
+
param = next(mlp.parameters())
|
| 418 |
+
x = x.to(device=param.device, dtype=param.dtype)
|
| 419 |
+
|
| 420 |
+
# [B, V, n_in] -> [B, V, n_in*H]
|
| 421 |
+
y = mlp(x)
|
| 422 |
+
|
| 423 |
+
# [B, V, n_in*H] -> [B, V*n_in, H]
|
| 424 |
+
y_tok = y.view(B, V, n_in, self.hidden_size).reshape(B, V * n_in, self.hidden_size)
|
| 425 |
+
|
| 426 |
+
# [B, V] -> [B, V*n_in]
|
| 427 |
+
valid_tok = valid_cols.unsqueeze(-1).expand(B, V, n_in).reshape(B, V * n_in)
|
| 428 |
+
|
| 429 |
+
# ---- missing replacement: bucket-local mask embedding
|
| 430 |
+
local_col_ids = getattr(self, f"group_local_col_ids_{gi}") # [V]
|
| 431 |
+
mask_tok = self.mask_emb[str(n_in)](local_col_ids, batch_size=B)
|
| 432 |
+
|
| 433 |
+
if (~valid_tok).any():
|
| 434 |
+
y_tok = torch.where(
|
| 435 |
+
valid_tok.unsqueeze(-1),
|
| 436 |
+
y_tok,
|
| 437 |
+
mask_tok,
|
| 438 |
+
)
|
| 439 |
+
|
| 440 |
+
# ---- add global numeric column id embedding
|
| 441 |
+
global_col_ids = getattr(self, f"group_global_col_ids_{gi}") # [V]
|
| 442 |
+
y_tok = y_tok + self.id_emb(global_col_ids, batch_size=B, n_in=n_in)
|
| 443 |
+
|
| 444 |
+
token_mask = valid_tok.to(dtype=torch.long)
|
| 445 |
+
|
| 446 |
+
outs.append(y_tok)
|
| 447 |
+
masks.append(token_mask)
|
| 448 |
+
|
| 449 |
+
tokens = torch.cat(outs, dim=1)
|
| 450 |
+
token_mask = torch.cat(masks, dim=1)
|
| 451 |
+
|
| 452 |
+
if token_mask.shape[:2] != tokens.shape[:2]:
|
| 453 |
+
raise RuntimeError("token_mask shape mismatch with tokens")
|
| 454 |
+
|
| 455 |
+
return tokens, token_mask
|
| 456 |
+
|
| 457 |
+
|
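Before the demo below, it may help to see how the bucketed input could be assembled from ordinary per-column tensors. This is a minimal sketch, assuming each column arrives as a float tensor of shape [B] or [B, n_in] keyed by feature name; the helper and its name are illustrative, not part of the repo:

```python
import torch

def build_buckets(spec, columns, batch_size):
    """columns: {name: FloatTensor [B] or [B, n_in]} -> {n_in: [B, V, n_in]} in spec order."""
    values_by_nin = {}
    for g in spec["groups"]:
        n_in = int(g["n_in"])
        # the feature order inside each group defines the V axis
        stacked = [columns[name].reshape(batch_size, n_in) for name in g["feature_names"]]
        values_by_nin[n_in] = torch.stack(stacked, dim=1)  # [B, V, n_in]
    return values_by_nin
```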
| 458 |
+
# ============================================================
|
| 459 |
+
# DEMO
|
| 460 |
+
# ============================================================
|
| 461 |
+
|
| 462 |
+
def _demo_main():
|
| 463 |
+
import argparse
|
| 464 |
+
|
| 465 |
+
parser = argparse.ArgumentParser()
|
| 466 |
+
parser.add_argument("--tabular_meta", type=str, default="data/tabular_meta.json")
|
| 467 |
+
parser.add_argument("--numeric_vocab_json", type=str, default="data/numeric_vocab.json")
|
| 468 |
+
parser.add_argument("--hidden_size", type=int, default=768)
|
| 469 |
+
parser.add_argument("--middle_size", type=int, default=-1,
|
| 470 |
+
help="If <0 -> one-layer. If >=0 -> two-layer with this middle size.")
|
| 471 |
+
parser.add_argument("--batch_size", type=int, default=4)
|
| 472 |
+
parser.add_argument("--device", type=str, default=None)
|
| 473 |
+
parser.add_argument("--dtype", type=str, default="float32", choices=["float16", "bfloat16", "float32"])
|
| 474 |
+
args = parser.parse_args()
|
| 475 |
+
|
| 476 |
+
device = torch.device(args.device or ("cuda" if torch.cuda.is_available() else "cpu"))
|
| 477 |
+
dtype_map = {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}
|
| 478 |
+
dtype = dtype_map[args.dtype]
|
| 479 |
+
|
| 480 |
+
meta = load_json(args.tabular_meta)
|
| 481 |
+
|
| 482 |
+
spec = build_numeric_vocab_spec_from_meta(meta)
|
| 483 |
+
save_json(spec, args.numeric_vocab_json)
|
| 484 |
+
print(f"Saved numeric vocab spec to: {args.numeric_vocab_json}")
|
| 485 |
+
print(f"Groups (n_in -> V):", {g["n_in"]: len(g["feature_names"]) for g in spec["groups"]})
|
| 486 |
+
print("total_numeric_tokens:", spec["total_numeric_tokens"])
|
| 487 |
+
print("group_token_offsets:", spec["group_token_offsets"])
|
| 488 |
+
|
| 489 |
+
middle_size = None if args.middle_size < 0 else int(args.middle_size)
|
| 490 |
+
model = NumericEmbedding(
|
| 491 |
+
hidden_size=args.hidden_size,
|
| 492 |
+
numeric_vocab_json=args.numeric_vocab_json,
|
| 493 |
+
middle_size=middle_size,
|
| 494 |
+
).to(device=device, dtype=dtype)
|
| 495 |
+
model.init_weights()
|
| 496 |
+
model.eval()
|
| 497 |
+
|
| 498 |
+
total_params = sum(p.numel() for p in model.parameters())
|
| 499 |
+
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 500 |
+
print(f"Total parameters (NumericEmbedding): {total_params:,} (trainable: {trainable_params:,})")
|
| 501 |
+
|
| 502 |
+
# create demo inputs bucketed by n_in
|
| 503 |
+
B = args.batch_size
|
| 504 |
+
values_by_nin: Dict[int, torch.Tensor] = {}
|
| 505 |
+
valid_positions_by_nin: Dict[int, torch.Tensor] = {}
|
| 506 |
+
|
| 507 |
+
for g in spec["groups"]:
|
| 508 |
+
n_in = int(g["n_in"])
|
| 509 |
+
V = len(g["feature_names"])
|
| 510 |
+
|
| 511 |
+
# random numeric inputs
|
| 512 |
+
x = torch.randn(B, V, n_in, device=device, dtype=dtype)
|
| 513 |
+
values_by_nin[n_in] = x
|
| 514 |
+
|
| 515 |
+
# Build valid mask (column-level)
|
| 516 |
+
# shape: [B, V], True = valid
|
| 517 |
+
valid_cols = torch.ones((B, V), dtype=torch.bool, device=device)
|
| 518 |
+
|
| 519 |
+
# Mark first sample's first 2 columns as invalid
|
| 520 |
+
num_to_invalidate = min(2, V)
|
| 521 |
+
valid_cols[0, :num_to_invalidate] = False
|
| 522 |
+
|
| 523 |
+
valid_positions_by_nin[n_in] = valid_cols
|
| 524 |
+
|
| 525 |
+
with torch.no_grad():
|
| 526 |
+
out, mask = model(values_by_nin, valid_positions_by_nin)
|
| 527 |
+
|
| 528 |
+
print("Buckets:", {k: tuple(v.shape) for k, v in values_by_nin.items()})
|
| 529 |
+
print("Output tokens:", tuple(out.shape), out.dtype, out.device) # [B, total_numeric_tokens, H]
|
| 530 |
+
print("Masks:", tuple(mask.shape), mask.dtype, mask.device) # [B, total_numeric_tokens]
|
| 531 |
+
|
| 532 |
+
# ---- Inspect first sample
|
| 533 |
+
print("\nFirst sample mask (first 5 tokens):")
|
| 534 |
+
print(mask[0, :5])
|
| 535 |
+
|
| 536 |
+
print("\nFirst sample token L2 norms (first 5 tokens):")
|
| 537 |
+
print(out[0, :5].norm(dim=-1))
|
| 538 |
+
|
| 539 |
+
print("\nSecond sample mask (first 5 tokens):")
|
| 540 |
+
print(mask[1, :5])
|
| 541 |
+
|
| 542 |
+
print("\nSecond sample token L2 norms (first 5 tokens):")
|
| 543 |
+
print(out[1, :5].norm(dim=-1))
|
| 544 |
+
|
| 545 |
+
|
| 546 |
+
if __name__ == "__main__":
|
| 547 |
+
_demo_main()
|
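Since the token order is fixed (groups by n_in ascending, features by name within each group), the saved spec is enough to locate any numeric feature's tokens in the concatenated [B, total_numeric_tokens, H] output. The helper below is hypothetical but follows directly from how the offsets are built above:

```python
def numeric_token_range(spec, name):
    """Return (start, end) token indices of feature `name` in the concatenated output."""
    f = spec["features"][name]
    n_in = int(f["n_in"])
    start = int(spec["group_token_offsets"][str(n_in)]) + int(f["index_within_group"]) * n_in
    return start, start + n_in  # end is exclusive

# e.g. with the toy spec above: numeric_token_range(spec, "spectrum") == (2, 5)
```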
modelling/embed_vision_gemma3n.py
ADDED
|
@@ -0,0 +1,552 @@
|
| 1 |
+
# embed_vision_gemma3n.py
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
from typing import Optional, Tuple, Dict
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn as nn
|
| 9 |
+
from safetensors.torch import load_file as safetensors_load_file
|
| 10 |
+
from transformers import AutoConfig, AutoModel
|
| 11 |
+
from transformers.models.gemma3n.modeling_gemma3n import Gemma3nMultimodalEmbedder # noqa
|
| 12 |
+
|
| 13 |
+
from utils import load_json
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _split_state_dict_from_tmp(sd: Dict[str, torch.Tensor]) \
|
| 17 |
+
-> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]:
|
| 18 |
+
"""
|
| 19 |
+
The model extractor saved tmp.state_dict(), where tmp has attributes:
|
| 20 |
+
- vision_tower
|
| 21 |
+
- embed_vision (optional)
|
| 22 |
+
So keys look like:
|
| 23 |
+
- vision_tower.xxx
|
| 24 |
+
- embed_vision.xxx
|
| 25 |
+
"""
|
| 26 |
+
vt = {}
|
| 27 |
+
ev = {}
|
| 28 |
+
for k, v in sd.items():
|
| 29 |
+
if k.startswith("vision_tower."):
|
| 30 |
+
vt[k[len("vision_tower."):]] = v
|
| 31 |
+
elif k.startswith("embed_vision."):
|
| 32 |
+
ev[k[len("embed_vision."):]] = v
|
| 33 |
+
return vt, ev
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# ============================================================
|
| 37 |
+
# Optional lightweight learnable token reducer
|
| 38 |
+
# ============================================================
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class VisionTokenReducer(nn.Module):
|
| 42 |
+
"""
|
| 43 |
+
Perceiver-style learnable cross-attention pooling with optional bottleneck.
|
| 44 |
+
|
| 45 |
+
Base (no bottleneck):
|
| 46 |
+
[B,T,D] -> [B,K,D]
|
| 47 |
+
|
| 48 |
+
Bottleneck mode (bottleneck_dim=d):
|
| 49 |
+
[B,T,D] -> down -> [B,T,d] -> cross-attn -> [B,K,d] -> (optional up) -> [B,K,D]
|
| 50 |
+
|
| 51 |
+
Notes:
|
| 52 |
+
- num_heads does NOT change parameter count of MultiheadAttention (depends on D only).
|
| 53 |
+
- perform_norm_latent controls whether to pre-norm the learnable latent queries.
|
| 54 |
+
"""
|
| 55 |
+
|
| 56 |
+
def __init__(
|
| 57 |
+
self,
|
| 58 |
+
vision_dim: int,
|
| 59 |
+
num_output_tokens: int,
|
| 60 |
+
num_heads: int = 4,
|
| 61 |
+
perform_norm_latent: bool = True,
|
| 62 |
+
bottleneck_dim: Optional[int] = None,
|
| 63 |
+
project_back: bool = True,
|
| 64 |
+
):
|
| 65 |
+
super().__init__()
|
| 66 |
+
|
| 67 |
+
self.vision_dim = int(vision_dim)
|
| 68 |
+
self.num_output_tokens = int(num_output_tokens)
|
| 69 |
+
self.num_heads = int(num_heads)
|
| 70 |
+
self.perform_norm_latent = bool(perform_norm_latent)
|
| 71 |
+
|
| 72 |
+
self.bottleneck_dim = None if bottleneck_dim is None else int(bottleneck_dim)
|
| 73 |
+
self.project_back = bool(project_back)
|
| 74 |
+
|
| 75 |
+
# Decide the attention working dimension: D (base) or d (bottleneck)
|
| 76 |
+
attn_dim = self.vision_dim if self.bottleneck_dim is None else self.bottleneck_dim
|
| 77 |
+
if attn_dim % self.num_heads != 0:
|
| 78 |
+
raise ValueError(f"embed_dim ({attn_dim}) must be divisible by num_heads ({self.num_heads})")
|
| 79 |
+
|
| 80 |
+
# Optional projection layers for bottleneck mode
|
| 81 |
+
if self.bottleneck_dim is None:
|
| 82 |
+
self.down = None
|
| 83 |
+
self.up = None
|
| 84 |
+
else:
|
| 85 |
+
# bias=False keeps it lightweight; switch to True if you prefer
|
| 86 |
+
self.down = nn.Linear(self.vision_dim, attn_dim, bias=False)
|
| 87 |
+
self.up = nn.Linear(attn_dim, self.vision_dim, bias=False) if self.project_back else None
|
| 88 |
+
|
| 89 |
+
# Learnable latent tokens (K, attn_dim)
|
| 90 |
+
self.latents = nn.Parameter(torch.randn(self.num_output_tokens, attn_dim) * 0.02)
|
| 91 |
+
|
| 92 |
+
# Separate norms: typically more stable than sharing one LN
|
| 93 |
+
self.norm_latents = nn.LayerNorm(attn_dim)
|
| 94 |
+
self.norm_x = nn.LayerNorm(attn_dim)
|
| 95 |
+
|
| 96 |
+
# Cross-attention: query=latents, key/value=x
|
| 97 |
+
self.attn = nn.MultiheadAttention(
|
| 98 |
+
embed_dim=attn_dim,
|
| 99 |
+
num_heads=self.num_heads,
|
| 100 |
+
batch_first=True,
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
def init_weights(self, std: float = 0.02):
|
| 104 |
+
# Optional bottleneck projections
|
| 105 |
+
if self.down is not None:
|
| 106 |
+
nn.init.normal_(self.down.weight, std=std)
|
| 107 |
+
if self.up is not None:
|
| 108 |
+
nn.init.normal_(self.up.weight, std=std)
|
| 109 |
+
|
| 110 |
+
# Learnable latent queries
|
| 111 |
+
nn.init.normal_(self.latents, std=std)
|
| 112 |
+
|
| 113 |
+
# LayerNorm
|
| 114 |
+
nn.init.ones_(self.norm_latents.weight)
|
| 115 |
+
nn.init.zeros_(self.norm_latents.bias)
|
| 116 |
+
nn.init.ones_(self.norm_x.weight)
|
| 117 |
+
nn.init.zeros_(self.norm_x.bias)
|
| 118 |
+
|
| 119 |
+
# MultiheadAttention: use PyTorch's own reset only
|
| 120 |
+
self.attn._reset_parameters() # noqa
|
| 121 |
+
|
| 122 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 123 |
+
"""
|
| 124 |
+
Args:
|
| 125 |
+
x: [B, T, D] where D == vision_dim
|
| 126 |
+
|
| 127 |
+
Returns:
|
| 128 |
+
out: [B, K, D] if (bottleneck_dim is None) or project_back=True
|
| 129 |
+
[B, K, d] if bottleneck_dim is not None and project_back=False
|
| 130 |
+
"""
|
| 131 |
+
if x.dim() != 3:
|
| 132 |
+
raise ValueError(f"Expected x [B,T,D], got {tuple(x.shape)}")
|
| 133 |
+
if x.size(-1) != self.vision_dim:
|
| 134 |
+
raise ValueError(f"Expected last dim D={self.vision_dim}, got {x.size(-1)}")
|
| 135 |
+
|
| 136 |
+
B = x.size(0)
|
| 137 |
+
|
| 138 |
+
# Bottleneck projection if enabled
|
| 139 |
+
if self.down is not None:
|
| 140 |
+
x = self.down(x) # [B,T,d]
|
| 141 |
+
|
| 142 |
+
# Expand learnable latents across batch
|
| 143 |
+
latents = self.latents.unsqueeze(0).expand(B, -1, -1) # [B,K,attn_dim]
|
| 144 |
+
|
| 145 |
+
# Pre-norm (optional for latents, always for input tokens)
|
| 146 |
+
if self.perform_norm_latent:
|
| 147 |
+
latents = self.norm_latents(latents)
|
| 148 |
+
x = self.norm_x(x)
|
| 149 |
+
|
| 150 |
+
# Cross-attention pooling
|
| 151 |
+
out, _ = self.attn(query=latents, key=x, value=x) # [B,K,attn_dim]
|
| 152 |
+
|
| 153 |
+
# Project back to original dim if requested
|
| 154 |
+
if self.up is not None:
|
| 155 |
+
out = self.up(out) # [B,K,D]
|
| 156 |
+
|
| 157 |
+
return out
|
| 158 |
+
|
| 159 |
+
|
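A quick shape walk-through of the reducer in bottleneck mode; the widths are illustrative (2048 stands in for the vision width, 768 for the bottleneck):

```python
import torch

reducer = VisionTokenReducer(
    vision_dim=2048,       # D: width of incoming vision tokens
    num_output_tokens=32,  # K
    num_heads=4,
    bottleneck_dim=768,    # d: cross-attention runs at this width
    project_back=True,     # map [B, K, d] back up to [B, K, D]
)
reducer.init_weights()

x = torch.randn(2, 256, 2048)  # [B, T, D]
print(reducer(x).shape)        # torch.Size([2, 32, 2048]); [2, 32, 768] if project_back=False
```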
| 160 |
+
# ============================================================
|
| 161 |
+
# Main body
|
| 162 |
+
# ============================================================
|
| 163 |
+
|
| 164 |
+
class Gemma3nVisionFeatureExtractor(nn.Module):
|
| 165 |
+
"""
|
| 166 |
+
Vision-only feature extractor for Gemma-3n that matches transformers' Gemma3nModel.get_image_features().
|
| 167 |
+
|
| 168 |
+
Input: pixel_values [B, 3, H, W]
|
| 169 |
+
Output: image_features [B, vision_soft_tokens_per_image, text_hidden_size]
|
| 170 |
+
"""
|
| 171 |
+
|
| 172 |
+
def __init__(
|
| 173 |
+
self,
|
| 174 |
+
vision_tower: nn.Module,
|
| 175 |
+
embed_vision: Optional[nn.Module],
|
| 176 |
+
vision_hidden_size: int,
|
| 177 |
+
vision_soft_tokens_per_image: int,
|
| 178 |
+
text_hidden_size: int,
|
| 179 |
+
num_output_tokens_reduced: Optional[int] = None,
|
| 180 |
+
num_heads_for_token_reduction: int = 4,
|
| 181 |
+
perform_norm_latent_for_token_reduction: bool = True,
|
| 182 |
+
reducer_bottleneck_dim: Optional[int] = None,
|
| 183 |
+
reducer_project_back: bool = True,
|
| 184 |
+
):
|
| 185 |
+
super().__init__()
|
| 186 |
+
self.vision_tower = vision_tower
|
| 187 |
+
self.embed_vision = embed_vision
|
| 188 |
+
self.vision_hidden_size = int(vision_hidden_size)
|
| 189 |
+
self.vision_soft_tokens_per_image = int(vision_soft_tokens_per_image)
|
| 190 |
+
self.text_hidden_size = int(text_hidden_size)
|
| 191 |
+
self.has_embed_vision = embed_vision is not None
|
| 192 |
+
|
| 193 |
+
# Freeze vision modules
|
| 194 |
+
self.vision_tower.requires_grad_(False)
|
| 195 |
+
if self.embed_vision is not None:
|
| 196 |
+
self.embed_vision.requires_grad_(False)
|
| 197 |
+
|
| 198 |
+
# Reduce number of tokens
|
| 199 |
+
if num_output_tokens_reduced is not None:
|
| 200 |
+
reducer_dim = text_hidden_size if self.has_embed_vision else vision_hidden_size
|
| 201 |
+
self.reducer = VisionTokenReducer(
|
| 202 |
+
vision_dim=reducer_dim,
|
| 203 |
+
num_output_tokens=num_output_tokens_reduced,
|
| 204 |
+
num_heads=num_heads_for_token_reduction,
|
| 205 |
+
perform_norm_latent=perform_norm_latent_for_token_reduction,
|
| 206 |
+
bottleneck_dim=reducer_bottleneck_dim,
|
| 207 |
+
project_back=reducer_project_back,
|
| 208 |
+
)
|
| 209 |
+
else:
|
| 210 |
+
self.reducer = None
|
| 211 |
+
|
| 212 |
+
def init_weights(self, std: float = 0.02):
|
| 213 |
+
if self.reducer is not None:
|
| 214 |
+
self.reducer.init_weights(std)
|
| 215 |
+
|
| 216 |
+
def get_actual_hidden_dim(self) -> int:
|
| 217 |
+
"""
|
| 218 |
+
Return the actual feature hidden dimension produced by this extractor.
|
| 219 |
+
|
| 220 |
+
The output dimension depends on:
|
| 221 |
+
- whether embed_vision is used
|
| 222 |
+
- whether a reducer is present
|
| 223 |
+
- reducer bottleneck + project_back configuration
|
| 224 |
+
|
| 225 |
+
Returns:
|
| 226 |
+
int: feature hidden size of output tokens
|
| 227 |
+
"""
|
| 228 |
+
|
| 229 |
+
# Base dimension before reducer
|
| 230 |
+
base_dim = self.text_hidden_size if self.has_embed_vision else self.vision_hidden_size
|
| 231 |
+
|
| 232 |
+
# No reducer
|
| 233 |
+
if self.reducer is None:
|
| 234 |
+
return base_dim
|
| 235 |
+
|
| 236 |
+
# Reducer without bottleneck
|
| 237 |
+
if self.reducer.bottleneck_dim is None:
|
| 238 |
+
return base_dim
|
| 239 |
+
|
| 240 |
+
# Bottleneck reducer
|
| 241 |
+
if self.reducer.project_back:
|
| 242 |
+
return base_dim
|
| 243 |
+
|
| 244 |
+
# Bottleneck without projection back
|
| 245 |
+
return int(self.reducer.bottleneck_dim)
|
| 246 |
+
|
| 247 |
+
def train(self, mode: bool = True) -> "Gemma3nVisionFeatureExtractor":
|
| 248 |
+
""" Override train(): vision is not trainable"""
|
| 249 |
+
super().train(mode=mode)
|
| 250 |
+
self.vision_tower.eval()
|
| 251 |
+
if self.embed_vision is not None:
|
| 252 |
+
self.embed_vision.eval()
|
| 253 |
+
return self
|
| 254 |
+
|
| 255 |
+
def forward(
|
| 256 |
+
self,
|
| 257 |
+
pixel_values: torch.Tensor,
|
| 258 |
+
valid_positions: Optional[torch.Tensor] = None,
|
| 259 |
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 260 |
+
"""
|
| 261 |
+
Args:
|
| 262 |
+
pixel_values: [B, 3, H, W]
|
| 263 |
+
valid_positions:
|
| 264 |
+
Indicates which samples have valid images.
|
| 265 |
+
Supported formats:
|
| 266 |
+
- BoolTensor [B] where True means "has image"
|
| 267 |
+
- LongTensor [K] with indices of samples that have images
|
| 268 |
+
If None: assume all samples have images.
|
| 269 |
+
|
| 270 |
+
Returns:
|
| 271 |
+
features: [B, T_img, D]
|
| 272 |
+
vision_mask: [B, T_img] (1=valid vision token, 0=masked out)
|
| 273 |
+
"""
|
| 274 |
+
if pixel_values.dim() != 4:
|
| 275 |
+
raise ValueError(f"pixel_values must be [B,3,H,W], got {tuple(pixel_values.shape)}")
|
| 276 |
+
|
| 277 |
+
B = pixel_values.size(0)
|
| 278 |
+
device = next(self.vision_tower.parameters()).device
|
| 279 |
+
dtype = next(self.vision_tower.parameters()).dtype
|
| 280 |
+
|
| 281 |
+
# --------------------------------------------------------
|
| 282 |
+
# Build per-sample valid-image mask
|
| 283 |
+
# --------------------------------------------------------
|
| 284 |
+
if valid_positions is None:
|
| 285 |
+
valid_mask = torch.ones(B, dtype=torch.bool, device=pixel_values.device)
|
| 286 |
+
else:
|
| 287 |
+
if valid_positions.dtype == torch.bool:
|
| 288 |
+
if valid_positions.shape != (B,):
|
| 289 |
+
raise ValueError(f"valid_positions (bool) must be [B], got {tuple(valid_positions.shape)}")
|
| 290 |
+
valid_mask = valid_positions.to(device=pixel_values.device)
|
| 291 |
+
else:
|
| 292 |
+
if valid_positions.dim() != 1:
|
| 293 |
+
raise ValueError(f"valid_positions (indices) must be 1D, got {tuple(valid_positions.shape)}")
|
| 294 |
+
valid_mask = torch.zeros(B, dtype=torch.bool, device=pixel_values.device)
|
| 295 |
+
valid_mask[valid_positions.to(device=pixel_values.device, dtype=torch.long)] = True
|
| 296 |
+
|
| 297 |
+
num_valid = int(valid_mask.sum().item())
|
| 298 |
+
|
| 299 |
+
# --------------------------------------------------------
|
| 300 |
+
# Figure out final output shape in advance
|
| 301 |
+
# --------------------------------------------------------
|
| 302 |
+
if self.reducer is None:
|
| 303 |
+
T_img = self.vision_soft_tokens_per_image
|
| 304 |
+
else:
|
| 305 |
+
T_img = self.reducer.num_output_tokens
|
| 306 |
+
|
| 307 |
+
D_out = self.get_actual_hidden_dim()
|
| 308 |
+
|
| 309 |
+
# vision_mask always returned for full batch
|
| 310 |
+
vision_mask = valid_mask[:, None].expand(B, T_img).to(dtype=torch.long)
|
| 311 |
+
|
| 312 |
+
# Fast path: no valid image at all
|
| 313 |
+
if num_valid == 0:
|
| 314 |
+
features = torch.zeros(B, T_img, D_out, device=device, dtype=dtype)
|
| 315 |
+
return features, vision_mask
|
| 316 |
+
|
| 317 |
+
# --------------------------------------------------------
|
| 318 |
+
# Run only valid samples through frozen vision stack
|
| 319 |
+
# --------------------------------------------------------
|
| 320 |
+
pixel_values_valid = pixel_values[valid_mask].to(device=device, dtype=dtype)
|
| 321 |
+
|
| 322 |
+
with torch.no_grad():
|
| 323 |
+
vision_last = self.vision_tower(
|
| 324 |
+
pixel_values=pixel_values_valid,
|
| 325 |
+
do_pooling=False,
|
| 326 |
+
return_dict=True,
|
| 327 |
+
).last_hidden_state
|
| 328 |
+
|
| 329 |
+
if vision_last.dim() != 4:
|
| 330 |
+
raise RuntimeError(f"Expected vision last_hidden_state (B,C,h,w), got {tuple(vision_last.shape)}")
|
| 331 |
+
|
| 332 |
+
Bv, C, h, w = vision_last.shape
|
| 333 |
+
if Bv != num_valid:
|
| 334 |
+
raise RuntimeError("Batch size mismatch between valid pixel_values and vision_last")
|
| 335 |
+
if C != self.vision_hidden_size:
|
| 336 |
+
raise RuntimeError(f"Expected vision_hidden_size={self.vision_hidden_size}, got C={C}")
|
| 337 |
+
if h * w != self.vision_soft_tokens_per_image:
|
| 338 |
+
raise RuntimeError(
|
| 339 |
+
f"Expected h*w={self.vision_soft_tokens_per_image}, got {h * w}. "
|
| 340 |
+
f"Check processor image size/crop or config."
|
| 341 |
+
)
|
| 342 |
+
|
| 343 |
+
# (Bv, C, h, w) -> (Bv, C, HW) -> (Bv, HW, C)
|
| 344 |
+
vision_tokens = vision_last.reshape(Bv, C, self.vision_soft_tokens_per_image).permute(0, 2, 1).contiguous()
|
| 345 |
+
|
| 346 |
+
# Scale by sqrt(C) (matches Gemma codepath)
|
| 347 |
+
vision_tokens = vision_tokens * (self.vision_hidden_size ** 0.5)
|
| 348 |
+
|
| 349 |
+
# --------------------------------------------------------
|
| 350 |
+
# Extract valid-image features only
|
| 351 |
+
# --------------------------------------------------------
|
| 352 |
+
if not self.has_embed_vision:
|
| 353 |
+
valid_features = vision_tokens # [Bv, HW, C]
|
| 354 |
+
if self.reducer is not None:
|
| 355 |
+
valid_features = self.reducer(valid_features) # [Bv, T_img, C or d]
|
| 356 |
+
else:
|
| 357 |
+
with torch.no_grad():
|
| 358 |
+
valid_features = self.embed_vision(inputs_embeds=vision_tokens)
|
| 359 |
+
|
| 360 |
+
if valid_features.shape != (Bv, self.vision_soft_tokens_per_image, self.text_hidden_size):
|
| 361 |
+
raise RuntimeError(
|
| 362 |
+
f"Bad output shape {tuple(valid_features.shape)}; expected "
|
| 363 |
+
f"({Bv}, {self.vision_soft_tokens_per_image}, {self.text_hidden_size})"
|
| 364 |
+
)
|
| 365 |
+
|
| 366 |
+
if self.reducer is not None:
|
| 367 |
+
valid_features = self.reducer(valid_features)
|
| 368 |
+
|
| 369 |
+
# --------------------------------------------------------
|
| 370 |
+
# Scatter back to full batch; invalid samples stay zero
|
| 371 |
+
# --------------------------------------------------------
|
| 372 |
+
if valid_features.size(1) != T_img:
|
| 373 |
+
raise RuntimeError(f"T_img mismatch: expected {T_img}, got {valid_features.size(1)}")
|
| 374 |
+
if valid_features.size(2) != D_out:
|
| 375 |
+
raise RuntimeError(f"D_out mismatch: expected {D_out}, got {valid_features.size(2)}")
|
| 376 |
+
|
| 377 |
+
features = torch.zeros(B, T_img, D_out, device=valid_features.device, dtype=valid_features.dtype)
|
| 378 |
+
features[valid_mask] = valid_features
|
| 379 |
+
|
| 380 |
+
return features, vision_mask
|
| 381 |
+
|
| 382 |
+
@classmethod
|
| 383 |
+
def from_pretrained_vision_only_dir(
|
| 384 |
+
cls,
|
| 385 |
+
model_dir: str,
|
| 386 |
+
map_location: str = "cpu",
|
| 387 |
+
num_output_tokens_reduced: Optional[int] = None,
|
| 388 |
+
num_heads_for_token_reduction: int = 4,
|
| 389 |
+
perform_norm_latent_for_token_reduction: bool = True,
|
| 390 |
+
reducer_bottleneck_dim: Optional[int] = None,
|
| 391 |
+
reducer_project_back: bool = True,
|
| 392 |
+
) -> "Gemma3nVisionFeatureExtractor":
|
| 393 |
+
weights_path = os.path.join(model_dir, "model.safetensors")
|
| 394 |
+
if not os.path.isfile(weights_path):
|
| 395 |
+
raise FileNotFoundError(f"Missing weights: {weights_path}")
|
| 396 |
+
|
| 397 |
+
ve_cfg_path = os.path.join(model_dir, "vision_extractor_config.json")
|
| 398 |
+
if not os.path.isfile(ve_cfg_path):
|
| 399 |
+
raise FileNotFoundError(f"Missing {ve_cfg_path}")
|
| 400 |
+
ve_cfg = load_json(ve_cfg_path)
|
| 401 |
+
|
| 402 |
+
vision_soft_tokens_per_image = int(ve_cfg.get("vision_soft_tokens_per_image", 256))
|
| 403 |
+
vision_hidden_size = int(ve_cfg.get("vision_hidden_size", -1))
|
| 404 |
+
text_hidden_size = int(ve_cfg.get("text_hidden_size", -1))
|
| 405 |
+
has_embed_vision = bool(ve_cfg.get("has_embed_vision", True))
|
| 406 |
+
|
| 407 |
+
if vision_hidden_size <= 0:
|
| 408 |
+
raise ValueError("vision_hidden_size missing/invalid in vision_extractor_config.json")
|
| 409 |
+
if has_embed_vision and text_hidden_size <= 0:
|
| 410 |
+
raise ValueError("text_hidden_size missing/invalid in vision_extractor_config.json")
|
| 411 |
+
|
| 412 |
+
cfg = AutoConfig.from_pretrained(model_dir, trust_remote_code=True, local_files_only=True)
|
| 413 |
+
vision_cfg = getattr(cfg, "vision_config", cfg)
|
| 414 |
+
text_cfg = getattr(cfg, "text_config", None)
|
| 415 |
+
|
| 416 |
+
vision_tower = AutoModel.from_config(vision_cfg, trust_remote_code=True)
|
| 417 |
+
|
| 418 |
+
embed_vision = None
|
| 419 |
+
if has_embed_vision:
|
| 420 |
+
if text_cfg is None:
|
| 421 |
+
raise RuntimeError(
|
| 422 |
+
"config.json does not contain text_config, but has_embed_vision=True. "
|
| 423 |
+
"You need a Gemma3nConfig-like config.json in this folder."
|
| 424 |
+
)
|
| 425 |
+
embed_vision = Gemma3nMultimodalEmbedder(vision_cfg, text_cfg)
|
| 426 |
+
|
| 427 |
+
sd = safetensors_load_file(weights_path, device=map_location)
|
| 428 |
+
|
| 429 |
+
vt_sd, ev_sd = _split_state_dict_from_tmp(sd)
|
| 430 |
+
if not vt_sd:
|
| 431 |
+
raise RuntimeError("No vision_tower.* keys found in model.safetensors")
|
| 432 |
+
if has_embed_vision and not ev_sd:
|
| 433 |
+
raise RuntimeError("has_embed_vision=True but no embed_vision.* keys found in model.safetensors")
|
| 434 |
+
|
| 435 |
+
missing_vt, unexpected_vt = vision_tower.load_state_dict(vt_sd, strict=True)
|
| 436 |
+
if missing_vt or unexpected_vt:
|
| 437 |
+
raise RuntimeError(f"vision_tower load mismatch: missing={missing_vt}, unexpected={unexpected_vt}")
|
| 438 |
+
|
| 439 |
+
if has_embed_vision:
|
| 440 |
+
missing_ev, unexpected_ev = embed_vision.load_state_dict(ev_sd, strict=True)
|
| 441 |
+
if missing_ev or unexpected_ev:
|
| 442 |
+
raise RuntimeError(f"embed_vision load mismatch: missing={missing_ev}, unexpected={unexpected_ev}")
|
| 443 |
+
|
| 444 |
+
vision_tower.eval()
|
| 445 |
+
if embed_vision is not None:
|
| 446 |
+
embed_vision.eval()
|
| 447 |
+
|
| 448 |
+
model = cls(
|
| 449 |
+
vision_tower=vision_tower,
|
| 450 |
+
embed_vision=embed_vision,
|
| 451 |
+
vision_hidden_size=vision_hidden_size,
|
| 452 |
+
vision_soft_tokens_per_image=vision_soft_tokens_per_image,
|
| 453 |
+
text_hidden_size=text_hidden_size if has_embed_vision else vision_hidden_size,
|
| 454 |
+
num_output_tokens_reduced=num_output_tokens_reduced,
|
| 455 |
+
num_heads_for_token_reduction=num_heads_for_token_reduction,
|
| 456 |
+
perform_norm_latent_for_token_reduction=perform_norm_latent_for_token_reduction,
|
| 457 |
+
reducer_bottleneck_dim=reducer_bottleneck_dim,
|
| 458 |
+
reducer_project_back=reducer_project_back,
|
| 459 |
+
)
|
| 460 |
+
model.eval()
|
| 461 |
+
return model
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
def _demo_main():
|
| 465 |
+
import argparse
|
| 466 |
+
from PIL import Image
|
| 467 |
+
from transformers import AutoProcessor
|
| 468 |
+
from pathlib import Path
|
| 469 |
+
|
| 470 |
+
parser = argparse.ArgumentParser()
|
| 471 |
+
parser.add_argument("--model_dir", type=str, default="./model_weights/gemma3n_E2B_vision_only")
|
| 472 |
+
parser.add_argument("--device", type=str, default=None)
|
| 473 |
+
parser.add_argument("--dtype", type=str, default="float32", choices=["bfloat16", "float16", "float32"])
|
| 474 |
+
parser.add_argument("--num_output_tokens_reduced", type=int, default=32)
|
| 475 |
+
parser.add_argument("--reducer_bottleneck_dim", type=int, default=768)
|
| 476 |
+
parser.add_argument("--reducer_project_back", action="store_true")
|
| 477 |
+
args = parser.parse_args()
|
| 478 |
+
|
| 479 |
+
model_dir = str(Path(args.model_dir).resolve())
|
| 480 |
+
|
| 481 |
+
# Force local loading
|
| 482 |
+
processor = AutoProcessor.from_pretrained(model_dir, trust_remote_code=True, local_files_only=True)
|
| 483 |
+
|
| 484 |
+
model = Gemma3nVisionFeatureExtractor.from_pretrained_vision_only_dir(
|
| 485 |
+
model_dir=model_dir,
|
| 486 |
+
map_location="cpu",
|
| 487 |
+
num_output_tokens_reduced=args.num_output_tokens_reduced,
|
| 488 |
+
num_heads_for_token_reduction=4,
|
| 489 |
+
reducer_bottleneck_dim=args.reducer_bottleneck_dim,
|
| 490 |
+
reducer_project_back=args.reducer_project_back,
|
| 491 |
+
)
|
| 492 |
+
model.init_weights()
|
| 493 |
+
model.to(device=args.device, dtype=getattr(torch, args.dtype))  # args.dtype is a string; resolve it to a torch.dtype
|
| 494 |
+
model.eval()
|
| 495 |
+
|
| 496 |
+
def count_params(module):
|
| 497 |
+
return sum(p.numel() for p in module.parameters())
|
| 498 |
+
|
| 499 |
+
vision_params = count_params(model.vision_tower)
|
| 500 |
+
|
| 501 |
+
embed_params = 0
|
| 502 |
+
if model.has_embed_vision and model.embed_vision is not None:
|
| 503 |
+
embed_params = count_params(model.embed_vision)
|
| 504 |
+
|
| 505 |
+
reducer_params = 0
|
| 506 |
+
if model.reducer is not None:
|
| 507 |
+
reducer_params = count_params(model.reducer)
|
| 508 |
+
|
| 509 |
+
frozen_params = vision_params + embed_params
|
| 510 |
+
total_params = frozen_params + reducer_params
|
| 511 |
+
|
| 512 |
+
print(f"Vision tower parameters (frozen): {vision_params:,}")
|
| 513 |
+
|
| 514 |
+
if model.has_embed_vision:
|
| 515 |
+
print(f"Embed vision parameters (frozen): {embed_params:,}")
|
| 516 |
+
else:
|
| 517 |
+
print("Embed vision: NONE")
|
| 518 |
+
|
| 519 |
+
if model.reducer is not None:
|
| 520 |
+
print(f"Reducer parameters (trainable): {reducer_params:,}")
|
| 521 |
+
else:
|
| 522 |
+
print("Reducer: NONE")
|
| 523 |
+
|
| 524 |
+
print(f"Total frozen parameters: {frozen_params:,}")
|
| 525 |
+
print(f"Total trainable parameters: {reducer_params:,}")
|
| 526 |
+
print(f"Total parameters: {total_params:,}")
|
| 527 |
+
|
| 528 |
+
img1 = Image.new("RGB", (768, 768), color=(0, 0, 0))
|
| 529 |
+
img2 = Image.new("RGB", (768, 768), color=(255, 255, 255))
|
| 530 |
+
|
| 531 |
+
inputs = processor(
|
| 532 |
+
text=["", ""],
|
| 533 |
+
images=[[img1], [img2]],
|
| 534 |
+
return_tensors="pt",
|
| 535 |
+
)
|
| 536 |
+
|
| 537 |
+
pixel_values = inputs["pixel_values"].to(
|
| 538 |
+
device=next(model.parameters()).device,
|
| 539 |
+
dtype=next(model.parameters()).dtype,
|
| 540 |
+
)
|
| 541 |
+
|
| 542 |
+
print("pixel_values:", tuple(pixel_values.shape), pixel_values.dtype, pixel_values.device)
|
| 543 |
+
|
| 544 |
+
with torch.no_grad():
|
| 545 |
+
feats, masks = model(pixel_values)
|
| 546 |
+
|
| 547 |
+
print("features:", tuple(feats.shape), feats.dtype, feats.device)
|
| 548 |
+
print("masks:", tuple(masks.shape), masks.dtype, masks.device)
|
| 549 |
+
|
| 550 |
+
|
| 551 |
+
if __name__ == "__main__":
|
| 552 |
+
_demo_main()
|
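A minimal standalone sketch of the per-sample image mask, assuming the vision-only checkpoint directory shipped in model_weights/ is available locally and the processor's 768x768 input size; everything else mirrors the forward contract documented above:

```python
import torch

model = Gemma3nVisionFeatureExtractor.from_pretrained_vision_only_dir(
    model_dir="./model_weights/gemma3n_E2B_vision_only",
    num_output_tokens_reduced=32,
)
model.init_weights()
model.eval()

B = 3
pixel_values = torch.zeros(B, 3, 768, 768)     # sample 1 has no photo, so its row is dummy
has_image = torch.tensor([True, False, True])  # bool [B] form of valid_positions

with torch.no_grad():
    feats, mask = model(pixel_values, valid_positions=has_image)

print(feats.shape)  # [3, 32, D]; row 1 is all zeros
print(mask[:, 0])   # tensor([1, 0, 1])
```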
modelling/layer.py
ADDED
|
@@ -0,0 +1,353 @@
|
| 1 |
+
# layer.py
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
import math
|
| 5 |
+
from typing import Optional, Tuple
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn as nn
|
| 9 |
+
import torch.nn.functional as F # noqa
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class RMSNorm(nn.Module):
|
| 13 |
+
def __init__(self, dim: int, eps: float = 1e-6):
|
| 14 |
+
super().__init__()
|
| 15 |
+
self.eps = float(eps)
|
| 16 |
+
self.weight = nn.Parameter(torch.ones(dim))
|
| 17 |
+
|
| 18 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 19 |
+
# x: [..., dim]
|
| 20 |
+
x_float = x.float()
|
| 21 |
+
rms = x_float.pow(2).mean(dim=-1, keepdim=True).add(self.eps).sqrt()
|
| 22 |
+
y = (x_float / rms).to(dtype=x.dtype)
|
| 23 |
+
return y * self.weight.to(dtype=x.dtype, device=x.device)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class SwiGLU(nn.Module):
|
| 27 |
+
@staticmethod
|
| 28 |
+
def forward(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
|
| 29 |
+
return nn.functional.silu(gate) * up
|
| 30 |
+
|
| 31 |
+
|
class TabularImageGQALayer(nn.Module):
    """
    Pre-norm Transformer block with:
    - Tabular tokens produce Q; tabular+image produce KV (image optional)
    - GQA: num_query_heads is a multiple of num_kv_heads
    - Numeric+categorical must be concatenated before calling this layer (one tabular stream)
    - attention_mask is 1D [B, T_tab] and does not include vision tokens
    - If vision_features is None, attention is tabular-only
    - Vision tokens are not updated (no Q for vision)
    """

    def __init__(
        self,
        tabular_dim: int,
        vision_dim: int,
        num_query_heads: int,
        num_kv_heads: int,
        head_dim: int,
        mlp_ratio: float = 4.0,
        dropout: float = 0.0,
        rmsnorm_eps: float = 1e-6,
    ):
        super().__init__()

        if num_query_heads % num_kv_heads != 0:
            raise ValueError("num_query_heads must be a multiple of num_kv_heads")

        self.tabular_dim = int(tabular_dim)
        self.vision_dim = int(vision_dim)
        self.num_query_heads = int(num_query_heads)
        self.num_kv_heads = int(num_kv_heads)
        self.head_dim = int(head_dim)

        self.q_dim = self.num_query_heads * self.head_dim
        self.kv_dim = self.num_kv_heads * self.head_dim
        self.group_size = self.num_query_heads // self.num_kv_heads

        self.attn_norm = RMSNorm(self.tabular_dim, eps=rmsnorm_eps)

        # Tabular projections (shared for numeric+categorical stream)
        self.q_proj_tab = nn.Linear(self.tabular_dim, self.q_dim, bias=False)
        self.k_proj_tab = nn.Linear(self.tabular_dim, self.kv_dim, bias=False)
        self.v_proj_tab = nn.Linear(self.tabular_dim, self.kv_dim, bias=False)

        # Vision KV projections (separate; vision has no Q)
        self.k_proj_img = nn.Linear(self.vision_dim, self.kv_dim, bias=False)
        self.v_proj_img = nn.Linear(self.vision_dim, self.kv_dim, bias=False)

        self.o_proj = nn.Linear(self.q_dim, self.tabular_dim, bias=False)

        self.attn_dropout = float(dropout)
        self.resid_dropout = float(dropout)

        # FFN (LLM-style: gated MLP with SwiGLU)
        self.ffn_norm = RMSNorm(self.tabular_dim, eps=rmsnorm_eps)
        ffn_dim = int(round(self.tabular_dim * float(mlp_ratio)))

        self.gate_proj = nn.Linear(self.tabular_dim, ffn_dim, bias=False)
        self.up_proj = nn.Linear(self.tabular_dim, ffn_dim, bias=False)
        self.down_proj = nn.Linear(ffn_dim, self.tabular_dim, bias=False)
        self.act = SwiGLU()

    def init_weights(self, std: float = 0.02):
        # RMSNorm
        nn.init.ones_(self.attn_norm.weight)
        nn.init.ones_(self.ffn_norm.weight)

        # Attention projections
        nn.init.normal_(self.q_proj_tab.weight, std=std)
        nn.init.normal_(self.k_proj_tab.weight, std=std)
        nn.init.normal_(self.v_proj_tab.weight, std=std)
        nn.init.normal_(self.k_proj_img.weight, std=std)
        nn.init.normal_(self.v_proj_img.weight, std=std)
        nn.init.normal_(self.o_proj.weight, std=std)

        # FFN
        nn.init.normal_(self.gate_proj.weight, std=std)
        nn.init.normal_(self.up_proj.weight, std=std)
        nn.init.normal_(self.down_proj.weight, std=std)

    @staticmethod
    def _make_key_bias_from_mask(mask_1d: torch.Tensor, key_len: int) -> torch.Tensor:
        """
        mask_1d: [B, T_key] with 1=keep, 0=mask
        returns: [B, 1, 1, T_key] float bias with 0 for keep and a large negative value (-1e9) for mask
        """
        if mask_1d.dtype != torch.float32:
            mask_f = mask_1d.float()
        else:
            mask_f = mask_1d
        if mask_f.shape[1] != key_len:
            raise ValueError(f"mask_1d width mismatch: got {mask_f.shape[1]} expected {key_len}")
        bias = (1.0 - mask_f) * -1e9
        return bias.view(mask_f.shape[0], 1, 1, key_len)
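
    # Illustration (added comment, not original): for mask_1d = [[1, 1, 0]] the
    # bias is [[[[0, 0, -1e9]]]]; adding it to attention scores before softmax
    # drives masked-key probabilities to ~0 without the NaN risk of a true -inf.
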
    def _split_heads_q(self, x: torch.Tensor) -> torch.Tensor:
        # x: [B, T, Hq*d] -> [B, Hq, T, d]
        B, T, _ = x.shape
        return x.view(B, T, self.num_query_heads, self.head_dim).transpose(1, 2).contiguous()

    def _split_heads_kv(self, x: torch.Tensor) -> torch.Tensor:
        # x: [B, T, Hkv*d] -> [B, Hkv, T, d]
        B, T, _ = x.shape
        return x.view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2).contiguous()

    @staticmethod
    def _merge_heads_q(x: torch.Tensor) -> torch.Tensor:
        # x: [B, Hq, T, d] -> [B, T, Hq*d]
        B, H, T, d = x.shape
        return x.transpose(1, 2).contiguous().view(B, T, H * d)

    def forward(
        self,
        x_tab: torch.Tensor,
        attention_mask: torch.Tensor,
        vision_features: Optional[torch.Tensor] = None,
        vision_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        x_tab: [B, T_tab, tabular_dim]
        attention_mask: [B, T_tab] (1=valid tab token, 0=masked tab token). Does NOT include vision.
        vision_features: None or [B, T_img, vision_dim]
        vision_mask: None or [B, T_img] (1=valid vision token, 0=masked). Required if vision_features is not None.
        returns: updated x_tab [B, T_tab, tabular_dim]
        """
        if x_tab.dim() != 3:
            raise ValueError(f"x_tab must be [B,T,D], got {tuple(x_tab.shape)}")
        if attention_mask.dim() != 2:
            raise ValueError(f"attention_mask must be [B,T_tab], got {tuple(attention_mask.shape)}")

        B, T_tab, D = x_tab.shape
        if D != self.tabular_dim:
            raise ValueError(f"tabular_dim mismatch: got {D}, expected {self.tabular_dim}")
        if attention_mask.shape != (B, T_tab):
            raise ValueError("attention_mask shape mismatch with x_tab")
        if attention_mask.device != x_tab.device:
            attention_mask = attention_mask.to(device=x_tab.device)

        # ---- Attention block (pre-norm)
        h = self.attn_norm(x_tab)

        q_tab = self.q_proj_tab(h)  # [B, T_tab, Hq*d]
        k_tab = self.k_proj_tab(h)  # [B, T_tab, Hkv*d]
        v_tab = self.v_proj_tab(h)  # [B, T_tab, Hkv*d]

        q = self._split_heads_q(q_tab)  # [B, Hq, T_tab, d]
        k_tab = self._split_heads_kv(k_tab)  # [B, Hkv, T_tab, d]
        v_tab = self._split_heads_kv(v_tab)  # [B, Hkv, T_tab, d]

        if vision_features is None:
            # Keys/values = tab only
            k = k_tab
            v = v_tab
            key_mask = attention_mask  # [B, T_tab]
        else:
            if vision_features.dim() != 3:
                raise ValueError(f"vision_features must be [B,T_img,Dv], got {tuple(vision_features.shape)}")
            if vision_features.shape[0] != B:
                raise ValueError("vision_features batch mismatch")
            if vision_features.shape[2] != self.vision_dim:
                raise ValueError(f"vision_dim mismatch: got {vision_features.shape[2]}, expected {self.vision_dim}")

            # Require vision_mask for strict missing handling
            if vision_mask is None:
                raise ValueError("vision_mask must be provided when vision_features is not None")
            if vision_mask.dim() != 2:
                raise ValueError(f"vision_mask must be [B,T_img], got {tuple(vision_mask.shape)}")

            T_img = vision_features.shape[1]
            if vision_mask.shape != (B, T_img):
                raise ValueError(f"vision_mask shape mismatch: expected {(B, T_img)}, got {tuple(vision_mask.shape)}")

            # Ensure mask dtype matches attention_mask dtype for concatenation
            if vision_mask.dtype != attention_mask.dtype:
                vision_mask = vision_mask.to(dtype=attention_mask.dtype)
            if vision_mask.device != attention_mask.device:
                vision_mask = vision_mask.to(device=attention_mask.device)

            param = self.k_proj_img.weight
            vision_features = vision_features.to(device=param.device, dtype=param.dtype)
            k_img = self.k_proj_img(vision_features)  # [B, T_img, Hkv*d]
            v_img = self.v_proj_img(vision_features)  # [B, T_img, Hkv*d]
            k_img = self._split_heads_kv(k_img)  # [B, Hkv, T_img, d]
            v_img = self._split_heads_kv(v_img)  # [B, Hkv, T_img, d]

            k = torch.cat([k_tab, k_img], dim=2)  # [B, Hkv, T_tab+T_img, d]
            v = torch.cat([v_tab, v_img], dim=2)  # [B, Hkv, T_tab+T_img, d]

            # STRICT key mask: tab_mask + vision_mask
            key_mask = torch.cat([attention_mask, vision_mask], dim=1)  # [B, T_tab+T_img]

        # Expand KV heads to Q heads (GQA)
        if self.group_size != 1:
            k = k.repeat_interleave(self.group_size, dim=1)  # [B, Hq, T_k, d]
            v = v.repeat_interleave(self.group_size, dim=1)  # [B, Hq, T_k, d]

        T_k = k.shape[2]
        key_bias = self._make_key_bias_from_mask(key_mask, key_len=T_k)  # [B,1,1,T_k]

        # Attention scores: [B, Hq, T_tab, T_k]
        scale = 1.0 / math.sqrt(self.head_dim)
        attn_scores = torch.einsum("bhtd,bhkd->bhtk", q, k) * scale
        attn_scores = attn_scores + key_bias  # broadcast

        attn_probs = F.softmax(attn_scores.float(), dim=-1)
        if self.attn_dropout > 0.0 and self.training:
            attn_probs = F.dropout(attn_probs, p=self.attn_dropout)
        attn_probs = attn_probs.to(v.dtype)

        attn_out = torch.einsum("bhtk,bhkd->bhtd", attn_probs, v)  # [B,Hq,T_tab,d]
        attn_out = self._merge_heads_q(attn_out)  # [B,T_tab,Hq*d]
        attn_out = self.o_proj(attn_out)  # [B,T_tab,tab_dim]

        # Query-side masking (tab only): prevents masked tab tokens from updating residual path
        attn_out = attn_out * attention_mask.to(attn_out.dtype).unsqueeze(-1)

        if self.resid_dropout > 0.0 and self.training:
            attn_out = F.dropout(attn_out, p=self.resid_dropout)

        x = x_tab + attn_out

        # ---- FFN block (pre-norm)
        h2 = self.ffn_norm(x)
        gate = self.gate_proj(h2)
        up = self.up_proj(h2)
        f = self.act(gate, up)
        f = self.down_proj(f)

        # Query-side masking (tab only)
        f = f * attention_mask.to(f.dtype).unsqueeze(-1)

        if self.resid_dropout > 0.0 and self.training:
            f = F.dropout(f, p=self.resid_dropout)

        x = x + f
        return x
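
# Added sketch (illustrative only): how GQA shares KV heads across query heads.
# With Hq=8 and Hkv=2 each KV head serves group_size = 8 // 2 = 4 query heads;
# repeat_interleave(group_size, dim=1) materializes that sharing.
def _example_gqa_expand(num_query_heads: int = 8, num_kv_heads: int = 2):
    group_size = num_query_heads // num_kv_heads
    k = torch.randn(1, num_kv_heads, 5, 16)  # [B, Hkv, T_k, d]
    k_expanded = k.repeat_interleave(group_size, dim=1)  # [B, Hq, T_k, d]
    return k_expanded.shape  # torch.Size([1, 8, 5, 16])
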
def _count_params(m: nn.Module) -> Tuple[int, int]:
    total = sum(p.numel() for p in m.parameters())
    trainable = sum(p.numel() for p in m.parameters() if p.requires_grad)
    return total, trainable


def _demo_main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--t_tab", type=int, default=126)
    parser.add_argument("--t_img", type=int, default=256)
    parser.add_argument("--tabular_dim", type=int, default=768)
    parser.add_argument("--vision_dim", type=int, default=768)
    parser.add_argument("--num_query_heads", type=int, default=8)
    parser.add_argument("--num_kv_heads", type=int, default=2)
    parser.add_argument("--head_dim", type=int, default=128)
    parser.add_argument("--mlp_ratio", type=float, default=1.5)
    parser.add_argument("--dropout", type=float, default=0.0)
    parser.add_argument("--with_vision", action="store_true")
    parser.add_argument("--dtype", type=str, default="float32", choices=["float16", "bfloat16", "float32"])
    parser.add_argument("--device", type=str, default=None)
    args = parser.parse_args()

    device = torch.device(args.device or ("cuda" if torch.cuda.is_available() else "cpu"))
    dtype_map = {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}
    dtype = dtype_map[args.dtype]

    layer = TabularImageGQALayer(
        tabular_dim=args.tabular_dim,
        vision_dim=args.vision_dim,
        num_query_heads=args.num_query_heads,
        num_kv_heads=args.num_kv_heads,
        head_dim=args.head_dim,
        mlp_ratio=args.mlp_ratio,
        dropout=args.dropout,
    ).to(device=device, dtype=dtype)

    total, trainable = _count_params(layer)
    print(f"Layer parameters: {total:,} (trainable: {trainable:,})")

    B = args.batch_size
    T_tab = args.t_tab

    x_tab = torch.randn(B, T_tab, args.tabular_dim, device=device, dtype=dtype)

    # Build a typical HF-style 1D attention mask: 1 for valid, 0 for masked/padded.
    # Here we create variable valid lengths.
    lengths = torch.randint(low=max(1, T_tab // 2), high=T_tab + 1, size=(B,), device=device)
    attention_mask = torch.zeros(B, T_tab, device=device, dtype=torch.long)
    for b in range(B):
        attention_mask[b, : int(lengths[b].item())] = 1

    if args.with_vision:
        vision = torch.randn(B, args.t_img, args.vision_dim, device=device, dtype=dtype)

        # Example vision mask: first half valid for sample 0, all valid for others
        vision_mask = torch.ones(B, args.t_img, device=device, dtype=torch.long)
        if args.t_img > 0:
            vision_mask[0, args.t_img // 2:] = 0
    else:
        vision = None
        vision_mask = None

    print("Input x_tab:", tuple(x_tab.shape), x_tab.dtype, x_tab.device)
    print("Input attention_mask:", tuple(attention_mask.shape), attention_mask.dtype, attention_mask.device)
    print("Input vision_features:", None if vision is None else (tuple(vision.shape), vision.dtype, vision.device))
    print("Input vision_mask:",
          None if vision_mask is None else (tuple(vision_mask.shape), vision_mask.dtype, vision_mask.device))

    with torch.no_grad():
        y = layer(
            x_tab=x_tab,
            attention_mask=attention_mask,
            vision_features=vision,
            vision_mask=vision_mask,
        )

    print("Output y_tab:", tuple(y.shape), y.dtype, y.device)


if __name__ == "__main__":
    _demo_main()
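Because layer.py has no package-relative imports, its `_demo_main` smoke test can be run directly from the repository root; for example (flags as defined in `_demo_main` above, values illustrative):

    python modelling/layer.py --with_vision --dtype bfloat16 --t_tab 126 --t_img 256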
modelling/loader.py
ADDED
@@ -0,0 +1,1025 @@
# loader.py
# -*- coding: utf-8 -*-

import ast
from io import BytesIO
from urllib.parse import urljoin

import pandas as pd
import requests
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from utils import load_json


class CenterSquareCrop:
    """
    Crop image to a centered square without resizing.
    """

    def __call__(self, img: Image.Image):
        w, h = img.size

        if w == h:
            return img

        if w > h:
            left = (w - h) // 2
            right = left + h
            top = 0
            bottom = h
        else:
            top = (h - w) // 2
            bottom = top + w
            left = 0
            right = w
        return img.crop((left, top, right, bottom))


def build_image_transform(image_size: int):
    return transforms.Compose([
        CenterSquareCrop(),
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
    ])
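
# Added sketch (illustrative only): the transform crops to a centered square,
# resizes, and converts to a float tensor in [0, 1].
def _example_image_transform():
    transform = build_image_transform(image_size=512)
    img = Image.new("RGB", (800, 600))  # dummy non-square image
    pixel_values = transform(img)
    return pixel_values.shape  # torch.Size([3, 512, 512])
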
def join_photo_root(photo_root: str, relative_path: str) -> str:
    """
    Join photo_root and relative path.

    Supports:
    - local filesystem roots
    - http / https roots
    """
    if photo_root.startswith("http://") or photo_root.startswith("https://"):  # noqa
        return urljoin(photo_root.rstrip("/") + "/", relative_path)

    return photo_root.rstrip("/") + "/" + relative_path.lstrip("/")
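
# Added examples (illustrative only):
#   join_photo_root("https://host/photos", "a/b.jpg") -> "https://host/photos/a/b.jpg"
#   join_photo_root("/data/photos/", "/a/b.jpg")      -> "/data/photos/a/b.jpg"
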
def parse_numeric_cell(value: str, n_in: int):
    """
    Convert numeric csv cell to list[float].

    Returns:
        values, is_valid

    Data assumption:
    - Empty value is always ""
    - Scalar numeric -> "12.3"
    - Vector numeric -> "[1.2,3.4,5.6]"
    """
    if value == "":
        return [0.0] * n_in, False

    if n_in == 1:
        return [float(value)], True

    vec = ast.literal_eval(value)
    if len(vec) != n_in:
        raise ValueError(f"Numeric vector length mismatch: expected {n_in}, got {len(vec)}")
    return [float(v) for v in vec], True
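
# Added examples (illustrative only), following the cell conventions above:
#   parse_numeric_cell("", 3)          -> ([0.0, 0.0, 0.0], False)
#   parse_numeric_cell("12.3", 1)      -> ([12.3], True)
#   parse_numeric_cell("[1.2,3.4]", 2) -> ([1.2, 3.4], True)
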
class SoilFormerDataset(Dataset):

    def __init__(
        self,
        csv_path: str,
        photo_map_path: str,
        cat_vocab_path: str,
        numeric_vocab_path: str,
        numeric_stats_path: str,
        photo_root: str,
        image_size: int = 512,
        id_column: str = "id",
    ):
        self.df = pd.read_csv(
            csv_path,
            keep_default_na=False,
            na_filter=False,
            low_memory=False,
        )

        self.photo_map = load_json(photo_map_path)
        self.cat_vocab = load_json(cat_vocab_path)
        self.numeric_vocab = load_json(numeric_vocab_path)

        self.photo_root = photo_root
        self.id_column = id_column
        self.image_size = int(image_size)
        self.image_transform = build_image_transform(self.image_size)

        # Keep json order exactly
        self.cat_columns = list(self.cat_vocab.keys())
        self.numeric_groups = self.numeric_vocab["groups"]
        self.numeric_stats_df = pd.read_csv(numeric_stats_path)
        self.numeric_stats_index = self.numeric_stats_df.set_index("column")

        # Numeric mean/std
        self.numeric_stats = {}
        for _, row in self.numeric_stats_df.iterrows():
            col = row["column"]
            mean = float(row["mean"])
            std = float(row["std"])
            if std == 0.0:
                std = 1.0
            self.numeric_stats[col] = (mean, std)

        # For active masking
        self.cat_mask_local_ids = torch.tensor(
            [int(self.cat_vocab[col]["mask_local_id"]) for col in self.cat_columns],
            dtype=torch.long,
        )

    def __len__(self):
        return len(self.df)

    def load_image(self, path: str):
        if path.startswith("http://") or path.startswith("https://"):  # noqa
            resp = requests.get(path, timeout=(3, 10))
            resp.raise_for_status()
            img = Image.open(BytesIO(resp.content)).convert("RGB")
        else:
            img = Image.open(path).convert("RGB")

        return self.image_transform(img)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sample_id = row[self.id_column]

        # -----------------------
        # categorical features
        # -----------------------
        cat_ids = []
        cat_valids = []

        for col in self.cat_columns:
            spec = self.cat_vocab[col]
            label2id = spec["label2id"]
            mask_id = spec["mask_local_id"]

            value = row[col]

            if value == "":
                cat_ids.append(mask_id)
                cat_valids.append(False)
            else:
                if value not in label2id:
                    raise KeyError(f"Unknown categorical value: column={col}, value={value!r}")
                cat_ids.append(label2id[value])
                cat_valids.append(True)

        cat_ids = torch.tensor(cat_ids, dtype=torch.long)
        cat_valids = torch.tensor(cat_valids, dtype=torch.bool)

        # -----------------------
        # numeric features
        # -----------------------
        numeric_values_by_nin = {}
        numeric_valid_positions_by_nin = {}

        for group in self.numeric_groups:
            n_in = int(group["n_in"])
            features = group["feature_names"]

            values = []
            valids = []

            for feat in features:
                cell = row[feat]
                parsed, is_valid = parse_numeric_cell(cell, n_in)
                if is_valid:
                    mean, std = self.numeric_stats[feat]
                    parsed = [(v - mean) / std for v in parsed]
                values.append(parsed)
                valids.append(is_valid)

            numeric_values_by_nin[n_in] = torch.tensor(values, dtype=torch.float32)
            numeric_valid_positions_by_nin[n_in] = torch.tensor(valids, dtype=torch.bool)

        # -----------------------
        # vision
        # -----------------------
        try:
            relative_path = self.photo_map[sample_id]
            full_path = join_photo_root(self.photo_root, relative_path)
            image = self.load_image(full_path)
            vision_valid = True
        except Exception:  # noqa
            image = torch.zeros(3, self.image_size, self.image_size, dtype=torch.float32)
            vision_valid = False

        vision_valid = torch.tensor(vision_valid, dtype=torch.bool)

        return {
            "row_idx": torch.tensor(idx, dtype=torch.long),
            "sample_id": sample_id,
            "cat_local_ids": cat_ids,
            "cat_valid_positions": cat_valids,
            "numeric_values_by_nin": numeric_values_by_nin,
            "numeric_valid_positions_by_nin": numeric_valid_positions_by_nin,
            "pixel_values": image,
            "vision_valid_positions": vision_valid,
        }

    @staticmethod
    def collate_fn(batch):
        cat_ids = torch.stack([b["cat_local_ids"] for b in batch], dim=0)
        cat_valids = torch.stack([b["cat_valid_positions"] for b in batch], dim=0)

        group_keys = list(batch[0]["numeric_values_by_nin"].keys())

        numeric_values_by_nin = {}
        numeric_valid_positions_by_nin = {}

        for k in group_keys:
            numeric_values_by_nin[k] = torch.stack(
                [b["numeric_values_by_nin"][k] for b in batch],
                dim=0,
            )
            numeric_valid_positions_by_nin[k] = torch.stack(
                [b["numeric_valid_positions_by_nin"][k] for b in batch],
                dim=0,
            )

        pixel_values = torch.stack([b["pixel_values"] for b in batch], dim=0)
        vision_valid_positions = torch.stack([b["vision_valid_positions"] for b in batch], dim=0)
        row_idx = torch.stack([b["row_idx"] for b in batch], dim=0)
        sample_ids = [b["sample_id"] for b in batch]

        return {
            "row_idx": row_idx,
            "sample_id": sample_ids,
            "cat_local_ids": cat_ids,
            "numeric_values_by_nin": numeric_values_by_nin,
            "cat_valid_positions": cat_valids,
            "numeric_valid_positions_by_nin": numeric_valid_positions_by_nin,
            "pixel_values": pixel_values,
            "vision_valid_positions": vision_valid_positions,
        }

    def perform_active_mask(self, batch, cat_ratio=0.15, num_ratio=0.15, seed=None):
        """
        Apply active masking to categorical and numeric inputs.

        Conventions
        -----------
        Input batch must contain:
        - cat_local_ids: [B, M] LongTensor
        - cat_valid_positions: [B, M] Bool/0-1 tensor
        - numeric_values_by_nin: Dict[int, Tensor[B, V, n_in]]
        - numeric_valid_positions_by_nin: Dict[int, Tensor[B, V]]

        Output batch will additionally contain:
        - original_cat_local_ids
        - original_cat_valid_positions
        - original_numeric_values_by_nin
        - original_numeric_valid_positions_by_nin

        - masked_cat_local_ids
        - masked_cat_valid_positions
        - masked_numeric_values_by_nin
        - masked_numeric_valid_positions_by_nin

        - cat_loss_mask: [B, M] BoolTensor
        - numeric_loss_mask_by_nin: Dict[int, BoolTensor[B, V]]

        Semantics
        ---------
        - Only originally valid positions can be actively masked.
        - Masked categorical positions:
              local_id -> self.cat_mask_local_ids[col]
              valid -> False
        - Masked numeric positions:
              values -> 0
              valid -> False
        - original_* fields always preserve the unmodified input batch content.
        """
        # --------------------------------------------------
        # Validate ratios
        # --------------------------------------------------
        if not (0.0 <= cat_ratio <= 1.0):
            raise ValueError(f"cat_ratio must be in [0, 1], got {cat_ratio}")
        if not (0.0 <= num_ratio <= 1.0):
            raise ValueError(f"num_ratio must be in [0, 1], got {num_ratio}")

        # --------------------------------------------------
        # Validate required keys
        # --------------------------------------------------
        required_keys = [
            "cat_local_ids",
            "cat_valid_positions",
            "numeric_values_by_nin",
            "numeric_valid_positions_by_nin",
        ]
        for k in required_keys:
            if k not in batch:
                raise KeyError(f"Missing key in batch: {k}")

        cat_local_ids = batch["cat_local_ids"]
        cat_valid_positions = batch["cat_valid_positions"]
        numeric_values_by_nin = batch["numeric_values_by_nin"]
        numeric_valid_positions_by_nin = batch["numeric_valid_positions_by_nin"]

        if cat_local_ids.dim() != 2:
            raise ValueError(f"cat_local_ids must be [B, M], got {tuple(cat_local_ids.shape)}")
        if cat_valid_positions.shape != cat_local_ids.shape:
            raise ValueError(
                f"cat_valid_positions must match cat_local_ids shape, got "
                f"{tuple(cat_valid_positions.shape)} vs {tuple(cat_local_ids.shape)}"
            )

        if not isinstance(numeric_values_by_nin, dict):
            raise ValueError("numeric_values_by_nin must be a dict")
        if not isinstance(numeric_valid_positions_by_nin, dict):
            raise ValueError("numeric_valid_positions_by_nin must be a dict")

        B, M = cat_local_ids.shape
        device = cat_local_ids.device

        if self.cat_mask_local_ids.dim() != 1 or self.cat_mask_local_ids.numel() != M:
            raise ValueError(
                f"self.cat_mask_local_ids must be [M] with M={M}, got {tuple(self.cat_mask_local_ids.shape)}"
            )
        cat_mask_local_ids = self.cat_mask_local_ids.to(device=device, dtype=cat_local_ids.dtype)

        # --------------------------------------------------
        # Random generator
        # --------------------------------------------------
        if device.type == "cuda":
            generator = torch.Generator(device=device)
        else:
            generator = torch.Generator()

        if seed is not None:
            generator.manual_seed(seed)

        # --------------------------------------------------
        # Start from shallow copy only
        # --------------------------------------------------
        masked_batch = dict(batch)

        # Preserve original aliases (do NOT deepcopy)
        masked_batch["original_cat_local_ids"] = batch["cat_local_ids"]
        masked_batch["original_cat_valid_positions"] = batch["cat_valid_positions"]
        masked_batch["original_numeric_values_by_nin"] = batch["numeric_values_by_nin"]
        masked_batch["original_numeric_valid_positions_by_nin"] = batch["numeric_valid_positions_by_nin"]

        # --------------------------------------------------
        # Fast path: no active masking at all
        # --------------------------------------------------
        if cat_ratio == 0.0 and num_ratio == 0.0:
            masked_batch["masked_cat_local_ids"] = batch["cat_local_ids"]
            masked_batch["masked_cat_valid_positions"] = batch["cat_valid_positions"]

            masked_batch["masked_numeric_values_by_nin"] = batch["numeric_values_by_nin"]
            masked_batch["masked_numeric_valid_positions_by_nin"] = batch["numeric_valid_positions_by_nin"]

            masked_batch["cat_loss_mask"] = torch.zeros(
                (B, M), dtype=torch.bool, device=device
            )
            masked_batch["numeric_loss_mask_by_nin"] = {
                n_in: torch.zeros_like(valid_positions, dtype=torch.bool)
                for n_in, valid_positions in numeric_valid_positions_by_nin.items()
            }
            return masked_batch

        # --------------------------------------------------
        # Categorical masking
        # --------------------------------------------------
        original_cat_valid_positions = cat_valid_positions.bool()

        masked_cat_local_ids = cat_local_ids.clone()
        masked_cat_valid_positions = original_cat_valid_positions.clone()
        cat_loss_mask = torch.zeros((B, M), dtype=torch.bool, device=device)

        if cat_ratio > 0.0:
            for b in range(B):
                valid_idx = torch.nonzero(original_cat_valid_positions[b], as_tuple=False).squeeze(1)
                n_valid = valid_idx.numel()
                if n_valid == 0:
                    continue

                k = int(round(n_valid * cat_ratio))
                if k <= 0:
                    continue
                if k > n_valid:
                    k = n_valid

                perm = valid_idx[
                    torch.randperm(n_valid, generator=generator, device=device)[:k]
                ]
                cat_loss_mask[b, perm] = True

            expanded_cat_mask_ids = cat_mask_local_ids.view(1, M).expand(B, M)
            masked_cat_local_ids[cat_loss_mask] = expanded_cat_mask_ids[cat_loss_mask]
            masked_cat_valid_positions = masked_cat_valid_positions & (~cat_loss_mask)

        masked_batch["masked_cat_local_ids"] = masked_cat_local_ids
        masked_batch["masked_cat_valid_positions"] = masked_cat_valid_positions
        masked_batch["cat_loss_mask"] = cat_loss_mask

        # --------------------------------------------------
        # Numeric masking
        # --------------------------------------------------
        masked_numeric_values_by_nin = {}
        masked_numeric_valid_positions_by_nin = {}
        numeric_loss_mask_by_nin = {}

        # keep deterministic ordering if caller passed mixed int-like keys
        for n_in in sorted(numeric_values_by_nin.keys(), key=int):
            values = numeric_values_by_nin[n_in]
            if n_in not in numeric_valid_positions_by_nin:
                raise KeyError(f"Missing numeric_valid_positions_by_nin[{n_in}]")

            valid_positions = numeric_valid_positions_by_nin[n_in]

            if values.dim() != 3:
                raise ValueError(
                    f"numeric_values_by_nin[{n_in}] must be [B, V, n_in], got {tuple(values.shape)}"
                )

            Bn, V, Nin = values.shape
            if Bn != B:
                raise ValueError(
                    f"numeric_values_by_nin[{n_in}] batch mismatch: got {Bn}, expected {B}"
                )
            if int(Nin) != int(n_in):
                raise ValueError(
                    f"numeric_values_by_nin[{n_in}] last dim mismatch: got {Nin}, expected {n_in}"
                )
            if valid_positions.shape != (B, V):
                raise ValueError(
                    f"numeric_valid_positions_by_nin[{n_in}] must be [B,V]=({B},{V}), "
                    f"got {tuple(valid_positions.shape)}"
                )

            original_valid = valid_positions.bool()

            # IMPORTANT: clone before modifying
            masked_values = values.clone()
            masked_valid_positions = original_valid.clone()
            num_loss_mask = torch.zeros((B, V), dtype=torch.bool, device=values.device)

            if num_ratio > 0.0:
                for b in range(B):
                    valid_idx = torch.nonzero(original_valid[b], as_tuple=False).squeeze(1)
                    n_valid = valid_idx.numel()
                    if n_valid == 0:
                        continue

                    k = int(round(n_valid * num_ratio))
                    if k <= 0:
                        continue
                    if k > n_valid:
                        k = n_valid

                    perm = valid_idx[
                        torch.randperm(n_valid, generator=generator, device=values.device)[:k]
                    ]
                    num_loss_mask[b, perm] = True

                # masked numeric columns become zero and invalid
                masked_values[num_loss_mask] = 0.0
                masked_valid_positions = masked_valid_positions & (~num_loss_mask)

            masked_numeric_values_by_nin[n_in] = masked_values
            masked_numeric_valid_positions_by_nin[n_in] = masked_valid_positions
            numeric_loss_mask_by_nin[n_in] = num_loss_mask

        masked_batch["masked_numeric_values_by_nin"] = masked_numeric_values_by_nin
        masked_batch["masked_numeric_valid_positions_by_nin"] = masked_numeric_valid_positions_by_nin
        masked_batch["numeric_loss_mask_by_nin"] = numeric_loss_mask_by_nin

        return masked_batch
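
    # Added usage sketch (illustrative only): a typical pretraining step masks
    # 15% of the originally valid categorical and numeric positions, then
    # computes losses only where cat_loss_mask / numeric_loss_mask_by_nin are True:
    #
    #     masked = dataset.perform_active_mask(batch, cat_ratio=0.15, num_ratio=0.15, seed=0)
    #     inputs = masked["masked_cat_local_ids"], masked["masked_numeric_values_by_nin"]
    #     targets_at = masked["cat_loss_mask"], masked["numeric_loss_mask_by_nin"]
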
    def perform_active_mask_single(self, batch, feature_name, assert_not_missing=True):
        """
        Actively mask exactly one feature specified by feature_name.

        Parameters
        ----------
        batch : dict
            Same input convention as perform_active_mask(...).
        feature_name : str
            Full feature name. Can be either categorical or numeric.
        assert_not_missing : bool
            If True, require the target feature to be originally valid for all samples
            in the batch. Otherwise raise ValueError.
            If False, only originally valid positions are masked; naturally missing
            positions remain missing and are not included in the loss mask.

        Returns
        -------
        masked_batch : dict
            Same output convention as perform_active_mask(...), except that exactly
            one feature is actively masked.
        """

        # --------------------------------------------------
        # Validate required keys
        # --------------------------------------------------
        required_keys = [
            "cat_local_ids",
            "cat_valid_positions",
            "numeric_values_by_nin",
            "numeric_valid_positions_by_nin",
        ]
        for k in required_keys:
            if k not in batch:
                raise KeyError(f"Missing key in batch: {k}")

        cat_local_ids = batch["cat_local_ids"]
        cat_valid_positions = batch["cat_valid_positions"]
        numeric_values_by_nin = batch["numeric_values_by_nin"]
        numeric_valid_positions_by_nin = batch["numeric_valid_positions_by_nin"]

        if cat_local_ids.dim() != 2:
            raise ValueError(f"cat_local_ids must be [B, M], got {tuple(cat_local_ids.shape)}")
        if cat_valid_positions.shape != cat_local_ids.shape:
            raise ValueError(
                f"cat_valid_positions must match cat_local_ids shape, got "
                f"{tuple(cat_valid_positions.shape)} vs {tuple(cat_local_ids.shape)}"
            )

        if not isinstance(numeric_values_by_nin, dict):
            raise ValueError("numeric_values_by_nin must be a dict")
        if not isinstance(numeric_valid_positions_by_nin, dict):
            raise ValueError("numeric_valid_positions_by_nin must be a dict")

        B, M = cat_local_ids.shape
        device = cat_local_ids.device

        if self.cat_mask_local_ids.dim() != 1 or self.cat_mask_local_ids.numel() != M:
            raise ValueError(
                f"self.cat_mask_local_ids must be [M] with M={M}, got {tuple(self.cat_mask_local_ids.shape)}"
            )
        cat_mask_local_ids = self.cat_mask_local_ids.to(device=device, dtype=cat_local_ids.dtype)

        # --------------------------------------------------
        # Resolve feature_name -> categorical col or numeric (n_in, v_idx)
        # --------------------------------------------------
        # Assumptions:
        # - self.cat_vocab is the categorical vocab dict keyed by full feature name
        # - self.numeric_vocab contains:
        #       numeric_vocab["ordered_feature_names"]
        #       numeric_vocab["features"][name]["n_in"]
        #       numeric_vocab["features"][name]["col_id"]
        #
        # If your actual attribute names differ, only this block needs adaptation.
        is_cat = False
        is_num = False
        cat_col = None
        num_n_in = None
        num_v_idx = None

        # categorical
        if hasattr(self, "cat_vocab") and feature_name in self.cat_vocab:
            is_cat = True
            cat_col = int(self.cat_vocab[feature_name]["col_id"])

        # numeric
        if hasattr(self, "numeric_vocab"):
            num_features = self.numeric_vocab.get("features", {})
            if feature_name in num_features:
                is_num = True
                meta = num_features[feature_name]
                num_n_in = int(meta["n_in"])
                num_v_idx = int(meta["col_id"])

        if is_cat and is_num:
            raise ValueError(f"Feature name appears in both categorical and numeric vocab: {feature_name}")
        if not is_cat and not is_num:
            raise KeyError(f"Unknown feature_name: {feature_name}")

        # --------------------------------------------------
        # Start from shallow copy only
        # --------------------------------------------------
        masked_batch = dict(batch)

        # Preserve original aliases (do NOT deepcopy)
        masked_batch["original_cat_local_ids"] = batch["cat_local_ids"]
        masked_batch["original_cat_valid_positions"] = batch["cat_valid_positions"]
        masked_batch["original_numeric_values_by_nin"] = batch["numeric_values_by_nin"]
        masked_batch["original_numeric_valid_positions_by_nin"] = batch["numeric_valid_positions_by_nin"]

        # --------------------------------------------------
        # Default: no masking anywhere
        # --------------------------------------------------
        masked_cat_local_ids = batch["cat_local_ids"].clone()
        masked_cat_valid_positions = batch["cat_valid_positions"].bool().clone()
        cat_loss_mask = torch.zeros((B, M), dtype=torch.bool, device=device)

        masked_numeric_values_by_nin = {}
        masked_numeric_valid_positions_by_nin = {}
        numeric_loss_mask_by_nin = {}

        for n_in in sorted(numeric_values_by_nin.keys(), key=int):
            values = numeric_values_by_nin[n_in]
            if n_in not in numeric_valid_positions_by_nin:
                raise KeyError(f"Missing numeric_valid_positions_by_nin[{n_in}]")

            valid_positions = numeric_valid_positions_by_nin[n_in]

            if values.dim() != 3:
                raise ValueError(
                    f"numeric_values_by_nin[{n_in}] must be [B, V, n_in], got {tuple(values.shape)}"
                )

            Bn, V, Nin = values.shape
            if Bn != B:
                raise ValueError(
                    f"numeric_values_by_nin[{n_in}] batch mismatch: got {Bn}, expected {B}"
                )
            if int(Nin) != int(n_in):
                raise ValueError(
                    f"numeric_values_by_nin[{n_in}] last dim mismatch: got {Nin}, expected {n_in}"
                )
            if valid_positions.shape != (B, V):
                raise ValueError(
                    f"numeric_valid_positions_by_nin[{n_in}] must be [B,V]=({B},{V}), "
                    f"got {tuple(valid_positions.shape)}"
                )

            masked_numeric_values_by_nin[n_in] = values.clone()
            masked_numeric_valid_positions_by_nin[n_in] = valid_positions.bool().clone()
            numeric_loss_mask_by_nin[n_in] = torch.zeros((B, V), dtype=torch.bool, device=values.device)

        # --------------------------------------------------
        # Apply single-feature masking
        # --------------------------------------------------
        if is_cat:
            original_valid = cat_valid_positions[:, cat_col].bool()  # [B]

            if assert_not_missing and not bool(original_valid.all().item()):
                n_bad = int((~original_valid).sum().item())
                raise ValueError(
                    f"Categorical feature '{feature_name}' has {n_bad} naturally missing samples in batch"
                )

            # only originally valid positions are actively masked
            cat_loss_mask[:, cat_col] = original_valid

            masked_cat_local_ids[cat_loss_mask] = cat_mask_local_ids.view(1, M).expand(B, M)[cat_loss_mask]
            masked_cat_valid_positions = masked_cat_valid_positions & (~cat_loss_mask)

        else:
            if num_n_in not in masked_numeric_values_by_nin:
                raise KeyError(f"numeric_values_by_nin does not contain n_in={num_n_in} for {feature_name}")

            values = masked_numeric_values_by_nin[num_n_in]
            valid_positions = masked_numeric_valid_positions_by_nin[num_n_in]
            num_loss_mask = numeric_loss_mask_by_nin[num_n_in]

            if num_v_idx >= values.shape[1]:
                raise IndexError(
                    f"Numeric feature '{feature_name}' resolved to v_idx={num_v_idx}, "
                    f"but numeric_values_by_nin[{num_n_in}] has V={values.shape[1]}"
                )

            original_valid = valid_positions[:, num_v_idx].bool()  # [B]

            if assert_not_missing and not bool(original_valid.all().item()):
                n_bad = int((~original_valid).sum().item())
                raise ValueError(
                    f"Numeric feature '{feature_name}' has {n_bad} naturally missing samples in batch"
                )

            # only originally valid positions are actively masked
            num_loss_mask[:, num_v_idx] = original_valid

            values[num_loss_mask] = 0.0
            valid_positions[:] = valid_positions & (~num_loss_mask)

        # --------------------------------------------------
        # Finalize outputs
        # --------------------------------------------------
        masked_batch["masked_cat_local_ids"] = masked_cat_local_ids
        masked_batch["masked_cat_valid_positions"] = masked_cat_valid_positions
        masked_batch["cat_loss_mask"] = cat_loss_mask

        masked_batch["masked_numeric_values_by_nin"] = masked_numeric_values_by_nin
        masked_batch["masked_numeric_valid_positions_by_nin"] = masked_numeric_valid_positions_by_nin
        masked_batch["numeric_loss_mask_by_nin"] = numeric_loss_mask_by_nin

        return masked_batch
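
# Added sketch (illustrative only): leave-one-feature-out masking, as used for
# per-feature evaluation. Exactly one feature is hidden across the whole batch;
# the returned loss masks mark the positions the model should reconstruct.
def _example_single_feature_mask(dataset, batch, feature_name):
    masked = dataset.perform_active_mask_single(
        batch, feature_name=feature_name, assert_not_missing=False
    )
    return masked["cat_loss_mask"], masked["numeric_loss_mask_by_nin"]
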
+
def build_train_eval_dataloaders(
|
| 716 |
+
dataset,
|
| 717 |
+
train_ratio=0.8,
|
| 718 |
+
seed=42,
|
| 719 |
+
batch_size=32,
|
| 720 |
+
):
|
| 721 |
+
n = len(dataset)
|
| 722 |
+
|
| 723 |
+
n_train = int(n * train_ratio)
|
| 724 |
+
n_eval = n - n_train
|
| 725 |
+
|
| 726 |
+
split_generator = torch.Generator().manual_seed(seed)
|
| 727 |
+
|
| 728 |
+
train_ds, eval_ds = torch.utils.data.random_split(
|
| 729 |
+
dataset,
|
| 730 |
+
[n_train, n_eval],
|
| 731 |
+
generator=split_generator
|
| 732 |
+
)
|
| 733 |
+
|
| 734 |
+
train_generator = torch.Generator()
|
| 735 |
+
|
| 736 |
+
train_loader = DataLoader(
|
| 737 |
+
train_ds,
|
| 738 |
+
batch_size=batch_size,
|
| 739 |
+
shuffle=True,
|
| 740 |
+
collate_fn=dataset.collate_fn,
|
| 741 |
+
generator=train_generator,
|
| 742 |
+
)
|
| 743 |
+
|
| 744 |
+
eval_loader = DataLoader(
|
| 745 |
+
eval_ds,
|
| 746 |
+
batch_size=batch_size,
|
| 747 |
+
shuffle=False,
|
| 748 |
+
collate_fn=dataset.collate_fn,
|
| 749 |
+
)
|
| 750 |
+
|
| 751 |
+
return train_loader, eval_loader, train_generator
|
| 752 |
+
|
| 753 |
+
|
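
# Added usage sketch (illustrative only; the constructor paths are placeholders
# pointing at the files shipped under data/, and photo_root is hypothetical):
def _example_build_loaders():
    dataset = SoilFormerDataset(
        csv_path="data/tabular_data.csv",
        photo_map_path="data/photo_map.json",
        cat_vocab_path="data/cat_vocab.json",
        numeric_vocab_path="data/numeric_vocab.json",
        numeric_stats_path="data/tabular_meta_numeric_stats.csv",
        photo_root="photos",  # hypothetical local root
    )
    train_loader, eval_loader, train_generator = build_train_eval_dataloaders(
        dataset, train_ratio=0.8, seed=42, batch_size=32
    )
    return train_loader, eval_loader, train_generator
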
def debug_print_first_sample(dataset, batch, batch_pos=0):
    """
    Inspect one sample in a batch.

    This debug function checks masked_* fields against the original csv row.
    Positions in loss_mask are allowed to mismatch.

    Args:
        dataset: SoilFormerDataset
        batch: collated + optionally masked batch
        batch_pos: index inside the batch (not dataset row index)
    """
    import math

    def numeric_list_close(a, b, atol=1e-6, rtol=1e-5):
        if len(a) != len(b):
            return False
        for x, y in zip(a, b):
            if not math.isclose(float(x), float(y), rel_tol=rtol, abs_tol=atol):
                return False
        return True

    def normalize_numeric_list(feat_name, vals, is_valid):
        if not is_valid:
            return [0.0] * len(vals)

        stat_row = dataset.numeric_stats_index.loc[feat_name]
        mean = float(stat_row["mean"])
        std = float(stat_row["std"])
        if std == 0.0:
            std = 1.0

        return [(float(v) - mean) / std for v in vals]

    if "row_idx" not in batch:
        raise KeyError("batch must contain 'row_idx' for debug_print_first_sample")
    if "sample_id" not in batch:
        raise KeyError("batch must contain 'sample_id' for debug_print_first_sample")

    row_idx = int(batch["row_idx"][batch_pos].item())
    row = dataset.df.iloc[row_idx]
    sample_id = batch["sample_id"][batch_pos]

    print("\n====================================================")
    print("DEBUG SAMPLE")
    print("====================================================")
    print("batch_pos :", batch_pos)
    print("row_idx   :", row_idx)
    print("sample_id :", sample_id)

    # ====================================================
    # categorical
    # ====================================================
    print("\n[CATEGORICAL FEATURES]")

    cat_ids = batch["masked_cat_local_ids"][batch_pos]
    cat_valids = batch["masked_cat_valid_positions"][batch_pos]
    cat_loss_mask = batch.get("cat_loss_mask", None)
    if cat_loss_mask is not None:
        cat_loss_mask = cat_loss_mask[batch_pos]

    for i, col in enumerate(dataset.cat_columns):
        raw = row[col]
        raw_str = str(raw)

        got_id = int(cat_ids[i].item())
        got_valid = bool(cat_valids[i].item())

        spec = dataset.cat_vocab[col]
        label2id = spec["label2id"]
        mask_id = int(spec["mask_local_id"])

        if raw == "":
            expected_id = mask_id
            expected_valid = False
        else:
            expected_id = int(label2id[raw])
            expected_valid = True

        is_loss_position = False
        if cat_loss_mask is not None:
            is_loss_position = bool(cat_loss_mask[i].item())

        if is_loss_position:
            ok = True
        else:
            ok = (got_id == expected_id) and (got_valid == expected_valid)

        print(
            f"{i:03d} | {col} | "
            f"raw={raw_str:<60} | "
            f"id={got_id:<6} | expected={expected_id:<6} | "
            f"valid={got_valid} | exp_valid={expected_valid} | "
            f"loss_mask={is_loss_position} | ok={ok}"
        )

        if not ok:
            raise AssertionError(
                f"\nCategorical mismatch\n"
                f"batch_pos={batch_pos}\n"
                f"row_idx={row_idx}\n"
                f"feature={col}\n"
                f"raw={raw}\n"
                f"id={got_id}, expected={expected_id}\n"
                f"valid={got_valid}, expected={expected_valid}"
            )

    # ====================================================
    # numeric
    # ====================================================
    print("\n[NUMERIC FEATURES]")

    numeric_loss_mask_by_nin = batch.get("numeric_loss_mask_by_nin", None)

    for group in dataset.numeric_groups:
        n_in = int(group["n_in"])
        features = group["feature_names"]

        values = batch["masked_numeric_values_by_nin"][n_in][batch_pos]
        valids = batch["masked_numeric_valid_positions_by_nin"][n_in][batch_pos]

        if numeric_loss_mask_by_nin is not None:
            loss_mask = numeric_loss_mask_by_nin[n_in][batch_pos]
        else:
            loss_mask = None

        print(f"\nGroup n_in={n_in}")

        for i, feat in enumerate(features):
            raw = row[feat]
            raw_str = str(raw)

            parsed, expected_valid = parse_numeric_cell(raw, n_in)
            expected_norm = normalize_numeric_list(feat, parsed, expected_valid)

            tensor_val = values[i].tolist()
            got_valid = bool(valids[i].item())

            is_loss_position = False
            if loss_mask is not None:
                is_loss_position = bool(loss_mask[i].item())

            if is_loss_position:
                ok = True
            else:
                value_ok = numeric_list_close(tensor_val, expected_norm)
                valid_ok = (got_valid == expected_valid)
                ok = value_ok and valid_ok

            print(
                f"{i:03d} | {feat} | "
                f"raw={raw_str:<60} | "
                f"tensor={tensor_val} | expected_norm={expected_norm} | "
                f"valid={got_valid} | exp_valid={expected_valid} | "
                f"loss_mask={is_loss_position} | ok={ok}"
            )

            if not ok:
                raise AssertionError(
                    f"\nNumeric mismatch\n"
                    f"batch_pos={batch_pos}\n"
                    f"row_idx={row_idx}\n"
                    f"feature={feat}\n"
                    f"raw={raw}\n"
                    f"tensor={tensor_val}\n"
                    f"expected={parsed}\n"
                    f"valid={got_valid}, expected={expected_valid}"
                )

    # ====================================================
    # vision
    # ====================================================
    print("\n[VISION]")

    try:
        relative_path = dataset.photo_map[sample_id]
        expected_path = join_photo_root(dataset.photo_root, relative_path)

        # Use the same logic as __getitem__: valid only if image can actually be loaded
        _ = dataset.load_image(expected_path)
        expected_valid = True

    except Exception:  # noqa
        expected_path = None
        expected_valid = False

    got_valid = bool(batch["vision_valid_positions"][batch_pos].item())
    img_shape = tuple(batch["pixel_values"][batch_pos].shape)

    print("expected_path :", expected_path)
    print("vision_valid  :", got_valid)
    print("image_shape   :", img_shape)

    if got_valid != expected_valid:
        raise AssertionError(
            f"\nVision validity mismatch\n"
            f"batch_pos={batch_pos}\n"
            f"row_idx={row_idx}\n"
            f"expected={expected_valid}, got={got_valid}"
|
| 953 |
+
)
|
| 954 |
+
|
| 955 |
+
print("\n====================================================")
|
| 956 |
+
print("DEBUG CHECK PASSED")
|
| 957 |
+
print("====================================================\n")
|
| 958 |
+
|
| 959 |
+
|
| 960 |
+
def main():
|
| 961 |
+
dataset = SoilFormerDataset(
|
| 962 |
+
csv_path="data/tabular_data.csv",
|
| 963 |
+
photo_map_path="data/photo_map.json",
|
| 964 |
+
cat_vocab_path="data/cat_vocab.json",
|
| 965 |
+
numeric_vocab_path="data/numeric_vocab.json",
|
| 966 |
+
numeric_stats_path="data/tabular_meta_numeric_stats.csv",
|
| 967 |
+
photo_root="/Volumes/TOSHIBA EXT",
|
| 968 |
+
image_size=512,
|
| 969 |
+
id_column="id",
|
| 970 |
+
)
|
| 971 |
+
|
| 972 |
+
train_loader, eval_loader, train_generator = build_train_eval_dataloaders(dataset)
|
| 973 |
+
|
| 974 |
+
print("Dataset size:", len(dataset))
|
| 975 |
+
|
| 976 |
+
raw_batch = next(iter(eval_loader))
|
| 977 |
+
batch = dataset.perform_active_mask(
|
| 978 |
+
raw_batch,
|
| 979 |
+
cat_ratio=0.15,
|
| 980 |
+
num_ratio=0.15,
|
| 981 |
+
seed=42,
|
| 982 |
+
)
|
| 983 |
+
|
| 984 |
+
print("\nBatch check")
|
| 985 |
+
if "row_idx" in batch:
|
| 986 |
+
print("row_idx:", batch["row_idx"].shape, batch["row_idx"].dtype)
|
| 987 |
+
if "sample_id" in batch:
|
| 988 |
+
print("sample_id:", len(batch["sample_id"]))
|
| 989 |
+
|
| 990 |
+
print("original_cat_local_ids:", batch["original_cat_local_ids"].shape)
|
| 991 |
+
print("masked_cat_local_ids:", batch["masked_cat_local_ids"].shape)
|
| 992 |
+
print("original_cat_valid_positions:", batch["original_cat_valid_positions"].shape)
|
| 993 |
+
print("masked_cat_valid_positions:", batch["masked_cat_valid_positions"].shape)
|
| 994 |
+
print("cat_loss_mask:", batch["cat_loss_mask"].shape)
|
| 995 |
+
|
| 996 |
+
for k, v in batch["original_numeric_values_by_nin"].items():
|
| 997 |
+
print(f"original_numeric_values_by_nin[{k}]:", v.shape)
|
| 998 |
+
|
| 999 |
+
for k, v in batch["masked_numeric_values_by_nin"].items():
|
| 1000 |
+
print(f"masked_numeric_values_by_nin[{k}]:", v.shape)
|
| 1001 |
+
|
| 1002 |
+
for k, v in batch["original_numeric_valid_positions_by_nin"].items():
|
| 1003 |
+
print(f"original_numeric_valid_positions_by_nin[{k}]:", v.shape)
|
| 1004 |
+
|
| 1005 |
+
for k, v in batch["masked_numeric_valid_positions_by_nin"].items():
|
| 1006 |
+
print(f"masked_numeric_valid_positions_by_nin[{k}]:", v.shape)
|
| 1007 |
+
|
| 1008 |
+
for k, v in batch["numeric_loss_mask_by_nin"].items():
|
| 1009 |
+
print(f"numeric_loss_mask_by_nin[{k}]:", v.shape)
|
| 1010 |
+
|
| 1011 |
+
print("pixel_values:", batch["pixel_values"].shape)
|
| 1012 |
+
print("vision_valid_positions:", batch["vision_valid_positions"].shape)
|
| 1013 |
+
|
| 1014 |
+
print("\nTensor dtype check")
|
| 1015 |
+
print("masked cat ids dtype:", batch["masked_cat_local_ids"].dtype)
|
| 1016 |
+
print("masked numeric dtype:", next(iter(batch["masked_numeric_values_by_nin"].values())).dtype)
|
| 1017 |
+
print("image dtype:", batch["pixel_values"].dtype)
|
| 1018 |
+
|
| 1019 |
+
print("\nLoader test finished successfully")
|
| 1020 |
+
|
| 1021 |
+
debug_print_first_sample(dataset, batch, batch_pos=0)
|
| 1022 |
+
|
| 1023 |
+
|
| 1024 |
+
if __name__ == "__main__":
|
| 1025 |
+
main()
|
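The self-test above also documents the masking API: perform_active_mask takes a collated batch and returns the original_* targets, the masked_* inputs, and the *_loss_mask tensors in one dict, so the same call serves training (a fresh seed per step) and evaluation (a fixed seed for reproducibility). A minimal sketch, assuming a dataset and loaders constructed as in main():

    raw_batch = next(iter(eval_loader))
    batch = dataset.perform_active_mask(raw_batch, cat_ratio=0.15, num_ratio=0.15, seed=0)
    targets = batch["original_cat_local_ids"]   # what the model should recover
    inputs = batch["masked_cat_local_ids"]      # what the model actually sees
    supervised = batch["cat_loss_mask"]         # marks the cells whose loss is counted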
modelling/soilformer.py
ADDED
|
@@ -0,0 +1,696 @@
|
| 1 |
+
# soilformer.py
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Dict, Optional, Tuple
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
import torch.nn as nn
|
| 11 |
+
import torch.nn.functional as F # noqa
|
| 12 |
+
|
| 13 |
+
from decode_categorical import CategoricalDecoder
|
| 14 |
+
from decode_numeric import NumericDecoder
|
| 15 |
+
from embed_categorical import (
|
| 16 |
+
CategoricalEmbedding,
|
| 17 |
+
build_cat_vocab_spec_from_meta,
|
| 18 |
+
get_categorical_feature_names_from_meta,
|
| 19 |
+
save_cat_vocab_json,
|
| 20 |
+
)
|
| 21 |
+
from embed_numeric import (
|
| 22 |
+
NumericEmbedding,
|
| 23 |
+
build_numeric_vocab_spec_from_meta,
|
| 24 |
+
)
|
| 25 |
+
from embed_vision_gemma3n import Gemma3nVisionFeatureExtractor
|
| 26 |
+
from layer import TabularImageGQALayer
|
| 27 |
+
from utils import load_json, save_json, get_dtype
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# ============================================================
|
| 31 |
+
# SoilFormer
|
| 32 |
+
# ============================================================
|
| 33 |
+
|
| 34 |
+
class SoilFormer(nn.Module):
|
| 35 |
+
"""
|
| 36 |
+
Full model: embeddings -> TabularImageGQALayer stack -> decoders.
|
| 37 |
+
"""
|
| 38 |
+
|
| 39 |
+
def __init__(self, config: Dict, device: Optional[str] = None):
|
| 40 |
+
super().__init__()
|
| 41 |
+
self.config = dict(config)
|
| 42 |
+
|
| 43 |
+
dtype = get_dtype(self.config.get("dtype", "bfloat16"))
|
| 44 |
+
dev = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))
|
| 45 |
+
|
| 46 |
+
# ---- Tabular dims
|
| 47 |
+
cat_hidden = int(self.config["cat_hidden_size"])
|
| 48 |
+
num_hidden = int(self.config["numeric_hidden_size"])
|
| 49 |
+
if cat_hidden != num_hidden:
|
| 50 |
+
raise ValueError("Expect cat_hidden_size == numeric_hidden_size for one tabular stream.")
|
| 51 |
+
self.tabular_dim = cat_hidden
|
| 52 |
+
|
| 53 |
+
# ---- Embeddings
|
| 54 |
+
self.embed_cat = CategoricalEmbedding(
|
| 55 |
+
hidden_size=cat_hidden,
|
| 56 |
+
cat_vocab_json=self.config["cat_vocab_json"],
|
| 57 |
+
)
|
| 58 |
+
self.embed_num = NumericEmbedding(
|
| 59 |
+
hidden_size=num_hidden,
|
| 60 |
+
numeric_vocab_json=self.config["numeric_vocab_json"],
|
| 61 |
+
middle_size=self.config.get("numeric_encode_middle_size", None),
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
# ---- Decoders
|
| 65 |
+
self.decode_cat = CategoricalDecoder(
|
| 66 |
+
hidden_size=cat_hidden,
|
| 67 |
+
cat_vocab_json=self.config["cat_vocab_json"],
|
| 68 |
+
middle_size=self.config.get("cat_decode_middle_size", None),
|
| 69 |
+
homoscedastic=self.config.get("cat_homoscedastic", True),
|
| 70 |
+
)
|
| 71 |
+
self.decode_num = NumericDecoder(
|
| 72 |
+
hidden_size=num_hidden,
|
| 73 |
+
numeric_vocab_json=self.config["numeric_vocab_json"],
|
| 74 |
+
middle_size=self.config.get("numeric_decode_middle_size", None),
|
| 75 |
+
homoscedastic=self.config.get("num_homoscedastic", True),
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
# ---- Vision
|
| 79 |
+
self.vision_extractor = Gemma3nVisionFeatureExtractor.from_pretrained_vision_only_dir(
|
| 80 |
+
model_dir=self.config["vision_model_dir"],
|
| 81 |
+
map_location="cpu",
|
| 82 |
+
num_output_tokens_reduced=self.config["vision_num_output_tokens_reduced"],
|
| 83 |
+
num_heads_for_token_reduction=self.config["vision_num_heads_for_token_reduction"],
|
| 84 |
+
reducer_bottleneck_dim=self.config["vision_reducer_bottleneck_dim"],
|
| 85 |
+
reducer_project_back=self.config["vision_reducer_project_back"],
|
| 86 |
+
)
|
| 87 |
+
|
| 88 |
+
# ---- Layers
|
| 89 |
+
L = int(self.config["layer_num_layers"])
|
| 90 |
+
self.layers = nn.ModuleList([
|
| 91 |
+
TabularImageGQALayer(
|
| 92 |
+
tabular_dim=self.tabular_dim,
|
| 93 |
+
vision_dim=self.vision_extractor.get_actual_hidden_dim(),
|
| 94 |
+
num_query_heads=int(self.config["layer_num_query_heads"]),
|
| 95 |
+
num_kv_heads=int(self.config["layer_num_kv_heads"]),
|
| 96 |
+
head_dim=int(self.config["layer_head_dim"]),
|
| 97 |
+
mlp_ratio=float(self.config["layer_mlp_ratio"]),
|
| 98 |
+
dropout=float(self.config["layer_dropout"]),
|
| 99 |
+
)
|
| 100 |
+
for _ in range(L)
|
| 101 |
+
])
|
| 102 |
+
|
| 103 |
+
# ---- Move
|
| 104 |
+
self.to(device=dev, dtype=dtype)
|
| 105 |
+
|
| 106 |
+
def init_weights(self, std: float = 0.02):
|
| 107 |
+
self.embed_cat.init_weights(std=std)
|
| 108 |
+
self.embed_num.init_weights(std=std)
|
| 109 |
+
|
| 110 |
+
self.decode_cat.init_weights(std=std)
|
| 111 |
+
self.decode_num.init_weights(std=std)
|
| 112 |
+
|
| 113 |
+
self.vision_extractor.init_weights(std=std)
|
| 114 |
+
|
| 115 |
+
for blk in self.layers:
|
| 116 |
+
blk.init_weights(std=std)
|
| 117 |
+
|
| 118 |
+
def forward(
|
| 119 |
+
self,
|
| 120 |
+
cat_local_ids: torch.LongTensor, # [B, M_cat]
|
| 121 |
+
numeric_values_by_nin: Dict[int, torch.Tensor], # {n_in: [B, V, n_in]}
|
| 122 |
+
cat_valid_positions: Optional[torch.Tensor] = None, # [B, M_cat] bool
|
| 123 |
+
numeric_valid_positions_by_nin: Optional[Dict[int, torch.Tensor]] = None, # {n_in: [B,V] bool}
|
| 124 |
+
pixel_values: Optional[torch.Tensor] = None, # [B, 3, H, W]
|
| 125 |
+
vision_valid_positions: Optional[torch.Tensor] = None, # [B] bool OR indices [K]
|
| 126 |
+
):
|
| 127 |
+
# ----------------------------
|
| 128 |
+
# Embeddings (tabular)
|
| 129 |
+
# ----------------------------
|
| 130 |
+
x_cat, cat_mask = self.embed_cat(
|
| 131 |
+
local_ids=cat_local_ids,
|
| 132 |
+
valid_positions=cat_valid_positions,
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
x_num, num_mask = self.embed_num(
|
| 136 |
+
values_by_nin=numeric_values_by_nin,
|
| 137 |
+
valid_positions_by_nin=numeric_valid_positions_by_nin,
|
| 138 |
+
)
|
| 139 |
+
|
| 140 |
+
x_tab = torch.cat([x_cat, x_num], dim=1) # [B, T_tab, H]
|
| 141 |
+
|
| 142 |
+
B, T_tab, _ = x_tab.shape
|
| 143 |
+
M_cat = x_cat.size(1)
|
| 144 |
+
T_num = x_num.size(1)
|
| 145 |
+
|
| 146 |
+
# ----------------------------
|
| 147 |
+
# Tabular attention mask
|
| 148 |
+
# ----------------------------
|
| 149 |
+
cat_mask = cat_mask.to(device=x_tab.device, dtype=torch.long)
|
| 150 |
+
num_mask = num_mask.to(device=x_tab.device, dtype=torch.long)
|
| 151 |
+
|
| 152 |
+
if self.config["disable_tabular_attention_mask"]:
|
| 153 |
+
attention_mask_tab = torch.ones(B, T_tab, device=x_tab.device, dtype=torch.long)
|
| 154 |
+
else:
|
| 155 |
+
attention_mask_tab = torch.cat([cat_mask, num_mask], dim=1)
|
| 156 |
+
if attention_mask_tab.shape != (B, T_tab):
|
| 157 |
+
raise RuntimeError("Internal attention_mask_tab shape mismatch")
|
| 158 |
+
|
| 159 |
+
# ----------------------------
|
| 160 |
+
# Vision features
|
| 161 |
+
# ----------------------------
|
| 162 |
+
if pixel_values is None:
|
| 163 |
+
|
| 164 |
+
vision_features = None
|
| 165 |
+
vision_mask = None
|
| 166 |
+
|
| 167 |
+
else:
|
| 168 |
+
|
| 169 |
+
vision_features, vision_mask = self.vision_extractor(
|
| 170 |
+
pixel_values=pixel_values,
|
| 171 |
+
valid_positions=vision_valid_positions,
|
| 172 |
+
)
|
| 173 |
+
|
| 174 |
+
if vision_features.shape[0] != B:
|
| 175 |
+
raise ValueError("vision_features batch mismatch with tabular batch")
|
| 176 |
+
|
| 177 |
+
if vision_mask.shape[0] != B or vision_mask.shape[1] != vision_features.shape[1]:
|
| 178 |
+
raise ValueError("vision_mask shape mismatch with vision_features")
|
| 179 |
+
|
| 180 |
+
vision_mask = vision_mask.to(
|
| 181 |
+
device=attention_mask_tab.device,
|
| 182 |
+
dtype=attention_mask_tab.dtype,
|
| 183 |
+
)
|
| 184 |
+
|
| 185 |
+
# ----------------------------
|
| 186 |
+
# Transformer blocks
|
| 187 |
+
# ----------------------------
|
| 188 |
+
for blk in self.layers: # type: TabularImageGQALayer
|
| 189 |
+
x_tab = blk(
|
| 190 |
+
x_tab=x_tab,
|
| 191 |
+
attention_mask=attention_mask_tab,
|
| 192 |
+
vision_features=vision_features,
|
| 193 |
+
vision_mask=vision_mask
|
| 194 |
+
)
|
| 195 |
+
|
| 196 |
+
# ----------------------------
|
| 197 |
+
# Slice outputs
|
| 198 |
+
# ----------------------------
|
| 199 |
+
x_cat_out = x_tab[:, :M_cat, :]
|
| 200 |
+
x_num_out = x_tab[:, M_cat:M_cat + T_num, :]
|
| 201 |
+
|
| 202 |
+
# ----------------------------
|
| 203 |
+
# Decode
|
| 204 |
+
# ----------------------------
|
| 205 |
+
cat_logits_padded, cat_s, valid_class_mask = self.decode_cat(
|
| 206 |
+
x_cat_out,
|
| 207 |
+
return_padded=True,
|
| 208 |
+
)
|
| 209 |
+
|
| 210 |
+
value_by_nin, s_by_nin = self.decode_num(
|
| 211 |
+
x_num_out
|
| 212 |
+
)
|
| 213 |
+
|
| 214 |
+
return cat_logits_padded, cat_s, valid_class_mask, value_by_nin, s_by_nin, x_tab
|
| 215 |
+
|
| 216 |
+
def _checkpoint_state_dict(self) -> Dict[str, torch.Tensor]:
|
| 217 |
+
"""
|
| 218 |
+
State dict used for save/load.
|
| 219 |
+
|
| 220 |
+
Excludes pretrained frozen vision weights:
|
| 221 |
+
- vision_extractor.vision_tower.*
|
| 222 |
+
- vision_extractor.embed_vision.*
|
| 223 |
+
|
| 224 |
+
Keeps reducer weights if reducer exists.
|
| 225 |
+
"""
|
| 226 |
+
full_sd = self.state_dict()
|
| 227 |
+
out = {}
|
| 228 |
+
|
| 229 |
+
for k, v in full_sd.items():
|
| 230 |
+
if k.startswith("vision_extractor.vision_tower."):
|
| 231 |
+
continue
|
| 232 |
+
if k.startswith("vision_extractor.embed_vision."):
|
| 233 |
+
continue
|
| 234 |
+
out[k] = v
|
| 235 |
+
|
| 236 |
+
return out
|
| 237 |
+
|
| 238 |
+
def save_weights(self, path: str):
|
| 239 |
+
"""
|
| 240 |
+
Save model weights needed for SoilFormer training/inference,
|
| 241 |
+
excluding pretrained frozen vision weights.
|
| 242 |
+
"""
|
| 243 |
+
payload = {
|
| 244 |
+
"model_state_dict": self._checkpoint_state_dict(),
|
| 245 |
+
"config": self.config,
|
| 246 |
+
}
|
| 247 |
+
torch.save(payload, path)
|
| 248 |
+
|
| 249 |
+
def load_weights(self, path: str, map_location: str = "cpu", strict: bool = True):
|
| 250 |
+
"""
|
| 251 |
+
Load weights saved by save_weights().
|
| 252 |
+
|
| 253 |
+
Only the checkpoint-managed subset is loaded:
|
| 254 |
+
- embeddings / decoders / layers
|
| 255 |
+
- vision_extractor.reducer.* (if present)
|
| 256 |
+
|
| 257 |
+
Pretrained frozen vision weights are ignored here and are expected
|
| 258 |
+
to come from vision_model_dir during model construction.
|
| 259 |
+
"""
|
| 260 |
+
ckpt = torch.load(path, map_location=map_location)
|
| 261 |
+
|
| 262 |
+
if isinstance(ckpt, dict) and "model_state_dict" in ckpt:
|
| 263 |
+
sd = ckpt["model_state_dict"]
|
| 264 |
+
elif isinstance(ckpt, dict):
|
| 265 |
+
sd = ckpt
|
| 266 |
+
else:
|
| 267 |
+
raise ValueError(f"Unsupported checkpoint format: {path}")
|
| 268 |
+
|
| 269 |
+
expected_sd = self._checkpoint_state_dict()
|
| 270 |
+
|
| 271 |
+
# Only keep keys that belong to the checkpoint-managed subset
|
| 272 |
+
loadable_sd = {k: v for k, v in sd.items() if k in expected_sd}
|
| 273 |
+
|
| 274 |
+
missing = sorted(set(expected_sd.keys()) - set(loadable_sd.keys()))
|
| 275 |
+
unexpected = sorted(set(sd.keys()) - set(expected_sd.keys()))
|
| 276 |
+
|
| 277 |
+
# Actually load
|
| 278 |
+
load_info = self.load_state_dict(loadable_sd, strict=False)
|
| 279 |
+
|
| 280 |
+
# PyTorch may still report missing keys from the full model state_dict;
|
| 281 |
+
# keep only checkpoint-managed ones.
|
| 282 |
+
missing_after_load = [
|
| 283 |
+
k for k in load_info.missing_keys
|
| 284 |
+
if k in expected_sd
|
| 285 |
+
]
|
| 286 |
+
unexpected_after_load = [
|
| 287 |
+
k for k in load_info.unexpected_keys
|
| 288 |
+
if k in expected_sd
|
| 289 |
+
]
|
| 290 |
+
|
| 291 |
+
# Merge both sources of mismatch info
|
| 292 |
+
missing_final = sorted(set(missing) | set(missing_after_load))
|
| 293 |
+
unexpected_final = sorted(set(unexpected) | set(unexpected_after_load))
|
| 294 |
+
|
| 295 |
+
if strict and (missing_final or unexpected_final):
|
| 296 |
+
raise RuntimeError(
|
| 297 |
+
"Checkpoint load mismatch.\n"
|
| 298 |
+
f"Missing keys: {missing_final}\n"
|
| 299 |
+
f"Unexpected keys: {unexpected_final}"
|
| 300 |
+
)
|
| 301 |
+
|
| 302 |
+
return {
|
| 303 |
+
"missing_keys": missing_final,
|
| 304 |
+
"unexpected_keys": unexpected_final,
|
| 305 |
+
}
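A round-trip with these two helpers, as a minimal sketch (the checkpoint filename is arbitrary): save_weights() persists only the checkpoint-managed subset, so the frozen Gemma3n vision weights must already be present from vision_model_dir when the second model is constructed.

    cfg = load_json("config/config_model.json")
    model = SoilFormer(cfg)
    model.save_weights("soilformer_subset.pt")
    fresh = SoilFormer(cfg)                      # rebuilds frozen vision weights from disk
    info = fresh.load_weights("soilformer_subset.pt", strict=True)
    assert not info["missing_keys"] and not info["unexpected_keys"]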
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
def loss_function(
|
| 309 |
+
x_cat: torch.Tensor, # [B,M,Cmax] padded logits
|
| 310 |
+
s_cat: torch.Tensor, # [B,M] log-variance
|
| 311 |
+
y_cat: torch.Tensor, # [B,M] class index
|
| 312 |
+
loss_mask_cat: torch.Tensor, # [B,M] 0/1
|
| 313 |
+
valid_class_mask: torch.Tensor, # [M,Cmax] bool
|
| 314 |
+
x_num: Dict[int, torch.Tensor], # {n_in: [B,V,n_in]}
|
| 315 |
+
s_num: Dict[int, torch.Tensor], # {n_in: [B,V]}
|
| 316 |
+
y_num: Dict[int, torch.Tensor], # {n_in: [B,V,n_in]}
|
| 317 |
+
loss_mask_num: Dict[int, torch.Tensor], # {n_in: [B,V]} 0/1
|
| 318 |
+
cat_temperature: float = 1.0,
|
| 319 |
+
reduction: str = "mean", # "mean" or "sum"
|
| 320 |
+
eps: float = 1e-12,
|
| 321 |
+
cat_s_bound: Optional[float] = None,
|
| 322 |
+
num_s_bound: Optional[float] = None,
|
| 323 |
+
) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
|
| 324 |
+
"""
|
| 325 |
+
Strict loss for SoilFormer.
|
| 326 |
+
|
| 327 |
+
Categorical:
|
| 328 |
+
- Uses per-column CE over the valid class range only.
|
| 329 |
+
- Does NOT rely on padded logits values.
|
| 330 |
+
- s_cat[b,m] = log sigma^2 for categorical column m.
|
| 331 |
+
|
| 332 |
+
Numeric:
|
| 333 |
+
- Per-variable MSE averaged over n_in dimensions.
|
| 334 |
+
- s_num[n_in][b,v] = log sigma^2 for numeric variable v.
|
| 335 |
+
|
| 336 |
+
Optional soft bound:
|
| 337 |
+
If cat_s_bound or num_s_bound is not None, apply
|
| 338 |
+
s <- bound * tanh(s / bound)
|
| 339 |
+
before using s in heteroscedastic weighting.
|
| 340 |
+
|
| 341 |
+
Returns:
|
| 342 |
+
total_loss: scalar (float32)
|
| 343 |
+
stats: dict with cat_loss, num_loss, cat_base, num_base, counts...
|
| 344 |
+
"""
|
| 345 |
+
|
| 346 |
+
def _soft_bound_logvar(s_: torch.Tensor, bound: Optional[float]) -> torch.Tensor:
|
| 347 |
+
if bound is None:
|
| 348 |
+
return s_
|
| 349 |
+
b = float(bound)
|
| 350 |
+
if b <= 0:
|
| 351 |
+
# A non-positive bound disables heteroscedastic weighting by forcing s = 0
|
| 352 |
+
return torch.zeros_like(s_)
|
| 353 |
+
return b * torch.tanh(s_ / b)
|
| 354 |
+
|
| 355 |
+
# ---------------------------------------------------
|
| 356 |
+
# 1) Categorical loss (strict per-column CE)
|
| 357 |
+
# ---------------------------------------------------
|
| 358 |
+
if x_cat.dim() != 3:
|
| 359 |
+
raise ValueError(f"x_cat must be [B,M,Cmax], got {tuple(x_cat.shape)}")
|
| 360 |
+
|
| 361 |
+
B, M, Cmax = x_cat.shape
|
| 362 |
+
|
| 363 |
+
if s_cat.shape != (B, M):
|
| 364 |
+
raise ValueError(f"s_cat must be [B,M]=({B},{M}), got {tuple(s_cat.shape)}")
|
| 365 |
+
if y_cat.shape != (B, M):
|
| 366 |
+
raise ValueError(f"y_cat must be [B,M]=({B},{M}), got {tuple(y_cat.shape)}")
|
| 367 |
+
if loss_mask_cat.shape != (B, M):
|
| 368 |
+
raise ValueError(f"loss_mask_cat must be [B,M]=({B},{M}), got {tuple(loss_mask_cat.shape)}")
|
| 369 |
+
if valid_class_mask.shape != (M, Cmax):
|
| 370 |
+
raise ValueError(
|
| 371 |
+
f"valid_class_mask must be [M,Cmax]=({M},{Cmax}), got {tuple(valid_class_mask.shape)}"
|
| 372 |
+
)
|
| 373 |
+
|
| 374 |
+
x_cat_f = x_cat.float()
|
| 375 |
+
s_cat_f = _soft_bound_logvar(s_cat.float(), cat_s_bound)
|
| 376 |
+
y_cat_l = y_cat.long()
|
| 377 |
+
mcat = loss_mask_cat.float()
|
| 378 |
+
valid_class_mask = valid_class_mask.to(device=x_cat.device, dtype=torch.bool)
|
| 379 |
+
|
| 380 |
+
if cat_temperature != 1.0:
|
| 381 |
+
x_cat_f = x_cat_f / float(cat_temperature)
|
| 382 |
+
|
| 383 |
+
cat_loss_acc = torch.zeros((), device=x_cat.device, dtype=torch.float32)
|
| 384 |
+
cat_base_acc = torch.zeros((), device=x_cat.device, dtype=torch.float32)
|
| 385 |
+
cat_correct_acc = torch.zeros((), device=x_cat.device, dtype=torch.float32)
|
| 386 |
+
|
| 387 |
+
# denominator = number of actively supervised categorical cells
|
| 388 |
+
cat_denom = mcat.sum().clamp_min(float(eps))
|
| 389 |
+
|
| 390 |
+
for m in range(M):
|
| 391 |
+
cm = int(valid_class_mask[m].sum().item()) # real class count for column m
|
| 392 |
+
if cm <= 0:
|
| 393 |
+
raise ValueError(f"Column {m} has no valid classes")
|
| 394 |
+
|
| 395 |
+
logits_m = x_cat_f[:, m, :cm] # [B, C_m]
|
| 396 |
+
target_m = y_cat_l[:, m] # [B]
|
| 397 |
+
s_m = s_cat_f[:, m] # [B]
|
| 398 |
+
mask_m = mcat[:, m] # [B]
|
| 399 |
+
|
| 400 |
+
active = mask_m > 0
|
| 401 |
+
if active.any():
|
| 402 |
+
tgt_active = target_m[active]
|
| 403 |
+
if (tgt_active < 0).any() or (tgt_active >= cm).any():
|
| 404 |
+
raise ValueError(f"y_cat contains invalid class id for categorical column {m}")
|
| 405 |
+
|
| 406 |
+
target_m_safe = target_m.clone()
|
| 407 |
+
target_m_safe[~active] = 0
|
| 408 |
+
|
| 409 |
+
ce_m = F.cross_entropy(
|
| 410 |
+
logits_m,
|
| 411 |
+
target_m_safe,
|
| 412 |
+
reduction="none",
|
| 413 |
+
) # [B], float32
|
| 414 |
+
|
| 415 |
+
# ---------------------------------------------------
|
| 416 |
+
# accuracy (only count active positions)
|
| 417 |
+
# ---------------------------------------------------
|
| 418 |
+
pred_m = logits_m.argmax(dim=-1) # [B]
|
| 419 |
+
correct_m = (pred_m == target_m_safe) & active # [B]
|
| 420 |
+
cat_correct_acc = cat_correct_acc + correct_m.float().sum()
|
| 421 |
+
|
| 422 |
+
# heteroscedastic weighting: exp(-s) * CE + s
|
| 423 |
+
L_m = torch.exp(-s_m) * ce_m + s_m # [B]
|
| 424 |
+
|
| 425 |
+
cat_loss_acc = cat_loss_acc + (L_m * mask_m).sum()
|
| 426 |
+
cat_base_acc = cat_base_acc + (ce_m * mask_m).sum()
|
| 427 |
+
|
| 428 |
+
if reduction == "mean":
|
| 429 |
+
cat_loss = cat_loss_acc / cat_denom
|
| 430 |
+
cat_base = cat_base_acc / cat_denom
|
| 431 |
+
elif reduction == "sum":
|
| 432 |
+
cat_loss = cat_loss_acc
|
| 433 |
+
cat_base = cat_base_acc
|
| 434 |
+
else:
|
| 435 |
+
raise ValueError(f"Unsupported reduction: {reduction}")
|
| 436 |
+
cat_acc = cat_correct_acc / cat_denom
|
| 437 |
+
|
| 438 |
+
# ---------------------------------------------------
|
| 439 |
+
# 2) Numeric loss (per-variable heteroscedastic MSE)
|
| 440 |
+
# ---------------------------------------------------
|
| 441 |
+
num_loss_acc = torch.zeros((), device=x_cat.device, dtype=torch.float32)
|
| 442 |
+
num_base_acc = torch.zeros((), device=x_cat.device, dtype=torch.float32)
|
| 443 |
+
num_denom_acc = torch.zeros((), device=x_cat.device, dtype=torch.float32)
|
| 444 |
+
|
| 445 |
+
for n_in, x in x_num.items():
|
| 446 |
+
if n_in not in y_num or n_in not in s_num or n_in not in loss_mask_num:
|
| 447 |
+
raise KeyError(f"Missing key n_in={n_in} in y_num/s_num/loss_mask_num")
|
| 448 |
+
|
| 449 |
+
y = y_num[n_in]
|
| 450 |
+
s = s_num[n_in]
|
| 451 |
+
m = loss_mask_num[n_in]
|
| 452 |
+
|
| 453 |
+
if x.shape != y.shape:
|
| 454 |
+
raise ValueError(
|
| 455 |
+
f"x_num[{n_in}] and y_num[{n_in}] shape mismatch: "
|
| 456 |
+
f"{tuple(x.shape)} vs {tuple(y.shape)}"
|
| 457 |
+
)
|
| 458 |
+
if x.dim() != 3:
|
| 459 |
+
raise ValueError(f"x_num[{n_in}] must be [B,V,n_in], got {tuple(x.shape)}")
|
| 460 |
+
|
| 461 |
+
Bb, V, Nin = x.shape
|
| 462 |
+
if Nin != n_in:
|
| 463 |
+
raise ValueError(f"x_num[{n_in}] last dim mismatch: got {Nin}, expected {n_in}")
|
| 464 |
+
if s.shape != (Bb, V):
|
| 465 |
+
raise ValueError(f"s_num[{n_in}] must be [B,V], got {tuple(s.shape)}")
|
| 466 |
+
if m.shape != (Bb, V):
|
| 467 |
+
raise ValueError(f"loss_mask_num[{n_in}] must be [B,V], got {tuple(m.shape)}")
|
| 468 |
+
|
| 469 |
+
x_f = x.float()
|
| 470 |
+
y_f = y.float()
|
| 471 |
+
s_f = _soft_bound_logvar(s.float(), num_s_bound)
|
| 472 |
+
m_f = m.float()
|
| 473 |
+
|
| 474 |
+
# base numeric loss per variable: mean over n_in dims
|
| 475 |
+
mse = (x_f - y_f).pow(2).mean(dim=-1) # [B,V]
|
| 476 |
+
|
| 477 |
+
# heteroscedastic weighting: exp(-s) * mse + s
|
| 478 |
+
L = torch.exp(-s_f) * mse + s_f # [B,V]
|
| 479 |
+
|
| 480 |
+
num_loss_acc = num_loss_acc + (L * m_f).sum()
|
| 481 |
+
num_base_acc = num_base_acc + (mse * m_f).sum()
|
| 482 |
+
num_denom_acc = num_denom_acc + m_f.sum()
|
| 483 |
+
|
| 484 |
+
num_denom = num_denom_acc.clamp_min(float(eps))
|
| 485 |
+
|
| 486 |
+
if reduction == "mean":
|
| 487 |
+
num_loss = num_loss_acc / num_denom
|
| 488 |
+
num_base = num_base_acc / num_denom
|
| 489 |
+
elif reduction == "sum":
|
| 490 |
+
num_loss = num_loss_acc
|
| 491 |
+
num_base = num_base_acc
|
| 492 |
+
else:
|
| 493 |
+
raise ValueError(f"Unsupported reduction: {reduction}")
|
| 494 |
+
|
| 495 |
+
# ---------------------------------------------------
|
| 496 |
+
# 3) Total
|
| 497 |
+
# ---------------------------------------------------
|
| 498 |
+
total = cat_loss + num_loss
|
| 499 |
+
|
| 500 |
+
stats = {
|
| 501 |
+
"total": total.detach(),
|
| 502 |
+
"cat_loss": cat_loss.detach(),
|
| 503 |
+
"num_loss": num_loss.detach(),
|
| 504 |
+
"cat_base": cat_base.detach(),
|
| 505 |
+
"num_base": num_base.detach(),
|
| 506 |
+
"cat_count": cat_denom.detach(),
|
| 507 |
+
"num_count": num_denom.detach(),
|
| 508 |
+
"cat_acc": cat_acc.detach(),
|
| 509 |
+
}
|
| 510 |
+
return total, stats
|
| 511 |
+
|
| 512 |
+
|
| 513 |
+
# ============================================================
|
| 514 |
+
# DEMO
|
| 515 |
+
# ============================================================
|
| 516 |
+
|
| 517 |
+
def _demo_main():
|
| 518 |
+
import argparse
|
| 519 |
+
|
| 520 |
+
parser = argparse.ArgumentParser()
|
| 521 |
+
parser.add_argument("--config_json", type=str, default="config/config_model.json")
|
| 522 |
+
parser.add_argument("--batch_size", type=int, default=2)
|
| 523 |
+
parser.add_argument("--with_vision", action="store_true")
|
| 524 |
+
args = parser.parse_args()
|
| 525 |
+
|
| 526 |
+
cfg = load_json(args.config_json)
|
| 527 |
+
|
| 528 |
+
print("===== Loaded config =====")
|
| 529 |
+
print(json.dumps(cfg, ensure_ascii=False, indent=2))
|
| 530 |
+
|
| 531 |
+
# --------------------------------------------------
|
| 532 |
+
# Ensure vocab files exist
|
| 533 |
+
# --------------------------------------------------
|
| 534 |
+
tabular_meta = load_json(cfg["tabular_meta"])
|
| 535 |
+
|
| 536 |
+
if not os.path.isfile(cfg["cat_vocab_json"]):
|
| 537 |
+
cat_names = get_categorical_feature_names_from_meta(tabular_meta)
|
| 538 |
+
vocab = build_cat_vocab_spec_from_meta(tabular_meta, cat_names)
|
| 539 |
+
Path(cfg["cat_vocab_json"]).parent.mkdir(parents=True, exist_ok=True)
|
| 540 |
+
save_cat_vocab_json(vocab, cfg["cat_vocab_json"])
|
| 541 |
+
print(f"[demo] Built cat_vocab_json at {cfg['cat_vocab_json']}")
|
| 542 |
+
|
| 543 |
+
if not os.path.isfile(cfg["numeric_vocab_json"]):
|
| 544 |
+
spec = build_numeric_vocab_spec_from_meta(tabular_meta)
|
| 545 |
+
Path(cfg["numeric_vocab_json"]).parent.mkdir(parents=True, exist_ok=True)
|
| 546 |
+
save_json(spec, cfg["numeric_vocab_json"])
|
| 547 |
+
print(f"[demo] Built numeric_vocab_json at {cfg['numeric_vocab_json']}")
|
| 548 |
+
|
| 549 |
+
# --------------------------------------------------
|
| 550 |
+
# Build model
|
| 551 |
+
# --------------------------------------------------
|
| 552 |
+
model = SoilFormer(cfg)
|
| 553 |
+
model.init_weights()
|
| 554 |
+
model.eval()
|
| 555 |
+
|
| 556 |
+
device = next(model.parameters()).device
|
| 557 |
+
dtype = next(model.parameters()).dtype
|
| 558 |
+
|
| 559 |
+
B = args.batch_size
|
| 560 |
+
|
| 561 |
+
# --------------------------------------------------
|
| 562 |
+
# Build dummy categorical inputs
|
| 563 |
+
# --------------------------------------------------
|
| 564 |
+
cat_spec = load_json(cfg["cat_vocab_json"])
|
| 565 |
+
cat_items = sorted(cat_spec.items(), key=lambda x: x[1]["col_id"])
|
| 566 |
+
M_cat = len(cat_items)
|
| 567 |
+
|
| 568 |
+
cat_local_ids = torch.zeros(B, M_cat, dtype=torch.long, device=device)
|
| 569 |
+
cat_valid_positions = torch.ones(B, M_cat, dtype=torch.bool, device=device)
|
| 570 |
+
|
| 571 |
+
# --------------------------------------------------
|
| 572 |
+
# Build dummy numeric inputs
|
| 573 |
+
# --------------------------------------------------
|
| 574 |
+
num_spec = load_json(cfg["numeric_vocab_json"])
|
| 575 |
+
|
| 576 |
+
numeric_values_by_nin: Dict[int, torch.Tensor] = {}
|
| 577 |
+
numeric_valid_positions_by_nin: Dict[int, torch.Tensor] = {}
|
| 578 |
+
|
| 579 |
+
for g in num_spec["groups"]:
|
| 580 |
+
n_in = int(g["n_in"])
|
| 581 |
+
V = len(g["feature_names"])
|
| 582 |
+
|
| 583 |
+
numeric_values_by_nin[n_in] = torch.randn(B, V, n_in, device=device, dtype=dtype)
|
| 584 |
+
numeric_valid_positions_by_nin[n_in] = torch.ones(B, V, dtype=torch.bool, device=device)
|
| 585 |
+
|
| 586 |
+
# --------------------------------------------------
|
| 587 |
+
# Build dummy vision inputs
|
| 588 |
+
# --------------------------------------------------
|
| 589 |
+
if args.with_vision:
|
| 590 |
+
pixel_values = torch.randn(B, 3, 224, 224, device=device, dtype=dtype)
|
| 591 |
+
vision_valid_positions = torch.ones(B, dtype=torch.bool, device=device)
|
| 592 |
+
else:
|
| 593 |
+
pixel_values = None
|
| 594 |
+
vision_valid_positions = None
|
| 595 |
+
|
| 596 |
+
# --------------------------------------------------
|
| 597 |
+
# Vision debug
|
| 598 |
+
# --------------------------------------------------
|
| 599 |
+
print("\n===== Vision debug =====")
|
| 600 |
+
if pixel_values is None:
|
| 601 |
+
print("pixel_values: None")
|
| 602 |
+
print("vision_features: None")
|
| 603 |
+
print("vision_mask: None")
|
| 604 |
+
else:
|
| 605 |
+
print("pixel_values:", tuple(pixel_values.shape), pixel_values.dtype, pixel_values.device)
|
| 606 |
+
with torch.no_grad():
|
| 607 |
+
vision_features, vision_mask = model.vision_extractor.forward(
|
| 608 |
+
pixel_values=pixel_values,
|
| 609 |
+
valid_positions=vision_valid_positions,
|
| 610 |
+
)
|
| 611 |
+
print("vision_features:", tuple(vision_features.shape), vision_features.dtype, vision_features.device)
|
| 612 |
+
print("vision_mask:", tuple(vision_mask.shape), vision_mask.dtype, vision_mask.device)
|
| 613 |
+
|
| 614 |
+
# --------------------------------------------------
|
| 615 |
+
# Forward
|
| 616 |
+
# --------------------------------------------------
|
| 617 |
+
with torch.no_grad():
|
| 618 |
+
cat_logits_padded, cat_s, valid_class_mask, value_by_nin, s_by_nin, x_tab = model.forward(
|
| 619 |
+
cat_local_ids=cat_local_ids, # noqa
|
| 620 |
+
numeric_values_by_nin=numeric_values_by_nin,
|
| 621 |
+
cat_valid_positions=cat_valid_positions,
|
| 622 |
+
numeric_valid_positions_by_nin=numeric_valid_positions_by_nin,
|
| 623 |
+
pixel_values=pixel_values,
|
| 624 |
+
vision_valid_positions=vision_valid_positions,
|
| 625 |
+
)
|
| 626 |
+
|
| 627 |
+
print("\n===== SoilFormer demo =====")
|
| 628 |
+
print("cat_local_ids:", tuple(cat_local_ids.shape))
|
| 629 |
+
print("cat_valid_positions:", tuple(cat_valid_positions.shape))
|
| 630 |
+
print("numeric_values_by_nin:", {k: tuple(v.shape) for k, v in numeric_values_by_nin.items()})
|
| 631 |
+
print("numeric_valid_positions_by_nin:", {k: tuple(v.shape) for k, v in numeric_valid_positions_by_nin.items()})
|
| 632 |
+
print("x_tab_final:", tuple(x_tab.shape), x_tab.dtype, x_tab.device)
|
| 633 |
+
|
| 634 |
+
print("Categorical outputs:")
|
| 635 |
+
print("cat_logits_padded:", tuple(cat_logits_padded.shape), cat_logits_padded.dtype, cat_logits_padded.device)
|
| 636 |
+
print("cat_s:", tuple(cat_s.shape), cat_s.dtype, cat_s.device)
|
| 637 |
+
|
| 638 |
+
print("Numeric decoded values:", {k: tuple(v.shape) for k, v in value_by_nin.items()})
|
| 639 |
+
print("Numeric decoded s:", {k: tuple(s.shape) for k, s in s_by_nin.items()})
|
| 640 |
+
|
| 641 |
+
# --------------------------------------------------
|
| 642 |
+
# Loss debug
|
| 643 |
+
# --------------------------------------------------
|
| 644 |
+
print("\n===== Loss debug =====")
|
| 645 |
+
|
| 646 |
+
if cat_logits_padded.dim() != 3:
|
| 647 |
+
raise RuntimeError(f"cat_logits_padded must be [B,M,Cmax], got {tuple(cat_logits_padded.shape)}")
|
| 648 |
+
|
| 649 |
+
B_logits, M_cat2, Cmax2 = cat_logits_padded.shape
|
| 650 |
+
if cat_s.shape != (B_logits, M_cat2):
|
| 651 |
+
raise RuntimeError(f"cat_s shape mismatch: got {tuple(cat_s.shape)} expected {(B_logits, M_cat2)}")
|
| 652 |
+
|
| 653 |
+
# Build dummy categorical targets within valid class ranges
|
| 654 |
+
num_classes = [int(s["num_classes"]) for _, s in cat_items]
|
| 655 |
+
if len(num_classes) != M_cat2:
|
| 656 |
+
raise RuntimeError("M_cat mismatch between vocab and model output")
|
| 657 |
+
|
| 658 |
+
y_cat = torch.zeros(B_logits, M_cat2, dtype=torch.long, device=device)
|
| 659 |
+
for m, cm in enumerate(num_classes):
|
| 660 |
+
y_cat[:, m] = torch.randint(low=0, high=cm, size=(B_logits,), device=device)
|
| 661 |
+
|
| 662 |
+
mask_cat = torch.ones(B_logits, M_cat2, dtype=torch.long, device=device)
|
| 663 |
+
|
| 664 |
+
# Build dummy numeric targets and masks
|
| 665 |
+
y_num = {
|
| 666 |
+
n_in: torch.randn_like(x_pred)
|
| 667 |
+
for n_in, x_pred in value_by_nin.items()
|
| 668 |
+
}
|
| 669 |
+
|
| 670 |
+
mask_num = {
|
| 671 |
+
n_in: torch.ones(x_pred.size(0), x_pred.size(1), dtype=torch.long, device=x_pred.device)
|
| 672 |
+
for n_in, x_pred in value_by_nin.items()
|
| 673 |
+
}
|
| 674 |
+
|
| 675 |
+
total_loss, stats = loss_function(
|
| 676 |
+
x_cat=cat_logits_padded,
|
| 677 |
+
s_cat=cat_s,
|
| 678 |
+
y_cat=y_cat,
|
| 679 |
+
loss_mask_cat=mask_cat,
|
| 680 |
+
x_num=value_by_nin,
|
| 681 |
+
s_num=s_by_nin,
|
| 682 |
+
y_num=y_num,
|
| 683 |
+
loss_mask_num=mask_num,
|
| 684 |
+
reduction="mean",
|
| 685 |
+
valid_class_mask=valid_class_mask
|
| 686 |
+
)
|
| 687 |
+
|
| 688 |
+
print("total_loss:", float(total_loss))
|
| 689 |
+
print("stats:", {k: float(v) for k, v in stats.items()})
|
| 690 |
+
|
| 691 |
+
if not torch.isfinite(total_loss):
|
| 692 |
+
raise RuntimeError("Loss is not finite!")
|
| 693 |
+
|
| 694 |
+
|
| 695 |
+
if __name__ == "__main__":
|
| 696 |
+
_demo_main()
|
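The demo doubles as a smoke test and can be run directly; only the flags defined in _demo_main() exist, and all paths come from config/config_model.json:

    python modelling/soilformer.py
    python modelling/soilformer.py --batch_size 4 --with_vision

With freshly initialized weights the loss value itself is meaningless; the script only verifies tensor shapes and that the total loss is finite.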
modelling/train.py
ADDED
|
@@ -0,0 +1,552 @@
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import random
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Dict, Optional
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
import torch
|
| 10 |
+
from torch.optim import AdamW
|
| 11 |
+
from torch.optim.lr_scheduler import CosineAnnealingLR, StepLR, LinearLR, SequentialLR
|
| 12 |
+
from tqdm import tqdm
|
| 13 |
+
|
| 14 |
+
from loader import SoilFormerDataset, build_train_eval_dataloaders
|
| 15 |
+
from soilformer import SoilFormer, loss_function
|
| 16 |
+
from utils import get_dtype, load_json, save_json
|
| 17 |
+
|
| 18 |
+
try:
|
| 19 |
+
import wandb
|
| 20 |
+
except ImportError: # pragma: no cover
|
| 21 |
+
wandb = None
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def set_seed(seed: int, deterministic: bool = True) -> None:
|
| 25 |
+
random.seed(seed)
|
| 26 |
+
np.random.seed(seed)
|
| 27 |
+
torch.manual_seed(seed)
|
| 28 |
+
if torch.cuda.is_available():
|
| 29 |
+
torch.cuda.manual_seed(seed)
|
| 30 |
+
torch.cuda.manual_seed_all(seed)
|
| 31 |
+
|
| 32 |
+
if deterministic:
|
| 33 |
+
torch.backends.cudnn.deterministic = True
|
| 34 |
+
torch.backends.cudnn.benchmark = False
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def resolve_device(device_str: str) -> torch.device:
|
| 38 |
+
device_str = device_str.lower()
|
| 39 |
+
|
| 40 |
+
if device_str == "cuda":
|
| 41 |
+
if not torch.cuda.is_available():
|
| 42 |
+
raise RuntimeError("config requests cuda, but CUDA is not available")
|
| 43 |
+
return torch.device("cuda")
|
| 44 |
+
|
| 45 |
+
if device_str == "mps":
|
| 46 |
+
if not torch.backends.mps.is_available():
|
| 47 |
+
raise RuntimeError("config requests mps, but MPS is not available")
|
| 48 |
+
return torch.device("mps")
|
| 49 |
+
|
| 50 |
+
if device_str == "cpu":
|
| 51 |
+
return torch.device("cpu")
|
| 52 |
+
|
| 53 |
+
raise ValueError(f"Unsupported device: {device_str}")
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def move_batch_to_device(batch: Dict, device: torch.device, float_dtype: torch.dtype) -> Dict:
|
| 57 |
+
out = {}
|
| 58 |
+
for key, value in batch.items():
|
| 59 |
+
if isinstance(value, torch.Tensor):
|
| 60 |
+
if value.dtype.is_floating_point:
|
| 61 |
+
out[key] = value.to(device=device, dtype=float_dtype, non_blocking=True)
|
| 62 |
+
else:
|
| 63 |
+
out[key] = value.to(device=device, non_blocking=True)
|
| 64 |
+
elif isinstance(value, dict):
|
| 65 |
+
sub = {}
|
| 66 |
+
for sub_key, sub_value in value.items():
|
| 67 |
+
if isinstance(sub_value, torch.Tensor):
|
| 68 |
+
if sub_value.dtype.is_floating_point:
|
| 69 |
+
sub[sub_key] = sub_value.to(device=device, dtype=float_dtype, non_blocking=True)
|
| 70 |
+
else:
|
| 71 |
+
sub[sub_key] = sub_value.to(device=device, non_blocking=True)
|
| 72 |
+
else:
|
| 73 |
+
sub[sub_key] = sub_value
|
| 74 |
+
out[key] = sub
|
| 75 |
+
else:
|
| 76 |
+
out[key] = value
|
| 77 |
+
return out
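Note that move_batch_to_device() recurses exactly one dict level deep, which matches the *_by_nin dictionaries the loader emits: floating tensors are cast to the training dtype while id and mask tensors keep their integer or bool dtypes. Given a loader batch and a resolved device (bfloat16 assumed, as in the default model config):

    batch = move_batch_to_device(raw_batch, device, torch.bfloat16)
    # pixel_values          -> bfloat16, moved to device
    # masked_cat_local_ids  -> stays int64, moved to device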
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def build_scheduler(
|
| 81 |
+
optimizer: torch.optim.Optimizer,
|
| 82 |
+
scheduler_cfg: Dict,
|
| 83 |
+
):
|
| 84 |
+
scheduler_type = str(scheduler_cfg.get("type", "none")).lower()
|
| 85 |
+
|
| 86 |
+
if scheduler_type == "none":
|
| 87 |
+
return None
|
| 88 |
+
|
| 89 |
+
warmup_epochs = int(scheduler_cfg.get("warmup_epochs", 0))
|
| 90 |
+
warmup_start_factor = float(scheduler_cfg.get("warmup_start_factor", 0.1))
|
| 91 |
+
|
| 92 |
+
if scheduler_type == "cosine":
|
| 93 |
+
total_epochs = int(scheduler_cfg["total_epochs"])
|
| 94 |
+
eta_min = float(scheduler_cfg.get("eta_min", 1e-6))
|
| 95 |
+
|
| 96 |
+
if warmup_epochs > 0:
|
| 97 |
+
t_max = int(scheduler_cfg.get("t_max", total_epochs - warmup_epochs))
|
| 98 |
+
if t_max <= 0:
|
| 99 |
+
raise ValueError(
|
| 100 |
+
f"Invalid cosine scheduler config: total_epochs={total_epochs}, "
|
| 101 |
+
f"warmup_epochs={warmup_epochs}, resulting T_max={t_max}"
|
| 102 |
+
)
|
| 103 |
+
else:
|
| 104 |
+
t_max = int(scheduler_cfg.get("t_max", total_epochs))
|
| 105 |
+
|
| 106 |
+
main_scheduler = CosineAnnealingLR(
|
| 107 |
+
optimizer,
|
| 108 |
+
T_max=t_max,
|
| 109 |
+
eta_min=eta_min,
|
| 110 |
+
)
|
| 111 |
+
|
| 112 |
+
elif scheduler_type == "step":
|
| 113 |
+
step_size = int(scheduler_cfg["step_size"])
|
| 114 |
+
gamma = float(scheduler_cfg.get("gamma", 0.1))
|
| 115 |
+
main_scheduler = StepLR(
|
| 116 |
+
optimizer,
|
| 117 |
+
step_size=step_size,
|
| 118 |
+
gamma=gamma,
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
else:
|
| 122 |
+
raise ValueError(f"Unsupported scheduler type: {scheduler_type}")
|
| 123 |
+
|
| 124 |
+
if warmup_epochs <= 0:
|
| 125 |
+
return main_scheduler
|
| 126 |
+
|
| 127 |
+
warmup_scheduler = LinearLR(
|
| 128 |
+
optimizer,
|
| 129 |
+
start_factor=warmup_start_factor,
|
| 130 |
+
total_iters=warmup_epochs,
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
scheduler = SequentialLR(
|
| 134 |
+
optimizer,
|
| 135 |
+
schedulers=[warmup_scheduler, main_scheduler],
|
| 136 |
+
milestones=[warmup_epochs],
|
| 137 |
+
)
|
| 138 |
+
return scheduler
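The scheduler block consumed above lives under optimization.scheduler in config_train.json; a hypothetical cosine-with-warmup configuration using only the keys this function actually reads (the numbers are placeholders, not the shipped defaults):

    scheduler_cfg = {
        "type": "cosine",
        "total_epochs": 200,
        "warmup_epochs": 10,
        "warmup_start_factor": 0.1,
        "eta_min": 1e-6,
    }
    scheduler = build_scheduler(optimizer, scheduler_cfg)

When warmup_epochs > 0, a LinearLR warmup is chained in front of the main schedule via SequentialLR, and T_max defaults to total_epochs - warmup_epochs unless t_max is given explicitly.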
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def get_checkpoint_model_state(model: SoilFormer) -> Dict[str, torch.Tensor]:
|
| 142 |
+
if hasattr(model, "_checkpoint_state_dict"):
|
| 143 |
+
return model._checkpoint_state_dict() # noqa
|
| 144 |
+
return model.state_dict()
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def load_checkpoint_model_state(model: SoilFormer, state_dict: Dict[str, torch.Tensor]) -> None:
|
| 148 |
+
if hasattr(model, "load_weights"):
|
| 149 |
+
payload = {"model_state_dict": state_dict}
|
| 150 |
+
tmp_path = None
|
| 151 |
+
try:
|
| 152 |
+
import tempfile
|
| 153 |
+
with tempfile.NamedTemporaryFile(suffix=".pt", delete=False) as f:
|
| 154 |
+
tmp_path = f.name
|
| 155 |
+
torch.save(payload, tmp_path)
|
| 156 |
+
model.load_weights(tmp_path, map_location="cpu", strict=True)
|
| 157 |
+
finally:
|
| 158 |
+
if tmp_path is not None and os.path.exists(tmp_path):
|
| 159 |
+
os.remove(tmp_path)
|
| 160 |
+
return
|
| 161 |
+
|
| 162 |
+
model.load_state_dict(state_dict, strict=True)
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def save_checkpoint(
|
| 166 |
+
checkpoint_path: Path,
|
| 167 |
+
model: SoilFormer,
|
| 168 |
+
optimizer: torch.optim.Optimizer,
|
| 169 |
+
scheduler,
|
| 170 |
+
epoch: int,
|
| 171 |
+
global_step: int,
|
| 172 |
+
config_train: Dict,
|
| 173 |
+
config_model: Dict,
|
| 174 |
+
config_data: Dict,
|
| 175 |
+
) -> None:
|
| 176 |
+
checkpoint = {
|
| 177 |
+
"epoch": epoch,
|
| 178 |
+
"global_step": global_step,
|
| 179 |
+
"model_state_dict": get_checkpoint_model_state(model),
|
| 180 |
+
"optimizer_state_dict": optimizer.state_dict(),
|
| 181 |
+
"scheduler_state_dict": None if scheduler is None else scheduler.state_dict(),
|
| 182 |
+
"config_train": config_train,
|
| 183 |
+
"config_model": config_model,
|
| 184 |
+
"config_data": config_data,
|
| 185 |
+
}
|
| 186 |
+
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
|
| 187 |
+
torch.save(checkpoint, checkpoint_path)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def rotate_checkpoints(checkpoint_dir: Path, max_saved_checkpoints: int) -> None:
|
| 191 |
+
checkpoint_paths = sorted(checkpoint_dir.glob("checkpoint_epoch_*.pt"))
|
| 192 |
+
if max_saved_checkpoints is None or max_saved_checkpoints <= 0:
|
| 193 |
+
return
|
| 194 |
+
while len(checkpoint_paths) > max_saved_checkpoints:
|
| 195 |
+
oldest = checkpoint_paths.pop(0)
|
| 196 |
+
oldest.unlink(missing_ok=True)
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def compute_loss_from_batch(
|
| 200 |
+
model: SoilFormer,
|
| 201 |
+
batch: Dict,
|
| 202 |
+
device: torch.device,
|
| 203 |
+
dtype: torch.dtype,
|
| 204 |
+
cat_s_bound: Optional[float] = None,
|
| 205 |
+
num_s_bound: Optional[float] = None,
|
| 206 |
+
):
|
| 207 |
+
batch = move_batch_to_device(batch, device=device, float_dtype=dtype)
|
| 208 |
+
|
| 209 |
+
cat_logits_padded, cat_s, valid_class_mask, value_by_nin, s_by_nin, _ = model(
|
| 210 |
+
cat_local_ids=batch["masked_cat_local_ids"],
|
| 211 |
+
numeric_values_by_nin=batch["masked_numeric_values_by_nin"],
|
| 212 |
+
cat_valid_positions=batch["masked_cat_valid_positions"],
|
| 213 |
+
numeric_valid_positions_by_nin=batch["masked_numeric_valid_positions_by_nin"],
|
| 214 |
+
pixel_values=batch["pixel_values"],
|
| 215 |
+
vision_valid_positions=batch["vision_valid_positions"],
|
| 216 |
+
)
|
| 217 |
+
|
| 218 |
+
total_loss, stats = loss_function(
|
| 219 |
+
x_cat=cat_logits_padded,
|
| 220 |
+
s_cat=cat_s,
|
| 221 |
+
y_cat=batch["original_cat_local_ids"],
|
| 222 |
+
loss_mask_cat=batch["cat_loss_mask"],
|
| 223 |
+
valid_class_mask=valid_class_mask,
|
| 224 |
+
x_num=value_by_nin,
|
| 225 |
+
s_num=s_by_nin,
|
| 226 |
+
y_num=batch["original_numeric_values_by_nin"],
|
| 227 |
+
loss_mask_num=batch["numeric_loss_mask_by_nin"],
|
| 228 |
+
reduction="mean",
|
| 229 |
+
cat_s_bound=cat_s_bound,
|
| 230 |
+
num_s_bound=num_s_bound,
|
| 231 |
+
    )

    return total_loss, stats


@torch.no_grad()
def evaluate(
    model: SoilFormer,
    dataset: SoilFormerDataset,
    eval_loader,
    device: torch.device,
    dtype: torch.dtype,
    cat_mask_ratio: float,
    num_mask_ratio: float,
    active_mask_seed: int,
    show_tqdm: bool,
    epoch: int,
    cat_s_bound: Optional[float] = None,
    num_s_bound: Optional[float] = None,
):
    model.eval()

    totals = {
        "total": 0.0,
        "cat_loss": 0.0,
        "num_loss": 0.0,
        "cat_base": 0.0,
        "num_base": 0.0,
        "cat_acc": 0.0,
    }
    num_batches = 0

    iterator = eval_loader
    if show_tqdm:
        iterator = tqdm(eval_loader, desc=f"Eval {epoch}", leave=False)

    for batch_idx, raw_batch in enumerate(iterator):
        mask_seed = int(active_mask_seed + batch_idx)
        masked_batch = dataset.perform_active_mask(
            raw_batch,
            cat_ratio=cat_mask_ratio,
            num_ratio=num_mask_ratio,
            seed=mask_seed,
        )

        _, stats = compute_loss_from_batch(
            model=model,
            batch=masked_batch,
            device=device,
            dtype=dtype,
            cat_s_bound=cat_s_bound,
            num_s_bound=num_s_bound,
        )

        num_batches += 1
        for key in totals:
            totals[key] += float(stats[key].item())

    if num_batches == 0:
        raise RuntimeError("Eval dataloader is empty")

    return {f"eval/{k}": v / num_batches for k, v in totals.items()}


def maybe_init_wandb(config_train: Dict):
    wandb_cfg = config_train["logging"]["wandb"]
    if not bool(wandb_cfg.get("enabled", False)):
        return None

    if wandb is None:
        raise ImportError("wandb is enabled in config but package is not installed")

    run = wandb.init(
        project=wandb_cfg["project"],
        entity=wandb_cfg.get("entity"),
        name=wandb_cfg.get("run_name"),
        dir=wandb_cfg.get("dir"),
        config=config_train,
        mode=wandb_cfg.get("mode", "online"),
    )
    return run


def print_parameter_stats(model):
    total = 0
    trainable = 0

    for p in model.parameters():
        num = p.numel()
        total += num
        if p.requires_grad:
            trainable += num

    print("\nParameter statistics:")
    print(f"Total parameters: {total:,}")
    print(f"Trainable parameters: {trainable:,}")
    print(f"Frozen parameters: {total - trainable:,}\n")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default="config/config_train.json")
    args = parser.parse_args()

    config_train = load_json(args.config)
    config_paths = config_train["paths"]
    config_data = load_json(config_paths["config_data_path"])
    config_model = load_json(config_paths["config_model_path"])

    seed_cfg = config_train["seed"]
    runtime_cfg = config_train["runtime"]
    optim_cfg = config_train["optimization"]
    checkpoint_cfg = config_train["checkpoint"]
    logging_cfg = config_train["logging"]
    loss_cfg = config_train["loss"]

    set_seed(int(seed_cfg["seed"]), deterministic=bool(seed_cfg.get("deterministic", True)))

    device = resolve_device(runtime_cfg["device"])
    dtype = get_dtype(config_model.get("dtype", "bfloat16"))

    output_dir = Path(config_paths["output_dir"])
    checkpoint_dir = output_dir / "checkpoints"
    output_dir.mkdir(parents=True, exist_ok=True)
    checkpoint_dir.mkdir(parents=True, exist_ok=True)

    save_json(config_train, str(output_dir / "config_train.snapshot.json"))
    save_json(config_data, str(output_dir / "config_data.snapshot.json"))
    save_json(config_model, str(output_dir / "config_model.snapshot.json"))

    dataset = SoilFormerDataset(
        csv_path=config_data["data_csv_path"],
        photo_map_path=config_data["photo_map_path"],
        cat_vocab_path=config_data["cat_vocab_path"],
        numeric_vocab_path=config_data["numeric_vocab_path"],
        numeric_stats_path=config_data["numeric_stats_path"],
        photo_root=config_data["photo_root"],
        image_size=int(config_data["image_size"]),
    )

    train_loader, eval_loader, train_generator = build_train_eval_dataloaders(
        dataset=dataset,
        train_ratio=float(config_data["train_ratio"]),
        seed=int(config_data["train_eval_split_seed"]),
        batch_size=int(config_data["batch_size"]),
    )
    print("\nSample statistics:")
    print("Train samples:", len(train_loader.dataset))
    print("Eval samples:", len(eval_loader.dataset))
    train_generator.manual_seed(int(seed_cfg["seed"]))

    model = SoilFormer(config=config_model, device=str(device))

    resume_path = checkpoint_cfg.get("resume_checkpoint_path")
    if resume_path:
        checkpoint = torch.load(resume_path, map_location="cpu")
        load_checkpoint_model_state(model, checkpoint["model_state_dict"])
    else:
        model.init_weights(std=float(runtime_cfg.get("init_weight_std", 0.02)))
        checkpoint = None

    print_parameter_stats(model)

    optimizer = AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=float(optim_cfg["lr"]),
        betas=(float(optim_cfg["beta1"]), float(optim_cfg["beta2"])),
        eps=float(optim_cfg["eps"]),
        weight_decay=float(optim_cfg["weight_decay"]),
    )

    scheduler = build_scheduler(
        optimizer=optimizer,
        scheduler_cfg=optim_cfg.get("scheduler", {"type": "none"})
    )

    start_epoch = 1
    global_step = 0

    if checkpoint is not None:
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        if scheduler is not None and checkpoint.get("scheduler_state_dict") is not None:
            scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
        start_epoch = int(checkpoint["epoch"]) + 1
        global_step = int(checkpoint.get("global_step", 0))

    wandb_run = maybe_init_wandb(config_train)

    num_epochs = int(runtime_cfg["num_epochs"])
    show_tqdm = bool(logging_cfg.get("tqdm", True))
    cat_mask_ratio = float(config_data["cat_mask_ratio"])
    num_mask_ratio = float(config_data["num_mask_ratio"])
    active_mask_seed = int(config_data["active_mask_seed"])
    max_grad_norm = optim_cfg.get("max_grad_norm")
    epochs_per_save = int(checkpoint_cfg["epochs_per_save"])
    max_saved_checkpoints = int(checkpoint_cfg["max_saved_checkpoints"])

    for epoch in range(start_epoch, num_epochs + 1):
        model.train()

        epoch_totals = {
            "total": 0.0,
            "cat_loss": 0.0,
            "num_loss": 0.0,
            "cat_base": 0.0,
            "num_base": 0.0,
            "cat_acc": 0.0,
        }
        num_batches = 0

        iterator = train_loader
        if show_tqdm:
            iterator = tqdm(train_loader, desc=f"Train {epoch}", leave=True)

        for batch_idx, raw_batch in enumerate(iterator):
            global_step += 1
            mask_seed = int(active_mask_seed + epoch * 1_000_000 + batch_idx)
            masked_batch = dataset.perform_active_mask(
                raw_batch,
                cat_ratio=cat_mask_ratio,
                num_ratio=num_mask_ratio,
                seed=mask_seed,
            )

            optimizer.zero_grad(set_to_none=True)

            total_loss, stats = compute_loss_from_batch(
                model=model,
                batch=masked_batch,
                device=device,
                dtype=dtype,
                cat_s_bound=loss_cfg.get("cat_s_bound", None),
                num_s_bound=loss_cfg.get("num_s_bound", None),
            )

            total_loss.backward()
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), float(max_grad_norm))
            optimizer.step()

            num_batches += 1
            for key in epoch_totals:
                epoch_totals[key] += float(stats[key].item())

            current_lr = float(optimizer.param_groups[0]["lr"])
            train_step_log = {
                "train/step_total": float(stats["total"].item()),
                "train/step_cat_loss": float(stats["cat_loss"].item()),
                "train/step_num_loss": float(stats["num_loss"].item()),
                "train/step_cat_acc": float(stats["cat_acc"].item()),
                "train/lr": current_lr,
                "epoch": epoch,
                "global_step": global_step,
            }

            if wandb_run is not None:
                wandb.log(train_step_log, step=global_step)

            if show_tqdm:
                iterator.set_postfix(
                    loss=f"{train_step_log['train/step_total']:.4f}",
                    lr=f"{current_lr:.3e}",
                )

        if num_batches == 0:
            raise RuntimeError("Train dataloader is empty")

        train_epoch_log = {f"train/{k}": v / num_batches for k, v in epoch_totals.items()}
        train_epoch_log["train/lr_epoch_end"] = float(optimizer.param_groups[0]["lr"])
        train_epoch_log["epoch"] = epoch
        train_epoch_log["global_step"] = global_step

        eval_log = evaluate(
            model=model,
            dataset=dataset,
            eval_loader=eval_loader,
            device=device,
            dtype=dtype,
            cat_mask_ratio=cat_mask_ratio,
            num_mask_ratio=num_mask_ratio,
            active_mask_seed=active_mask_seed,
            show_tqdm=show_tqdm,
            epoch=epoch,
            cat_s_bound=loss_cfg.get("cat_s_bound", None),
            num_s_bound=loss_cfg.get("num_s_bound", None),
        )
        eval_log["epoch"] = epoch
        eval_log["global_step"] = global_step

        merged_log = {}
        merged_log.update(train_epoch_log)
        merged_log.update(eval_log)

        print(json.dumps(merged_log, ensure_ascii=False))

        if wandb_run is not None:
            wandb.log(merged_log, step=global_step)

        if scheduler is not None:
            scheduler.step()

        if epochs_per_save > 0 and epoch % epochs_per_save == 0:
            checkpoint_path = checkpoint_dir / f"checkpoint_epoch_{epoch}.pt"
            save_checkpoint(
                checkpoint_path=checkpoint_path,
                model=model,
                optimizer=optimizer,
                scheduler=scheduler,
                epoch=epoch,
                global_step=global_step,
                config_train=config_train,
                config_model=config_model,
                config_data=config_data,
            )
            rotate_checkpoints(checkpoint_dir, max_saved_checkpoints)

    if wandb_run is not None:
        wandb.finish()


if __name__ == "__main__":
    main()
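For orientation: `main()` above consumes `config/config_train.json` through seven sections (`paths`, `seed`, `runtime`, `optimization`, `checkpoint`, `logging`, `loss`). A minimal sketch of that structure, written here as a Python dict with placeholder values — the shipped config may set different values and carry extra keys:

# Sketch of the structure main() reads; keys mirror the accesses in train.py,
# values are illustrative placeholders only.
config_train = {
    "paths": {
        "config_data_path": "config/config_data.json",
        "config_model_path": "config/config_model.json",
        "output_dir": "runs/soilformer_pretrain",  # hypothetical
    },
    "seed": {"seed": 42, "deterministic": True},
    "runtime": {"device": "cuda", "num_epochs": 200, "init_weight_std": 0.02},
    "optimization": {
        "lr": 1e-4,
        "beta1": 0.9,
        "beta2": 0.999,
        "eps": 1e-8,
        "weight_decay": 0.01,
        "max_grad_norm": 1.0,  # optional; null/None disables clipping
        "scheduler": {"type": "none"},
    },
    "checkpoint": {
        "resume_checkpoint_path": None,  # set to a .pt path to resume
        "epochs_per_save": 10,
        "max_saved_checkpoints": 3,
    },
    "logging": {
        "tqdm": True,
        "wandb": {"enabled": False, "project": "soilformer"},
    },
    "loss": {"cat_s_bound": None, "num_s_bound": None},
}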
modelling/utils.py
ADDED
@@ -0,0 +1,132 @@

# utils.py
# -*- coding: utf-8 -*-

import json
from typing import Dict
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F  # noqa


class GroupedMLP(nn.Module):
    """
    Batched per-variable MLP for a fixed n_in bucket.

    Input:  X [B, V, n_in]
    Output: Y [B, V, n_out]

    Per-variable weights (NOT shared across V):
    - 1-layer: W  [V, n_out, n_in], b  [V, n_out]
    - 2-layer: W1 [V, mid, n_in],   b1 [V, mid]
               W2 [V, n_out, mid],  b2 [V, n_out]
    """

    def __init__(
        self,
        n_var: int,
        n_in: int,
        n_out: int,
        middle_size: Optional[int] = None,
        bias: bool = True,
    ):
        super().__init__()

        self.n_var = int(n_var)
        self.n_in = int(n_in)
        self.n_out = int(n_out)
        self.middle_size = None if middle_size is None else int(middle_size)
        self.bias = bias

        if self.middle_size is None:
            self.W = nn.Parameter(torch.empty(self.n_var, self.n_out, self.n_in))

            if bias:
                self.b = nn.Parameter(torch.empty(self.n_var, self.n_out))
            else:
                self.register_parameter("b", None)

            self.W1 = self.b1 = self.W2 = self.b2 = None

        else:
            mid = self.middle_size

            self.W1 = nn.Parameter(torch.empty(self.n_var, mid, self.n_in))
            self.W2 = nn.Parameter(torch.empty(self.n_var, self.n_out, mid))

            if bias:
                self.b1 = nn.Parameter(torch.empty(self.n_var, mid))
                self.b2 = nn.Parameter(torch.empty(self.n_var, self.n_out))
            else:
                self.register_parameter("b1", None)
                self.register_parameter("b2", None)

            self.W = self.b = None

    def init_weights(self, std: float = 0.02) -> None:
        """
        Initialize weights manually.
        """
        if self.middle_size is None:
            nn.init.normal_(self.W, std=std)
            if self.bias:
                nn.init.zeros_(self.b)
        else:
            nn.init.normal_(self.W1, std=std)
            nn.init.normal_(self.W2, std=std)

            if self.bias:
                nn.init.zeros_(self.b1)
                nn.init.zeros_(self.b2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if x.dim() != 3:
            raise ValueError(f"Expected x [B,V,n_in], got {tuple(x.shape)}")

        B, V, I = x.shape

        if V != self.n_var or I != self.n_in:
            raise ValueError(
                f"Shape mismatch: expected V={self.n_var}, n_in={self.n_in}; got V={V}, n_in={I}"
            )

        if self.middle_size is None:
            y = torch.einsum("bvi,voi->bvo", x, self.W)
            if self.bias:
                y = y + self.b.unsqueeze(0)
            return y

        h = torch.einsum("bvi,vmi->bvm", x, self.W1)
        if self.bias:
            h = h + self.b1.unsqueeze(0)

        h = F.gelu(h)

        y = torch.einsum("bvm,vom->bvo", h, self.W2)
        if self.bias:
            y = y + self.b2.unsqueeze(0)

        return y


def get_dtype(dtype: Optional[str]) -> torch.dtype:
    dtype_str = (dtype or "bfloat16").lower()
    dtype_map = {
        "bfloat16": torch.bfloat16,
        "float16": torch.float16,
        "float32": torch.float32,
    }
    if dtype_str not in dtype_map:
        raise ValueError(f"Unsupported dtype={dtype}. Choose from {list(dtype_map.keys())}")
    return dtype_map[dtype_str]


def load_json(path: str):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def save_json(obj: Dict, path: str) -> None:
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)  # noqa
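As a quick sanity check of `GroupedMLP`'s per-variable semantics, the sketch below (hypothetical shapes, assuming the repo root is on `PYTHONPATH`; not part of the repo) verifies that the einsum path matches an explicit loop over variables:

import torch
from modelling.utils import GroupedMLP

B, V, n_in, n_out = 2, 5, 8, 3
mlp = GroupedMLP(n_var=V, n_in=n_in, n_out=n_out, middle_size=16)
mlp.init_weights(std=0.02)  # parameters start from torch.empty, so init is required

x = torch.randn(B, V, n_in)
y = mlp(x)
assert y.shape == (B, V, n_out)

# "bvi,vmi->bvm" applies variable v's own weight matrix to its own slice,
# i.e. it is a batched per-variable matmul:
h_loop = torch.stack([x[:, v] @ mlp.W1[v].T + mlp.b1[v] for v in range(V)], dim=1)
h_ein = torch.einsum("bvi,vmi->bvm", x, mlp.W1) + mlp.b1.unsqueeze(0)
assert torch.allclose(h_loop, h_ein, atol=1e-5)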
requirements.txt
ADDED
@@ -0,0 +1,10 @@

torch~=2.10.0
numpy~=2.3.4
wandb~=0.25.1
tqdm~=4.67.1
pandas~=2.3.3
requests~=2.32.5
pillow~=12.0.0
torchvision~=0.25.0
safetensors~=0.7.0
transformers~=5.2.0
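These are compatible-release pins (`~=2.10.0` accepts any 2.10.x patch release but not 2.11), installable with `pip install -r requirements.txt`.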
resources/arch.png
ADDED
Git LFS Details