hadung1802 commited on
Commit
2c5b728
·
1 Parent(s): 8db6f43

Upload ViSoNorm trained model

Browse files
README.md ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ViSoNorm: Vietnamese Text Normalization Model
2
+
3
+ ViSoNorm is a state-of-the-art Vietnamese text normalization model that converts informal, non-standard Vietnamese text into standard Vietnamese. The model uses a multi-task learning approach with NSW (Non-Standard Word) detection, mask prediction, and lexical normalization heads.
4
+
5
+ ## Model Architecture
6
+
7
+ - **Base Model**: ViSoBERT (Vietnamese Social Media BERT)
8
+ - **Multi-task Heads**:
9
+ - NSW Detection: Identifies tokens that need normalization
10
+ - Mask Prediction: Determines how many masks to add for multi-token expansions
11
+ - Lexical Normalization: Predicts normalized tokens
12
+
13
+ ## Features
14
+
15
+ - **Self-contained inference**: Built-in `normalize_text` method
16
+ - **NSW detection**: Built-in `detect_nsw` method for detailed analysis
17
+ - **HuggingFace compatible**: Works seamlessly with `AutoModelForMaskedLM`
18
+ - **Production ready**: No hardcoded patterns, works for any Vietnamese text
19
+ - **Multi-token expansion**: Handles cases like "sv" → "sinh viên", "ctrai" → "con trai"
20
+ - **Confidence scoring**: Provides confidence scores for NSW detection and normalization
21
+
22
+ ## Installation
23
+
24
+ ```bash
25
+ pip install transformers torch
26
+ ```
27
+
28
+ ## Usage
29
+
30
+ ### Basic Usage
31
+
32
+ ```python
33
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
34
+
35
+ # Load model and tokenizer
36
+ model_repo = "hadung1802/visobert-normalizer"
37
+ tokenizer = AutoTokenizer.from_pretrained(model_repo)
38
+ model = AutoModelForMaskedLM.from_pretrained(model_repo, trust_remote_code=True)
39
+
40
+ # Normalize text
41
+ text = "sv dh gia dinh chua cho di lam :))"
42
+ normalized_text, source_tokens, predicted_tokens = model.normalize_text(
43
+ tokenizer, text, device='cpu'
44
+ )
45
+
46
+ print(f"Original: {text}")
47
+ print(f"Normalized: {normalized_text}")
48
+ ```
49
+
50
+ ### NSW Detection
51
+
52
+ ```python
53
+ # Detect Non-Standard Words (NSW) in text
54
+ text = "nhìn thôi cung thấy đau long quá đi :))"
55
+ nsw_results = model.detect_nsw(tokenizer, text, device='cpu')
56
+
57
+ print(f"Text: {text}")
58
+ for result in nsw_results:
59
+ print(f"NSW: '{result['nsw']}' → '{result['prediction']}' (confidence: {result['confidence_score']})")
60
+ ```
61
+
62
+ ### Batch Processing
63
+
64
+ ```python
65
+ texts = [
66
+ "sv dh gia dinh chua cho di lam :))",
67
+ "chúng nó bảo em là ctrai",
68
+ "t vs b chơi vs nhau đã lâu"
69
+ ]
70
+
71
+ for text in texts:
72
+ normalized_text, _, _ = model.normalize_text(tokenizer, text, device='cpu')
73
+ print(f"{text} → {normalized_text}")
74
+ ```
75
+
76
+ ### Expected Output
77
+
78
+ #### Text Normalization
79
+ ```
80
+ sv dh gia dinh chua cho di lam :)) → sinh viên đại học gia đình chưa cho đi làm :))
81
+ chúng nó bảo em là ctrai → chúng nó bảo em là con trai
82
+ t vs b chơi vs nhau đã lâu → tôi với bạn chơi với nhau đã lâu
83
+ ```
84
+
85
+ #### NSW Detection
86
+ ```python
87
+ # Input: "nhìn thôi cung thấy đau long quá đi :))"
88
+ [
89
+ {
90
+ "index": 3,
91
+ "start_index": 10,
92
+ "end_index": 14,
93
+ "nsw": "cung",
94
+ "prediction": "cũng",
95
+ "confidence_score": 0.9415
96
+ },
97
+ {
98
+ "index": 6,
99
+ "start_index": 24,
100
+ "end_index": 28,
101
+ "nsw": "long",
102
+ "prediction": "lòng",
103
+ "confidence_score": 0.7056
104
+ }
105
+ ]
106
+ ```
107
+
108
+ ### NSW Detection Output Format
109
+
110
+ The `detect_nsw` method returns a list of dictionaries with the following structure:
111
+
112
+ - **`index`**: Position of the token in the sequence
113
+ - **`start_index`**: Start character position in the original text
114
+ - **`end_index`**: End character position in the original text
115
+ - **`nsw`**: The original non-standard word (detokenized)
116
+ - **`prediction`**: The predicted normalized word (detokenized)
117
+ - **`confidence_score`**: Combined confidence score (0.0 to 1.0)
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<space>": 15002
3
+ }
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ViSoNormViSoBERTForMaskedLM"
4
+ ],
5
+ "model_type": "xlm-roberta",
6
+ "vocab_size": 15003,
7
+ "pad_token_id": 1,
8
+ "bos_token_id": 0,
9
+ "eos_token_id": 2,
10
+ "mask_token_id": 3,
11
+ "mask_n_predictor": true,
12
+ "nsw_detector": true,
13
+ "auto_map": {
14
+ "AutoModel": "visonorm_visobert_model.ViSoNormViSoBERTForMaskedLM",
15
+ "AutoModelForMaskedLM": "visonorm_visobert_model.ViSoNormViSoBERTForMaskedLM"
16
+ },
17
+ "hidden_size": 768,
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "intermediate_size": 3072,
21
+ "max_position_embeddings": 514,
22
+ "type_vocab_size": 2,
23
+ "initializer_range": 0.02,
24
+ "layer_norm_eps": 1e-12
25
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c56f8433e6e26b0d6411414956a01eee2acc0c0471173b6e74e74a352b84993
3
+ size 393240883
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02aaf05cda4db99e86b7c76eba6258867ce4d043da0fed19c87a7d46c8b53a65
3
+ size 470732
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
state_dict_report.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model": "visobert",
3
+ "total_params": 213,
4
+ "expected_heads_present": {
5
+ "cls_decoder.weight": false,
6
+ "cls_decoder.bias": false,
7
+ "cls_dense.weight": false,
8
+ "cls_dense.bias": false,
9
+ "cls_layer_norm.weight": false,
10
+ "cls_layer_norm.bias": false,
11
+ "mask_n_predictor.mask_predictor_dense.weight": true,
12
+ "mask_n_predictor.mask_predictor_dense.bias": true,
13
+ "mask_n_predictor.mask_predictor_proj.weight": true,
14
+ "mask_n_predictor.mask_predictor_proj.bias": true,
15
+ "nsw_detector.dense.weight": true,
16
+ "nsw_detector.dense.bias": true,
17
+ "nsw_detector.predictor.weight": true,
18
+ "nsw_detector.predictor.bias": true
19
+ },
20
+ "alt_common_heads_present": {
21
+ "lm_head.weight": false,
22
+ "lm_head.bias": false,
23
+ "cls.decoder.weight": true,
24
+ "cls.decoder.bias": true,
25
+ "cls.dense.weight": true,
26
+ "cls.dense.bias": true,
27
+ "cls.layer_norm.weight": true,
28
+ "cls.layer_norm.bias": true
29
+ },
30
+ "aux_heads_present": {
31
+ "nsw_detector.": true,
32
+ "mask_n_predictor.": true
33
+ },
34
+ "example_keys": [
35
+ "roberta.embeddings.word_embeddings.weight",
36
+ "roberta.embeddings.position_embeddings.weight",
37
+ "roberta.embeddings.token_type_embeddings.weight",
38
+ "roberta.embeddings.LayerNorm.weight",
39
+ "roberta.embeddings.LayerNorm.bias",
40
+ "roberta.encoder.layer.0.attention.self.query.weight",
41
+ "roberta.encoder.layer.0.attention.self.query.bias",
42
+ "roberta.encoder.layer.0.attention.self.key.weight",
43
+ "roberta.encoder.layer.0.attention.self.key.bias",
44
+ "roberta.encoder.layer.0.attention.self.value.weight",
45
+ "roberta.encoder.layer.0.attention.self.value.bias",
46
+ "roberta.encoder.layer.0.attention.output.dense.weight",
47
+ "roberta.encoder.layer.0.attention.output.dense.bias",
48
+ "roberta.encoder.layer.0.attention.output.LayerNorm.weight",
49
+ "roberta.encoder.layer.0.attention.output.LayerNorm.bias",
50
+ "roberta.encoder.layer.0.intermediate.dense.weight",
51
+ "roberta.encoder.layer.0.intermediate.dense.bias",
52
+ "roberta.encoder.layer.0.output.dense.weight",
53
+ "roberta.encoder.layer.0.output.dense.bias",
54
+ "roberta.encoder.layer.0.output.LayerNorm.weight"
55
+ ]
56
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_class": "XLMRobertaTokenizer",
3
+ "model_max_length": 512,
4
+ "padding_side": "right",
5
+ "truncation_side": "right",
6
+ "pad_token": "<pad>",
7
+ "bos_token": "<s>",
8
+ "eos_token": "</s>",
9
+ "unk_token": "<unk>",
10
+ "mask_token": "<mask>",
11
+ "additional_special_tokens": [
12
+ "<pad>",
13
+ "<s>",
14
+ "</s>",
15
+ "<unk>",
16
+ "<mask>"
17
+ ],
18
+ "use_fast": false
19
+ }
training_args.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model": "visobert",
3
+ "training_mode": "weakly_supervised",
4
+ "learning_rate": 0.001,
5
+ "num_epochs": 10,
6
+ "train_batch_size": 16,
7
+ "eval_batch_size": 128,
8
+ "remove_accents": false,
9
+ "lower_case": false
10
+ }
visonorm_visobert_model.py ADDED
@@ -0,0 +1,557 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Custom ViSoNorm model class for ViSoBERT-based models.
4
+ This preserves the custom heads needed for text normalization and
5
+ is loadable via auto_map without custom model_type.
6
+ """
7
+
8
+ import math
9
+ import torch
10
+ import torch.nn as nn
11
+ from transformers import XLMRobertaModel, XLMRobertaConfig, XLMRobertaPreTrainedModel
12
+ from transformers.modeling_outputs import MaskedLMOutput
13
+ # Define constants locally to avoid external dependencies
14
+ NUM_LABELS_N_MASKS = 5
15
+
16
+
17
def gelu(x):
    """Exact (erf-based) GELU activation, as used in the original BERT."""
    return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
19
+
20
+
21
class XLMRobertaLMHead(nn.Module):
    """Masked-LM head whose decoder weights are tied to the input embeddings.

    All layer sizes are derived from the embedding matrix itself (not from
    ``config``) so the head always lines up with the loaded checkpoint.
    """

    def __init__(self, config, xlmroberta_model_embedding_weights):
        super().__init__()
        # shape: (vocab_size, hidden_dim) — take sizes from the real weights.
        num_labels, actual_hidden_size = (
            xlmroberta_model_embedding_weights.size(0),
            xlmroberta_model_embedding_weights.size(1),
        )
        self.dense = nn.Linear(actual_hidden_size, actual_hidden_size)
        self.layer_norm = nn.LayerNorm(actual_hidden_size, eps=1e-12)

        self.decoder = nn.Linear(actual_hidden_size, num_labels, bias=False)
        # Weight tying: decoder shares the embedding parameter object.
        self.decoder.weight = xlmroberta_model_embedding_weights
        self.decoder.bias = nn.Parameter(torch.zeros(num_labels))

    def forward(self, features):
        hidden = self.layer_norm(gelu(self.dense(features)))
        return self.decoder(hidden)
40
+
41
+
42
class XLMRobertaMaskNPredictionHead(nn.Module):
    """Head predicting how many ``<mask>`` tokens to insert at each position.

    Emits one logit per class in ``NUM_LABELS_N_MASKS`` for every token of
    the encoder output.
    """

    def __init__(self, config, actual_hidden_size):
        super().__init__()
        self.mask_predictor_dense = nn.Linear(actual_hidden_size, 50)
        self.mask_predictor_proj = nn.Linear(50, NUM_LABELS_N_MASKS)
        self.activation = gelu

    def forward(self, sequence_output):
        hidden = self.activation(self.mask_predictor_dense(sequence_output))
        return self.mask_predictor_proj(hidden)
53
+
54
+
55
class XLMRobertaBinaryPredictor(nn.Module):
    """Two-class token classifier used as the NSW (non-standard word) detector."""

    def __init__(self, hidden_size, dense_dim=100):
        super().__init__()
        self.dense = nn.Linear(hidden_size, dense_dim)
        # Named 'predictor' to match the checkpoint parameter names.
        self.predictor = nn.Linear(dense_dim, 2)
        self.activation = gelu

    def forward(self, sequence_output):
        return self.predictor(self.activation(self.dense(sequence_output)))
67
+
68
+
69
class ViSoNormViSoBERTForMaskedLM(XLMRobertaPreTrainedModel):
    """ViSoBERT-based masked-LM model with ViSoNorm's auxiliary heads.

    On top of the XLM-RoBERTa encoder, three heads run over every token
    position:

    * ``cls``              -- lexical normalization (masked-LM) logits
    * ``mask_n_predictor`` -- how many ``<mask>`` tokens to insert
    * ``nsw_detector``     -- binary non-standard-word (NSW) classifier

    The model also ships self-contained inference helpers
    (``normalize_text``, ``detect_nsw``) so it can be used directly after
    ``AutoModelForMaskedLM.from_pretrained(..., trust_remote_code=True)``.
    """

    config_class = XLMRobertaConfig

    class _ViSoNormOutput:
        """Lightweight output container for the ``return_dict`` path of forward.

        ``logits`` aliases ``logits_norm`` so generic MaskedLM consumers keep
        working.  (Hoisted out of ``forward`` — the original re-created this
        class on every call.)
        """

        def __init__(self, logits_norm, logits_n_masks_pred, logits_nsw_detection,
                     hidden_states=None, attentions=None):
            self.logits = logits_norm
            self.logits_norm = logits_norm
            self.logits_n_masks_pred = logits_n_masks_pred
            self.logits_nsw_detection = logits_nsw_detection
            self.hidden_states = hidden_states
            self.attentions = attentions

    def __init__(self, config: XLMRobertaConfig):
        super().__init__(config)
        self.roberta = XLMRobertaModel(config)

        # Derive the hidden size from the actual embedding matrix so the
        # heads always match the loaded checkpoint, even if config drifts.
        actual_hidden_size = self.roberta.embeddings.word_embeddings.weight.size(1)

        # ViSoNorm normalization head - use exact same structure as training
        # (decoder tied to the input embeddings).
        self.cls = XLMRobertaLMHead(config, self.roberta.embeddings.word_embeddings.weight)

        # Additional heads for ViSoNorm functionality.
        self.mask_n_predictor = XLMRobertaMaskNPredictionHead(config, actual_hidden_size)
        self.nsw_detector = XLMRobertaBinaryPredictor(actual_hidden_size, dense_dim=100)
        self.num_labels_n_mask = NUM_LABELS_N_MASKS

        # Initialize per HF conventions.
        self.post_init()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run the encoder and all three ViSoNorm heads.

        ``labels`` is accepted for interface compatibility; no loss is
        computed here.  Returns a tuple ``(logits_norm, logits_n_masks_pred,
        logits_nsw_detection, *encoder_extras)`` when ``return_dict`` is
        falsy, otherwise a ``_ViSoNormOutput``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # All three prediction heads share the final hidden states.
        logits_norm = self.cls(sequence_output)
        logits_n_masks_pred = self.mask_n_predictor(sequence_output)
        logits_nsw_detection = self.nsw_detector(sequence_output)

        if not return_dict:
            return (logits_norm, logits_n_masks_pred, logits_nsw_detection) + outputs[1:]

        return self._ViSoNormOutput(
            logits_norm=logits_norm,
            logits_n_masks_pred=logits_n_masks_pred,
            logits_nsw_detection=logits_nsw_detection,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def normalize_text(self, tokenizer, text, device='cpu'):
        """
        Normalize text using the ViSoNorm ViSoBERT model with NSW detection
        and greedy mask insertion.

        Args:
            tokenizer: HuggingFace tokenizer
            text: Input text to normalize
            device: Device to run inference on

        Returns:
            Tuple of (normalized_text, source_tokens, prediction_tokens)
        """
        # Move model to device
        self.to(device)

        # Step 1: Preprocess text exactly like training data.
        # Tokenize the input text into tokens (not IDs yet).
        input_tokens = tokenizer.tokenize(text)

        # Add special tokens like in training.
        input_tokens = ['<s>'] + input_tokens + ['</s>']

        # Convert tokens to IDs
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        input_tokens_tensor = torch.LongTensor([input_ids]).to(device)

        # Step 2: Apply the same truncation and masking logic as training.
        input_tokens_tensor, _, token_type_ids, input_mask = self._truncate_and_build_masks(input_tokens_tensor)

        # Step 3: Get all three prediction heads from the model.
        self.eval()
        with torch.no_grad():
            if hasattr(self, 'roberta'):
                # BUG FIX: the original call passed these positionally, which
                # routed the all-zero ``token_type_ids`` into ``attention_mask``
                # (masking every token).  Keyword arguments route them correctly.
                outputs = self(
                    input_ids=input_tokens_tensor,
                    attention_mask=input_mask,
                    token_type_ids=token_type_ids,
                )
            else:
                outputs = self(input_ids=input_tokens_tensor, attention_mask=input_mask)

        # Step 4: Use NSW detector to identify tokens that need normalization.
        tokens = tokenizer.convert_ids_to_tokens(input_tokens_tensor[0])

        if hasattr(outputs, 'logits_nsw_detection') and outputs.logits_nsw_detection is not None:
            # Handle different output shapes.
            if outputs.logits_nsw_detection.dim() == 3:  # (batch, seq_len, 2) - binary classification
                nsw_predictions = torch.argmax(outputs.logits_nsw_detection[0], dim=-1) == 1
            else:  # (batch, seq_len) - single output
                nsw_predictions = torch.sigmoid(outputs.logits_nsw_detection[0]) > 0.5

            tokens_need_norm = []
            for i, token in enumerate(tokens):
                # Special tokens are never normalized.
                if token in ['<s>', '</s>', '<pad>', '<unk>', '<mask>']:
                    tokens_need_norm.append(False)
                else:
                    if i < len(nsw_predictions):
                        tokens_need_norm.append(nsw_predictions[i].item())
                    else:
                        tokens_need_norm.append(False)
        else:
            # Fallback: assume all non-special tokens need checking.
            tokens_need_norm = [token not in ['<s>', '</s>', '<pad>', '<unk>', '<mask>'] for token in tokens]

        # Step 5: Greedy 0/1-mask selection per NSW position — test whether
        # inserting one <mask> after the position improves an approximate
        # sequence likelihood.

        def _score_sequence(input_ids_tensor: torch.Tensor) -> float:
            """Approximate sequence likelihood: mean of per-position max log-prob."""
            with torch.no_grad():
                scored = self(input_ids=input_ids_tensor, attention_mask=torch.ones_like(input_ids_tensor))
                logits = scored.logits_norm if hasattr(scored, 'logits_norm') else scored.logits
                log_probs = torch.log_softmax(logits[0], dim=-1)
                position_scores, _ = torch.max(log_probs, dim=-1)
                return float(position_scores.mean().item())

        mask_token_id = tokenizer.convert_tokens_to_ids('<mask>')
        working_ids = input_tokens_tensor[0].detach().clone().cpu().tolist()
        nsw_indices = [i for i, need in enumerate(tokens_need_norm) if need]

        # ``offset`` tracks how many masks were already inserted before ``i``.
        offset = 0
        for i in nsw_indices:
            pos = i + offset
            # Candidate A: no mask
            cand_a = working_ids
            score_a = _score_sequence(torch.tensor([cand_a], device=device))
            # Candidate B: add one mask after pos
            cand_b = working_ids[:pos+1] + [mask_token_id] + working_ids[pos+1:]
            score_b = _score_sequence(torch.tensor([cand_b], device=device))
            if score_b > score_a:
                working_ids = cand_b
                offset += 1

        # Step 6: Final prediction on the chosen masked sequence (may be unchanged).
        masked_input_ids = torch.tensor([working_ids], device=device)
        with torch.no_grad():
            final_outputs = self(input_ids=masked_input_ids, attention_mask=torch.ones_like(masked_input_ids))
            logits_final = final_outputs.logits_norm if hasattr(final_outputs, 'logits_norm') else final_outputs.logits
            pred_ids = torch.argmax(logits_final, dim=-1)[0].cpu().tolist()

        # Step 7: Build final token ids from predictions; keep originals at specials.
        final_tokens = []
        for idx, src_id in enumerate(working_ids):
            tok = tokenizer.convert_ids_to_tokens([src_id])[0]
            if tok in ['<s>', '</s>', '<pad>', '<unk>']:
                final_tokens.append(src_id)
            else:
                final_tokens.append(pred_ids[idx] if idx < len(pred_ids) else src_id)

        # Step 8: Convert to final text.
        def remove_special_tokens(token_list):
            special_tokens = ['<s>', '</s>', '<pad>', '<unk>', '<mask>', '<space>']
            return [token for token in token_list if token not in special_tokens]

        def _safe_ids_to_text(token_ids):
            """Best-effort id -> text; returns '' rather than raising."""
            if not token_ids:
                return ""
            try:
                tokens = tokenizer.convert_ids_to_tokens(token_ids)
                cleaned = remove_special_tokens(tokens)
                if not cleaned:
                    return ""
                return tokenizer.convert_tokens_to_string(cleaned)
            except Exception:
                return ""

        # Build final normalized text.
        final_tokens = [tid for tid in final_tokens if tid != -1]
        pred_str = _safe_ids_to_text(final_tokens)
        # Collapse repeated whitespace.
        if pred_str:
            pred_str = ' '.join(pred_str.split())

        # Also return token lists for optional inspection.
        decoded_source = tokenizer.convert_ids_to_tokens(working_ids)
        decoded_pred = tokenizer.convert_ids_to_tokens(final_tokens)

        return pred_str, decoded_source, decoded_pred

    def detect_nsw(self, tokenizer, text, device='cpu'):
        """
        Detect Non-Standard Words (NSW) in text and return detailed information.
        This method aligns with normalize_text to ensure consistent NSW detection.

        Args:
            tokenizer: HuggingFace tokenizer
            text: Input text to analyze
            device: Device to run inference on

        Returns:
            List of dictionaries containing NSW information:
            [{'index': int, 'start_index': int, 'end_index': int, 'nsw': str,
              'prediction': str, 'confidence_score': float}, ...]
        """
        # Move model to device
        self.to(device)

        # Step 1: Preprocess text exactly like normalize_text.
        input_tokens = tokenizer.tokenize(text)
        input_tokens = ['<s>'] + input_tokens + ['</s>']
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        input_tokens_tensor = torch.LongTensor([input_ids]).to(device)

        # Step 2: Apply the same truncation and masking logic as normalize_text.
        input_tokens_tensor, _, token_type_ids, input_mask = self._truncate_and_build_masks(input_tokens_tensor)

        # Step 3: Get all three prediction heads (same as normalize_text).
        self.eval()
        with torch.no_grad():
            if hasattr(self, 'roberta'):
                # BUG FIX: keyword arguments — the original positional call
                # sent ``token_type_ids`` into the ``attention_mask`` slot.
                outputs = self(
                    input_ids=input_tokens_tensor,
                    attention_mask=input_mask,
                    token_type_ids=token_type_ids,
                )
            else:
                outputs = self(input_ids=input_tokens_tensor, attention_mask=input_mask)

        # Step 4: NSW detection (same logic as normalize_text), plus per-token
        # confidence of the positive ("needs normalization") class.
        tokens = tokenizer.convert_ids_to_tokens(input_tokens_tensor[0])

        if hasattr(outputs, 'logits_nsw_detection') and outputs.logits_nsw_detection is not None:
            if outputs.logits_nsw_detection.dim() == 3:  # (batch, seq_len, 2) - binary classification
                nsw_predictions = torch.argmax(outputs.logits_nsw_detection[0], dim=-1) == 1
                nsw_confidence = torch.softmax(outputs.logits_nsw_detection[0], dim=-1)[:, 1]
            else:  # (batch, seq_len) - single output
                nsw_predictions = torch.sigmoid(outputs.logits_nsw_detection[0]) > 0.5
                nsw_confidence = torch.sigmoid(outputs.logits_nsw_detection[0])

            tokens_need_norm = []
            for i, token in enumerate(tokens):
                if token in ['<s>', '</s>', '<pad>', '<unk>', '<mask>']:
                    tokens_need_norm.append(False)
                else:
                    if i < len(nsw_predictions):
                        tokens_need_norm.append(nsw_predictions[i].item())
                    else:
                        tokens_need_norm.append(False)
        else:
            # Fallback: assume all non-special tokens need checking.
            tokens_need_norm = [token not in ['<s>', '</s>', '<pad>', '<unk>', '<mask>'] for token in tokens]

        # Step 5: Apply the same greedy masking strategy as normalize_text.
        def _score_sequence(input_ids_tensor: torch.Tensor) -> float:
            """Approximate sequence likelihood: mean of per-position max log-prob."""
            with torch.no_grad():
                scored = self(input_ids=input_ids_tensor, attention_mask=torch.ones_like(input_ids_tensor))
                logits = scored.logits_norm if hasattr(scored, 'logits_norm') else scored.logits
                log_probs = torch.log_softmax(logits[0], dim=-1)
                position_scores, _ = torch.max(log_probs, dim=-1)
                return float(position_scores.mean().item())

        mask_token_id = tokenizer.convert_tokens_to_ids('<mask>')
        working_ids = input_tokens_tensor[0].detach().clone().cpu().tolist()
        nsw_indices = [i for i, need in enumerate(tokens_need_norm) if need]

        offset = 0
        for i in nsw_indices:
            pos = i + offset
            # Candidate A: no mask
            cand_a = working_ids
            score_a = _score_sequence(torch.tensor([cand_a], device=device))
            # Candidate B: add one mask after pos
            cand_b = working_ids[:pos+1] + [mask_token_id] + working_ids[pos+1:]
            score_b = _score_sequence(torch.tensor([cand_b], device=device))
            if score_b > score_a:
                working_ids = cand_b
                offset += 1

        # Step 6: Get final predictions using the same masked sequence.
        masked_input_ids = torch.tensor([working_ids], device=device)
        with torch.no_grad():
            final_outputs = self(input_ids=masked_input_ids, attention_mask=torch.ones_like(masked_input_ids))
            logits_final = final_outputs.logits_norm if hasattr(final_outputs, 'logits_norm') else final_outputs.logits
            pred_ids = torch.argmax(logits_final, dim=-1)[0].cpu().tolist()

        # Step 7: Build results by comparing source vs predicted tokens, so we
        # catch all tokens that were actually changed, not just those flagged
        # by the NSW head.
        nsw_results = []

        # Final token ids: predictions at normal positions, originals at specials
        # (same as normalize_text).
        final_tokens = []
        for idx, src_id in enumerate(working_ids):
            tok = tokenizer.convert_ids_to_tokens([src_id])[0]
            if tok in ['<s>', '</s>', '<pad>', '<unk>']:
                final_tokens.append(src_id)
            else:
                final_tokens.append(pred_ids[idx] if idx < len(pred_ids) else src_id)

        # NOTE: the original also rebuilt the full normalized text and
        # re-tokenized both texts here; those values were never used, so that
        # dead code has been removed.
        decoded_source = tokenizer.convert_ids_to_tokens(working_ids)
        decoded_pred = tokenizer.convert_ids_to_tokens(final_tokens)

        # Clean the tokens (drop special tokens, strip the sentencepiece ▁ prefix).
        def clean_token(token):
            if token in ['<s>', '</s>', '<pad>', '<unk>', '<mask>']:
                return None
            return token.strip().lstrip('▁')

        # Group consecutive predictions that form expansions.
        i = 0
        while i < len(decoded_source):
            src_token = decoded_source[i]
            clean_src = clean_token(src_token)

            if clean_src is None:
                i += 1
                continue

            # Check if this token was changed.
            pred_token = decoded_pred[i]
            clean_pred = clean_token(pred_token)

            if clean_pred is None:
                i += 1
                continue

            if clean_src != clean_pred:
                # This is an NSW token - check if it's part of an expansion.
                expansion_tokens = [clean_pred]
                j = i + 1

                # Look for consecutive mask tokens that were filled.
                while j < len(decoded_source) and j < len(decoded_pred):
                    next_src = decoded_source[j]
                    next_pred = decoded_pred[j]

                    # If the source is a mask token, it was added for expansion.
                    if next_src == '<mask>':
                        clean_next_pred = clean_token(next_pred)
                        if clean_next_pred is not None:
                            expansion_tokens.append(clean_next_pred)
                        j += 1
                    else:
                        clean_next_src = clean_token(next_src)
                        clean_next_pred = clean_token(next_pred)

                        if clean_next_src is not None and clean_next_pred is not None and clean_next_src != clean_next_pred:
                            # Adjacent changed token: be conservative and only
                            # group mask-based expansions, not unrelated edits.
                            break
                        else:
                            break

                expansion_text = ' '.join(expansion_tokens)

                # Character span of the NSW in the original text (best effort:
                # first occurrence; -1 if the cleaned token is not found).
                start_idx = text.find(clean_src)
                end_idx = start_idx + len(clean_src) if start_idx != -1 else len(clean_src)

                # Confidence = mean of NSW-detector confidence and the
                # normalizer's probability for the predicted token.
                if hasattr(outputs, 'logits_nsw_detection') and outputs.logits_nsw_detection is not None:
                    # Find the corresponding position in the original token list.
                    orig_pos = None
                    for k, tok in enumerate(tokens):
                        if tok.strip().lstrip('▁') == clean_src:
                            orig_pos = k
                            break

                    if orig_pos is not None and orig_pos < len(nsw_confidence):
                        nsw_conf = nsw_confidence[orig_pos].item()
                    else:
                        nsw_conf = 0.5  # Default if position not found

                    # Normalization confidence from the final masked logits.
                    norm_logits = logits_final[0]
                    norm_confidence = torch.softmax(norm_logits, dim=-1)
                    norm_conf = norm_confidence[i][final_tokens[i]].item()
                    combined_confidence = (nsw_conf + norm_conf) / 2
                else:
                    combined_confidence = 0.5  # Default confidence if no NSW detector

                nsw_results.append({
                    'index': i,
                    'start_index': start_idx,
                    'end_index': end_idx,
                    'nsw': clean_src,
                    'prediction': expansion_text,
                    'confidence_score': round(combined_confidence, 4)
                })

                # Move to the next unprocessed token.
                i = j
            else:
                i += 1

        return nsw_results

    def _truncate_and_build_masks(self, input_tokens_tensor, output_tokens_tensor=None):
        """Apply the same truncation and masking logic as training.

        Returns ``(input_tokens_tensor, output_tokens_tensor, token_type_ids,
        input_mask)``; ``token_type_ids`` is ``None`` for non-roberta models.
        """
        if hasattr(self, 'roberta'):
            # Effective max length: the smaller of config and the actual
            # position-embedding table, minus 2 for the special tokens.
            cfg_max = int(getattr(self.roberta.config, 'max_position_embeddings', input_tokens_tensor.size(1)))
            tbl_max = int(getattr(self.roberta.embeddings.position_embeddings, 'num_embeddings', cfg_max))
            max_pos = min(cfg_max, tbl_max)
            eff_max = max(1, max_pos - 2)
            if input_tokens_tensor.size(1) > eff_max:
                input_tokens_tensor = input_tokens_tensor[:, :eff_max]
            if output_tokens_tensor is not None and output_tokens_tensor.dim() == 2 and output_tokens_tensor.size(1) > eff_max:
                output_tokens_tensor = output_tokens_tensor[:, :eff_max]
            # Resolve the pad id from config, then the embedding padding_idx,
            # then fall back to the XLM-R default of 1.
            pad_id_model = getattr(self.roberta.config, 'pad_token_id', None)
            if pad_id_model is None:
                pad_id_model = getattr(self.roberta.embeddings.word_embeddings, 'padding_idx', None)
            if pad_id_model is None:
                pad_id_model = 1  # Default pad token ID
            input_mask = (input_tokens_tensor != pad_id_model).long()
            token_type_ids = torch.zeros_like(input_tokens_tensor)
            return input_tokens_tensor, output_tokens_tensor, token_type_ids, input_mask
        # bart branch: no token types; attend to everything.
        input_mask = torch.ones_like(input_tokens_tensor)
        token_type_ids = None
        return input_tokens_tensor, output_tokens_tensor, token_type_ids, input_mask
555
+
556
+
557
# Explicit public API: only the model class is meant to be imported from here.
__all__ = ["ViSoNormViSoBERTForMaskedLM"]