rtferraz
/

domainTokenizer

Model card Files Files and versions

xet

Community

rtferraz committed 8 days ago

Commit

1a9dad0

verified ·

1 Parent(s): 0c1ca58

Add schema.py — DomainSchema, FieldSpec, FieldType definitions

Browse files

Files changed (1) hide show

src/domain_tokenizer/schema.py +180 -0

src/domain_tokenizer/schema.py ADDED Viewed

	@@ -0,0 +1,180 @@

+"""
+Domain Schema Definition for domainTokenizer.
+A declarative format that describes the fields in a domain's event data.
+Each field type maps to a specific tokenization strategy.
+References:
+  - Nubank nuFormer (arXiv:2507.23267): amount sign(2) + amount bucket(21) + calendar(74) + BPE text
+  - ActionPiece (arXiv:2502.13581): unordered feature sets tokenized via BPE-like merging
+  - Banking TF (arXiv:2410.08243): date + amount + wording composite tokens
+"""
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Dict, List, Optional, Any
class FieldType(Enum):
    """Kinds of event fields the tokenizer knows how to handle.

    The member chosen for a field selects its tokenization strategy:

      SIGN                 -> 2 tokens (positive/negative, credit/debit)
      NUMERICAL_CONTINUOUS -> quantile-based magnitude bins (default: 21, following Nubank)
      NUMERICAL_DISCRETE   -> small fixed vocabulary for countable values (quantities, counts)
      CATEGORICAL_FIXED    -> direct vocabulary mapping from a known set of categories
      TEMPORAL             -> calendar decomposition into month/dow/dom/hour tokens
      TEXT                 -> BPE subword tokenization (descriptions, names, free text)

    Each member's value is its lowercase string identifier.
    """

    # Two-way polarity marker.
    SIGN = "sign"

    # Numeric fields: binned magnitudes vs. small countable ranges.
    NUMERICAL_CONTINUOUS = "numerical_continuous"
    NUMERICAL_DISCRETE = "numerical_discrete"

    # Closed set of known category labels.
    CATEGORICAL_FIXED = "categorical_fixed"

    # Timestamps decomposed into calendar components.
    TEMPORAL = "temporal"

    # Free text handled by subword tokenization.
    TEXT = "text"
# Vocabulary size of each supported calendar component: the key is the
# component name, the value is how many distinct tokens that component needs.
CALENDAR_FIELD_SIZES: Dict[str, int] = dict(
    month=12,
    dow=7,          # day of week
    dom=31,         # day of month
    hour=24,
    quarter=4,
    minute_bin=4,   # 15-minute bins: 0-14, 15-29, 30-44, 45-59
)
@dataclass
class FieldSpec:
    """Specification for a single field in a domain event.

    Args:
        name: Field name (must match the key in event dictionaries).
        field_type: How this field should be tokenized.
        prefix: Token prefix override. Defaults to the uppercase field name.
        n_bins: Number of quantile bins for NUMERICAL_CONTINUOUS (default: 21, Nubank).
            Must be at least 1 for NUMERICAL_CONTINUOUS fields.
        categories: List of category values for CATEGORICAL_FIXED.
        vocab_size: Explicit vocab size for NUMERICAL_DISCRETE (e.g., 11 for quantities 0-10).
            NOTE: no method of this class reads it; ``token_count`` is derived
            from ``max_value`` only.
        calendar_fields: Which calendar components to extract for TEMPORAL fields.
            For TEMPORAL fields every entry must be a key of ``CALENDAR_FIELD_SIZES``.
        max_value: Upper bound for NUMERICAL_DISCRETE (tokens: 0..max_value, max_value+).
            Must be non-negative.

    Raises:
        ValueError: If CATEGORICAL_FIXED is given without ``categories``, if
            NUMERICAL_DISCRETE is given without ``max_value`` or with a negative
            one, if NUMERICAL_CONTINUOUS has ``n_bins`` below 1, or if a TEMPORAL
            field lists a calendar component missing from ``CALENDAR_FIELD_SIZES``.
    """

    name: str
    field_type: FieldType
    prefix: Optional[str] = None
    n_bins: int = 21
    categories: Optional[List[str]] = None
    vocab_size: Optional[int] = None
    calendar_fields: List[str] = field(default_factory=lambda: ["month", "dow", "dom", "hour"])
    max_value: Optional[int] = None

    def __post_init__(self):
        """Fill in the default prefix and validate type-specific settings."""
        if self.prefix is None:
            self.prefix = self.name.upper()

        if self.field_type == FieldType.CATEGORICAL_FIXED and self.categories is None:
            raise ValueError(f"Field '{self.name}': CATEGORICAL_FIXED requires 'categories' list")

        if self.field_type == FieldType.NUMERICAL_DISCRETE:
            if self.max_value is None:
                raise ValueError(f"Field '{self.name}': NUMERICAL_DISCRETE requires 'max_value'")
            # A negative bound would make token_count (max_value + 2) zero or negative.
            if self.max_value < 0:
                raise ValueError(
                    f"Field '{self.name}': NUMERICAL_DISCRETE requires 'max_value' >= 0, "
                    f"got {self.max_value}"
                )

        # token_count returns n_bins directly, so it must describe at least one bin.
        if self.field_type == FieldType.NUMERICAL_CONTINUOUS and self.n_bins < 1:
            raise ValueError(
                f"Field '{self.name}': NUMERICAL_CONTINUOUS requires 'n_bins' >= 1, "
                f"got {self.n_bins}"
            )

        if self.field_type == FieldType.TEMPORAL:
            # An unknown component would add 0 to token_count while still being
            # counted by tokens_per_event, so reject it here instead.
            unknown = [cf for cf in self.calendar_fields if cf not in CALENDAR_FIELD_SIZES]
            if unknown:
                raise ValueError(
                    f"Field '{self.name}': unknown calendar_fields {unknown}; "
                    f"expected a subset of {sorted(CALENDAR_FIELD_SIZES)}"
                )

    @property
    def token_count(self) -> int:
        """Number of special tokens this field produces in the vocabulary."""
        if self.field_type == FieldType.SIGN:
            return 2
        if self.field_type == FieldType.NUMERICAL_CONTINUOUS:
            return self.n_bins
        if self.field_type == FieldType.NUMERICAL_DISCRETE:
            return self.max_value + 2  # 0..max_value + overflow bin
        if self.field_type == FieldType.CATEGORICAL_FIXED:
            return len(self.categories) + 1  # +1 for unknown
        if self.field_type == FieldType.TEMPORAL:
            # Names are validated at construction; .get keeps this property
            # non-raising if calendar_fields is mutated afterwards.
            return sum(CALENDAR_FIELD_SIZES.get(cf, 0) for cf in self.calendar_fields)
        # TEXT tokens come from BPE, not the special vocab; anything else adds nothing.
        return 0

    @property
    def tokens_per_event(self) -> int:
        """Number of tokens this field contributes per event (fixed part only)."""
        if self.field_type in (
            FieldType.SIGN,
            FieldType.NUMERICAL_CONTINUOUS,
            FieldType.NUMERICAL_DISCRETE,
            FieldType.CATEGORICAL_FIXED,
        ):
            return 1
        if self.field_type == FieldType.TEMPORAL:
            return len(self.calendar_fields)  # one token per calendar component
        # TEXT is variable length, so it has no fixed contribution.
        return 0
@dataclass
class DomainSchema:
    """Ordered description of the fields that make up one domain event.

    An event (transaction, purchase, clinical encounter, etc.) is described
    by a list of field specifications. Their order in ``fields`` is the order
    in which their tokens appear within each event.

    Args:
        name: Human-readable domain name (e.g., "finance", "ecommerce").
        fields: Ordered list of field specifications.
        event_separator: Special token to separate events in a sequence.
        description: Optional human-readable description.
    """

    name: str
    fields: List[FieldSpec]
    event_separator: str = "[SEP_EVENT]"
    description: str = ""

    @property
    def special_token_count(self) -> int:
        """Total number of domain-specific special tokens needed."""
        # Eight base tokens: PAD, UNK, BOS, EOS, MASK, CLS, SEP, event separator.
        per_field_total = 0
        for spec in self.fields:
            per_field_total += spec.token_count
        return 8 + per_field_total

    @property
    def fixed_tokens_per_event(self) -> int:
        """Number of fixed (non-text) tokens per event, including separator."""
        # The sum starts at 1 to account for the event separator token.
        return sum((spec.tokens_per_event for spec in self.fields), 1)

    @property
    def has_text_fields(self) -> bool:
        """Whether the schema includes any free-text fields."""
        for spec in self.fields:
            if spec.field_type == FieldType.TEXT:
                return True
        return False

    @property
    def text_field_names(self) -> List[str]:
        """Names of all text fields in the schema."""
        names: List[str] = []
        for spec in self.fields:
            if spec.field_type == FieldType.TEXT:
                names.append(spec.name)
        return names

    @property
    def fittable_field_names(self) -> List[str]:
        """Names of fields that require fitting on training data."""
        names: List[str] = []
        for spec in self.fields:
            if spec.field_type == FieldType.NUMERICAL_CONTINUOUS:
                names.append(spec.name)
        return names

    def get_field(self, name: str) -> Optional[FieldSpec]:
        """Return the first field whose name equals ``name``, or None if absent."""
        return next((spec for spec in self.fields if spec.name == name), None)

    def summary(self) -> str:
        """Human-readable summary of the schema."""
        report = [f"DomainSchema: {self.name}"]
        if self.description:
            report.append(f"  {self.description}")
        report.extend(
            [
                f"  Fields: {len(self.fields)}",
                f"  Special tokens: {self.special_token_count}",
                f"  Fixed tokens/event: {self.fixed_tokens_per_event}",
                f"  Has text fields: {self.has_text_fields}",
                f"  Requires fitting: {self.fittable_field_names}",
                "",
            ]
        )
        # One detail line per field, in schema order.
        for spec in self.fields:
            detail = (
                f"  [{spec.field_type.value}] {spec.name} → prefix={spec.prefix}, "
                f"tokens_in_vocab={spec.token_count}, tokens_per_event={spec.tokens_per_event}"
            )
            report.append(detail)
        return "\n".join(report)