rtferraz's picture
Phase 2A: Core tokenizer library — schema, field tokenizers, composite builder, predefined schemas, 72 passing tests
0c1ca58 verified
raw
history blame
895 Bytes
"""
domainTokenizer — Building small models that understand domain tokens, not just words.

Core components:
- schema: DomainSchema, FieldSpec, FieldType
- tokenizers: DomainTokenizerBuilder, per-field tokenizers
- schemas: Predefined schemas (FINANCE, ECOMMERCE, HEALTHCARE)
"""
from .schema import DomainSchema, FieldSpec, FieldType
from .tokenizers.domain_tokenizer import DomainTokenizerBuilder
from .tokenizers.field_tokenizers import (
BaseFieldTokenizer,
CalendarTokenizer,
CategoricalTokenizer,
DiscreteNumericalTokenizer,
MagnitudeBucketTokenizer,
SignTokenizer,
)
# Package version string.
__version__ = "0.1.0"

# Public API of the package: the names re-exported by the imports at the
# top of this module and exposed through ``from <package> import *``.
__all__ = [
    # From .schema
    "DomainSchema",
    "FieldSpec",
    "FieldType",
    # From .tokenizers.domain_tokenizer
    "DomainTokenizerBuilder",
    # From .tokenizers.field_tokenizers
    "BaseFieldTokenizer",
    "SignTokenizer",
    "MagnitudeBucketTokenizer",
    "DiscreteNumericalTokenizer",
    "CalendarTokenizer",
    "CategoricalTokenizer",
]