Add predefined schemas (FINANCE, ECOMMERCE, HEALTHCARE)
Browse files
src/domain_tokenizer/schemas/predefined.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Predefined domain schemas for common use cases.
|
| 3 |
+
|
| 4 |
+
Each schema follows the validated patterns from the research:
|
| 5 |
+
- FINANCE_SCHEMA: Based on Nubank nuFormer (arXiv:2507.23267) — 97 special tokens
|
| 6 |
+
- ECOMMERCE_SCHEMA: Adapted from ActionPiece (arXiv:2502.13581) + nuFormer patterns
|
| 7 |
+
- HEALTHCARE_SCHEMA: Clinical event sequences
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from ..schema import DomainSchema, FieldSpec, FieldType
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# =============================================================================
|
| 14 |
+
# FINANCE SCHEMA — Based on Nubank nuFormer
|
| 15 |
+
# sign(2) + amount_bucket(21) + month(12) + dow(7) + dom(31) + hour(24) = 97
|
| 16 |
+
# =============================================================================
|
| 17 |
+
|
| 18 |
+
# Predefined schema for financial transactions. Fields are listed in the
# order they are handed to DomainSchema: sign, bucketed amount, calendar
# features, then the free-text description.
# NOTE(review): how each FieldSpec is expanded into tokens is defined in
# ..schema and is not visible from this module.
FINANCE_SCHEMA = DomainSchema(
    name="finance",
    description=(
        "Financial transaction schema following Nubank nuFormer (arXiv:2507.23267). "
        "Each transaction = sign + amount bucket + calendar features + text description. "
        "~14 tokens per transaction, 2048 context = ~146 transactions."
    ),
    fields=[
        # Sign of the amount, kept separate from its magnitude.
        FieldSpec(
            name="amount_sign",
            field_type=FieldType.SIGN,
            prefix="AMT_SIGN",
        ),
        # Amount declared as a continuous numeric field with 21 bins.
        FieldSpec(
            name="amount",
            field_type=FieldType.NUMERICAL_CONTINUOUS,
            prefix="AMT",
            n_bins=21,
        ),
        # Timestamp broken down into month, day-of-week, day-of-month, hour.
        FieldSpec(
            name="timestamp",
            field_type=FieldType.TEMPORAL,
            calendar_fields=["month", "dow", "dom", "hour"],
        ),
        # Free-text transaction description.
        FieldSpec(
            name="description",
            field_type=FieldType.TEXT,
            prefix="DESC",
        ),
    ],
)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# =============================================================================
|
| 36 |
+
# E-COMMERCE SCHEMA — Adapted from ActionPiece + nuFormer patterns
|
| 37 |
+
# =============================================================================
|
| 38 |
+
|
| 39 |
+
# Predefined schema for e-commerce events. Fields are listed in the order
# they are handed to DomainSchema: event type, price, quantity, product
# category, calendar features, then the free-text product title.
# NOTE(review): how each FieldSpec is expanded into tokens is defined in
# ..schema and is not visible from this module.
ECOMMERCE_SCHEMA = DomainSchema(
    name="ecommerce",
    description=(
        "E-commerce event schema adapted from ActionPiece (arXiv:2502.13581) "
        "and nuFormer patterns. Events: view/cart/purchase/return/wishlist. "
        "~16 tokens per event, 2048 context = ~128 events."
    ),
    fields=[
        # Closed set of five event kinds.
        FieldSpec(
            name="event_type",
            field_type=FieldType.CATEGORICAL_FIXED,
            prefix="EVT",
            categories=[
                "view",
                "add_to_cart",
                "purchase",
                "return",
                "wishlist",
            ],
        ),
        # Price declared as a continuous numeric field with 21 bins.
        FieldSpec(
            name="price",
            field_type=FieldType.NUMERICAL_CONTINUOUS,
            prefix="PRICE",
            n_bins=21,
        ),
        # Quantity declared as a discrete numeric field with max_value=10.
        # NOTE(review): handling of quantities above 10 is decided in ..schema.
        FieldSpec(
            name="quantity",
            field_type=FieldType.NUMERICAL_DISCRETE,
            prefix="QTY",
            max_value=10,
        ),
        # Closed set of twenty product categories; "other" is the last entry.
        FieldSpec(
            name="category",
            field_type=FieldType.CATEGORICAL_FIXED,
            prefix="CAT",
            categories=[
                "electronics",
                "clothing",
                "home_garden",
                "books",
                "sports",
                "toys",
                "food_grocery",
                "health_beauty",
                "automotive",
                "office",
                "pet_supplies",
                "jewelry",
                "music",
                "movies",
                "games",
                "baby",
                "tools",
                "arts_crafts",
                "industrial",
                "other",
            ],
        ),
        # Timestamp broken down into month, day-of-week, day-of-month, hour.
        FieldSpec(
            name="timestamp",
            field_type=FieldType.TEMPORAL,
            calendar_fields=["month", "dow", "dom", "hour"],
        ),
        # Free-text product title.
        FieldSpec(
            name="product_title",
            field_type=FieldType.TEXT,
            prefix="TITLE",
        ),
    ],
)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# =============================================================================
|
| 66 |
+
# HEALTHCARE SCHEMA — Clinical event sequences
|
| 67 |
+
# =============================================================================
|
| 68 |
+
|
| 69 |
+
# Predefined schema for clinical events. Fields are listed in the order
# they are handed to DomainSchema: event type, cost, severity, provider
# type, calendar features, then the free-text description.
# NOTE(review): how each FieldSpec is expanded into tokens is defined in
# ..schema and is not visible from this module.
HEALTHCARE_SCHEMA = DomainSchema(
    name="healthcare",
    description=(
        "Clinical event schema for healthcare sequences. "
        "Events: diagnosis/procedure/lab/medication/visit."
    ),
    fields=[
        # Closed set of ten clinical event kinds.
        FieldSpec(
            name="event_type",
            field_type=FieldType.CATEGORICAL_FIXED,
            prefix="CLIN",
            categories=[
                "diagnosis",
                "procedure",
                "lab_result",
                "medication",
                "visit_inpatient",
                "visit_outpatient",
                "visit_er",
                "imaging",
                "referral",
                "discharge",
            ],
        ),
        # Cost declared as a continuous numeric field with 21 bins.
        FieldSpec(
            name="cost",
            field_type=FieldType.NUMERICAL_CONTINUOUS,
            prefix="COST",
            n_bins=21,
        ),
        # Four severity levels, listed from "low" to "critical".
        FieldSpec(
            name="severity",
            field_type=FieldType.CATEGORICAL_FIXED,
            prefix="SEV",
            categories=["low", "moderate", "high", "critical"],
        ),
        # Closed set of eight provider kinds; "other" is the last entry.
        FieldSpec(
            name="provider_type",
            field_type=FieldType.CATEGORICAL_FIXED,
            prefix="PROV",
            categories=[
                "pcp",
                "specialist",
                "surgeon",
                "er_physician",
                "nurse_practitioner",
                "therapist",
                "pharmacist",
                "other",
            ],
        ),
        # Timestamp broken down into month, day-of-week, day-of-month only.
        # NOTE(review): "hour" is absent here, unlike the other predefined
        # schemas in this module -- presumably deliberate; confirm.
        FieldSpec(
            name="timestamp",
            field_type=FieldType.TEMPORAL,
            calendar_fields=["month", "dow", "dom"],
        ),
        # Free-text event description.
        FieldSpec(
            name="description",
            field_type=FieldType.TEXT,
            prefix="DESC",
        ),
    ],
)
|