Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

README.md +106 -99
config.json +66 -18
model.safetensors +2 -2
tokenizer.json +0 -0
tokenizer_config.json +7 -5

README.md CHANGED Viewed

@@ -2,7 +2,7 @@
 license: mit
 tags:
 - text-classification
-- bert
 - orality
 - linguistics
 - rhetorical-analysis
@@ -12,7 +12,7 @@ metrics:
 - f1
 - accuracy
 base_model:
-- google-bert/bert-base-uncased
 pipeline_tag: text-classification
 library_name: transformers
 datasets:
@@ -25,16 +25,16 @@ model-index:
       name: Marker Subtype Classification
     metrics:
     - type: f1
-      value: 0.500
       name: F1 (macro)
     - type: accuracy
-      value: 0.498
       name: Accuracy
 ---
 # Havelock Marker Subtype Classifier
-BERT-based classifier for **71 fine-grained rhetorical marker subtypes** on the oral–literate spectrum, grounded in Walter Ong's *Orality and Literacy* (1982).
 This is the finest level of the Havelock span classification hierarchy. Given a text span identified as a rhetorical marker, the model classifies it into one of 71 specific rhetorical devices (e.g., `anaphora`, `epistemic_hedge`, `vocative`, `nested_clauses`).
@@ -42,14 +42,14 @@ This is the finest level of the Havelock span classification hierarchy. Given a
 | Property | Value |
 |----------|-------|
-| Base model | `bert-base-uncased` |
-| Architecture | `BertForSequenceClassification` |
 | Task | Multi-class classification (71 classes) |
 | Max sequence length | 128 tokens |
-| Test F1 (macro) | **0.500** |
-| Test Accuracy | **0.498** |
-| Missing labels (test) | 1/71 (`rhyme`) |
-| Parameters | ~109M |
 ## Usage
 ```python
@@ -100,7 +100,7 @@ print(f"Marker subtype: {model.config.id2label[pred]}")
 ### Data
-Span-level annotations from the Havelock corpus with marker types normalized against a canonical taxonomy at build time. Each span carries a `marker_subtype` field. Only subtypes with ≥50 examples are included. A stratified 80/10/10 train/val/test split was used with swap-based optimization to balance label distributions across splits. The test set contains 2,357 spans.
 ### Hyperparameters
@@ -115,7 +115,11 @@ Span-level annotations from the Havelock corpus with marker types normalized aga
 | Loss | Focal loss (γ=2.0) + class weights |
 | Mixout | 0.1 |
 | Mixed precision | FP16 |
-| Min examples per class | 50 |
 ### Test Set Classification Report
@@ -123,105 +127,106 @@ Span-level annotations from the Havelock corpus with marker types normalized aga
 ```
                         precision    recall  f1-score   support
-         abstract_noun      0.376     0.364     0.370        88
-       additive_formal      0.455     0.417     0.435        12
-         agent_demoted      0.533     0.800     0.640        10
-     agentless_passive      0.542     0.456     0.495        57
-          alliteration      0.714     0.500     0.588        10
-              anaphora      0.490     0.585     0.533        41
             antithesis      0.947     0.818     0.878        22
-                 aside      0.225     0.243     0.234        37
-             assonance      0.926     1.000     0.962        25
-             asyndeton      0.583     0.500     0.538        14
-     audience_response      0.778     0.700     0.737        10
- categorical_statement      0.209     0.450     0.286        20
-          causal_chain      0.425     0.405     0.415        42
-       causal_explicit      0.537     0.468     0.500        47
-              citation      0.794     0.587     0.675        46
-   conceptual_metaphor      0.176     0.077     0.107        39
-            concessive      0.617     0.644     0.630        45
-  concessive_connector      0.833     0.833     0.833        18
-           conditional      0.582     0.655     0.616        87
-        conflict_frame      0.588     0.667     0.625        15
-           contrastive      0.442     0.557     0.493        61
        cross_reference      0.733     0.458     0.564        24
-     definitional_move      0.333     0.200     0.250        10
-     discourse_formula      0.485     0.424     0.452       118
-        dramatic_pause      0.875     0.700     0.778        10
-       embodied_action      0.271     0.310     0.289        42
-           enumeration      0.556     0.581     0.568        43
-       epistemic_hedge      0.206     0.500     0.292        14
-            epistrophe      0.778     0.875     0.824        16
-               epithet      0.385     0.417     0.400        12
-      everyday_example      0.278     0.179     0.217        28
-            evidential      0.606     0.541     0.571        37
-    footnote_reference      0.444     0.400     0.421        10
-            imperative      0.628     0.590     0.608       100
-          inclusive_we      0.561     0.627     0.592        59
- institutional_subject      0.947     0.857     0.900        21
-  intensifier_doubling      0.905     0.864     0.884        22
-    lexical_repetition      0.447     0.467     0.457        45
-        list_structure      0.190     0.174     0.182        23
-         metadiscourse      0.073     0.182     0.104        22
-methodological_framing      0.500     0.238     0.323        21
-      named_individual      0.455     0.333     0.385        30
-        nested_clauses      0.294     0.326     0.309        46
-        nominalization      0.353     0.429     0.387        56
-   objectifying_stance      0.167     0.300     0.214        10
-           parallelism      0.188     0.222     0.203        27
-          phatic_check      0.444     0.364     0.400        11
-         phatic_filler      0.300     0.600     0.400        10
-          polysyndeton      1.000     0.833     0.909        24
-           probability      0.500     0.682     0.577        22
-               proverb      0.059     0.100     0.074        10
-   qualified_assertion      0.280     0.241     0.259        29
-               refrain      0.850     0.708     0.773        24
-        relative_chain      0.431     0.455     0.442        55
-     religious_formula      1.000     0.688     0.815        16
-   rhetorical_question      0.646     0.738     0.689        84
-                 rhyme      0.000     0.000     0.000        10
-                rhythm      1.000     0.625     0.769        16
-         second_person      0.573     0.474     0.519       116
-       self_correction      0.952     0.500     0.656        40
-        sensory_detail      0.538     0.350     0.424        20
-    simple_conjunction      0.133     0.200     0.160        10
-        specific_place      0.625     0.278     0.385        18
-technical_abbreviation      0.818     0.321     0.462        28
-        technical_term      0.438     0.432     0.435        74
-       temporal_anchor      0.472     0.500     0.486        34
-    temporal_embedding      0.475     0.604     0.532        48
-third_person_reference      0.692     0.900     0.783        10
-              tricolon      0.667     0.667     0.667        18
-               us_them      0.750     0.500     0.600        18
-              vocative      0.414     0.600     0.490        20
-              accuracy                          0.498      2357
-             macro avg      0.528     0.497     0.500      2357
-          weighted avg      0.525     0.498     0.502      2357
 ```
 </details>
-**Top performing subtypes (F1 ≥ 0.75):** `assonance` (0.962), `polysyndeton` (0.909), `institutional_subject` (0.900), `intensifier_doubling` (0.884), `antithesis` (0.878), `concessive_connector` (0.833), `epistrophe` (0.824), `religious_formula` (0.815), `third_person_reference` (0.783), `dramatic_pause` (0.778), `refrain` (0.773), `rhythm` (0.769).
-**Weakest subtypes (F1 < 0.20):** `rhyme` (0.000), `proverb` (0.074), `metadiscourse` (0.104), `simple_conjunction` (0.160), `list_structure` (0.182). These tend to be semantically diffuse classes that overlap heavily with neighbouring subtypes or have very low test support.
 ## Class Distribution
-The test set exhibits significant imbalance across 71 classes:
-| Support Range | Classes | % of Total |
-|---------------|---------|------------|
-| >100 | 3 (`discourse_formula`, `second_person`, `imperative`) | 4% |
-| 50–100 | 11 | 15% |
-| 25–50 | 26 | 37% |
-| 10–25 | 31 | 44% |
 ## Limitations
-- **71-way classification on ~22k spans**: The data budget per class is thin, particularly for classes near the 50-example minimum. More data or class consolidation would help.
 - **Semantic overlap**: Some subtypes are difficult to distinguish from surface text alone (e.g., `parallelism` vs `anaphora` vs `tricolon`; `epistemic_hedge` vs `qualified_assertion` vs `probability`). The model may benefit from hierarchical classification that conditions on type-level predictions.
-- **Recall-precision tradeoff on rare classes**: Many rare classes show high precision but lower recall (e.g., `self_correction`: P=0.952, R=0.500; `religious_formula`: P=1.000, R=0.688), suggesting the model learns narrow prototypes but misses variation.
 - **Span-level only**: Requires pre-extracted spans. Does not detect boundaries.
 - **128-token context window**: Longer spans are truncated.
@@ -235,7 +240,7 @@ The 71 subtypes represent the full granularity of the Havelock taxonomy, operati
 |-------|------|---------|-----|
 | [`HavelockAI/bert-marker-category`](https://huggingface.co/HavelockAI/bert-marker-category) | Binary (oral/literate) | 2 | 0.875 |
 | [`HavelockAI/bert-marker-type`](https://huggingface.co/HavelockAI/bert-marker-type) | Functional type | 18 | 0.583 |
-| **This model** | Fine-grained subtype | 71 | 0.500 |
 | [`HavelockAI/bert-orality-regressor`](https://huggingface.co/HavelockAI/bert-orality-regressor) | Document-level score | Regression | MAE 0.079 |
 | [`HavelockAI/bert-token-classifier`](https://huggingface.co/HavelockAI/bert-token-classifier) | Span detection (BIO) | 145 | 0.500 |
@@ -252,7 +257,9 @@ The 71 subtypes represent the full granularity of the Havelock taxonomy, operati
 ## References
 - Ong, Walter J. *Orality and Literacy: The Technologizing of the Word*. Routledge, 1982.
 ---
-*Model version: b31f147d · Trained: February 2026*

 license: mit
 tags:
 - text-classification
+- modernbert
 - orality
 - linguistics
 - rhetorical-analysis
 - f1
 - accuracy
 base_model:
+- answerdotai/ModernBERT-base
 pipeline_tag: text-classification
 library_name: transformers
 datasets:
       name: Marker Subtype Classification
     metrics:
     - type: f1
+      value: 0.493
       name: F1 (macro)
     - type: accuracy
+      value: 0.500
       name: Accuracy
 ---
 # Havelock Marker Subtype Classifier
+ModernBERT-based classifier for **71 fine-grained rhetorical marker subtypes** on the oral–literate spectrum, grounded in Walter Ong's *Orality and Literacy* (1982).
 This is the finest level of the Havelock span classification hierarchy. Given a text span identified as a rhetorical marker, the model classifies it into one of 71 specific rhetorical devices (e.g., `anaphora`, `epistemic_hedge`, `vocative`, `nested_clauses`).
 | Property | Value |
 |----------|-------|
+| Base model | `answerdotai/ModernBERT-base` |
+| Architecture | `ModernBertForSequenceClassification` |
 | Task | Multi-class classification (71 classes) |
 | Max sequence length | 128 tokens |
+| Test F1 (macro) | **0.493** |
+| Test Accuracy | **0.500** |
+| Missing labels (test) | 1/71 (`proverb`) |
+| Parameters | ~149M |
 ## Usage
 ```python
 ### Data
+22,367 span-level annotations from the Havelock corpus with marker types normalized against a canonical taxonomy at build time. Each span carries a `marker_subtype` field. Only subtypes with ≥10 examples are included. A stratified 80/10/10 train/val/test split was used with swap-based optimization to balance label distributions across splits. The test set contains 2,357 spans.
 ### Hyperparameters
 | Loss | Focal loss (γ=2.0) + class weights |
 | Mixout | 0.1 |
 | Mixed precision | FP16 |
+| Min examples per class | 10 |
+### Training Metrics
+Best checkpoint selected at epoch 15, ranking checkpoints first by fewest missing labels and then by F1 as the tiebreaker (0 missing labels, F1 0.486).
 ### Test Set Classification Report
 ```
                         precision    recall  f1-score   support
+         abstract_noun      0.408     0.330     0.365        88
+       additive_formal      0.286     0.167     0.211        12
+         agent_demoted      0.667     1.000     0.800        10
+     agentless_passive      0.583     0.491     0.533        57
+          alliteration      0.500     0.200     0.286        10
+              anaphora      0.500     0.537     0.518        41
             antithesis      0.947     0.818     0.878        22
+                 aside      0.615     0.216     0.320        37
+             assonance      1.000     0.960     0.980        25
+             asyndeton      0.636     0.500     0.560        14
+     audience_response      1.000     0.800     0.889        10
+ categorical_statement      0.103     0.200     0.136        20
+          causal_chain      0.442     0.452     0.447        42
+       causal_explicit      0.400     0.468     0.431        47
+              citation      0.743     0.565     0.642        46
+   conceptual_metaphor      0.065     0.051     0.057        39
+            concessive      0.595     0.556     0.575        45
+  concessive_connector      0.882     0.833     0.857        18
+           conditional      0.596     0.609     0.602        87
+        conflict_frame      0.733     0.733     0.733        15
+           contrastive      0.533     0.525     0.529        61
        cross_reference      0.733     0.458     0.564        24
+     definitional_move      0.286     0.200     0.235        10
+     discourse_formula      0.405     0.508     0.451       118
+        dramatic_pause      0.833     0.500     0.625        10
+       embodied_action      0.375     0.214     0.273        42
+           enumeration      0.510     0.605     0.553        43
+       epistemic_hedge      0.102     0.357     0.159        14
+            epistrophe      0.824     0.875     0.848        16
+               epithet      0.333     0.250     0.286        12
+      everyday_example      0.312     0.179     0.227        28
+            evidential      0.667     0.432     0.525        37
+    footnote_reference      0.417     0.500     0.455        10
+            imperative      0.645     0.600     0.622       100
+          inclusive_we      0.630     0.576     0.602        59
+ institutional_subject      0.938     0.714     0.811        21
+  intensifier_doubling      0.944     0.773     0.850        22
+    lexical_repetition      0.417     0.556     0.476        45
+        list_structure      0.267     0.174     0.211        23
+         metadiscourse      0.085     0.182     0.116        22
+methodological_framing      0.500     0.190     0.276        21
+      named_individual      0.500     0.300     0.375        30
+        nested_clauses      0.500     0.348     0.410        46
+        nominalization      0.288     0.304     0.296        56
+   objectifying_stance      0.267     0.400     0.320        10
+           parallelism      0.350     0.259     0.298        27
+          phatic_check      0.500     0.364     0.421        11
+         phatic_filler      0.333     0.800     0.471        10
+          polysyndeton      1.000     0.792     0.884        24
+           probability      0.500     0.455     0.476        22
+               proverb      0.000     0.000     0.000        10
+   qualified_assertion      0.250     0.241     0.246        29
+               refrain      0.944     0.708     0.810        24
+        relative_chain      0.350     0.509     0.415        55
+     religious_formula      0.857     0.750     0.800        16
+   rhetorical_question      0.688     0.762     0.723        84
+                 rhyme      0.231     0.300     0.261        10
+                rhythm      0.909     0.625     0.741        16
+         second_person      0.571     0.586     0.579       116
+       self_correction      0.821     0.575     0.676        40
+        sensory_detail      0.364     0.200     0.258        20
+    simple_conjunction      0.167     0.300     0.214        10
+        specific_place      0.400     0.222     0.286        18
+technical_abbreviation      0.900     0.321     0.474        28
+        technical_term      0.426     0.703     0.531        74
+       temporal_anchor      0.396     0.618     0.483        34
+    temporal_embedding      0.500     0.562     0.529        48
+third_person_reference      0.700     0.700     0.700        10
+              tricolon      0.611     0.611     0.611        18
+               us_them      0.733     0.611     0.667        18
+              vocative      0.462     0.600     0.522        20
+              accuracy                          0.500      2357
+             macro avg      0.535     0.484     0.493      2357
+          weighted avg      0.532     0.500     0.503      2357
 ```
 </details>
+**Top performing subtypes (F1 ≥ 0.75):** `assonance` (0.980), `audience_response` (0.889), `polysyndeton` (0.884), `antithesis` (0.878), `concessive_connector` (0.857), `intensifier_doubling` (0.850), `epistrophe` (0.848), `institutional_subject` (0.811), `refrain` (0.810), `agent_demoted` (0.800), `religious_formula` (0.800). Just below the threshold: `rhythm` (0.741), `conflict_frame` (0.733), `rhetorical_question` (0.723).
+**Weakest subtypes (F1 < 0.20):** `proverb` (0.000), `conceptual_metaphor` (0.057), `metadiscourse` (0.116), `categorical_statement` (0.136), `epistemic_hedge` (0.159). These tend to be semantically diffuse classes that overlap heavily with neighbouring subtypes or have very low test support.
 ## Class Distribution
+The training set exhibits significant imbalance across 71 classes:
+| Support Range | Example Classes | Count |
+|---------------|-----------------|-------|
+| >1000 | `discourse_formula`, `second_person` | 2 |
+| 500–1000 | `conditional`, `rhetorical_question`, `technical_term`, `imperative` | 8 |
+| 200–500 | `abstract_noun`, `contrastive`, `inclusive_we`, `nominalization` | 27 |
+| 100–200 | `alliteration`, `antithesis`, `asyndeton`, `epistrophe`, `refrain` | 30 |
+| <100 | `footnote_reference`, `phatic_check`, `technical_abbreviation` | 4 |
 ## Limitations
+- **71-way classification on ~22k spans**: The data budget per class is thin, particularly for classes near the 10-example minimum. More data or class consolidation would help.
 - **Semantic overlap**: Some subtypes are difficult to distinguish from surface text alone (e.g., `parallelism` vs `anaphora` vs `tricolon`; `epistemic_hedge` vs `qualified_assertion` vs `probability`). The model may benefit from hierarchical classification that conditions on type-level predictions.
+- **Recall-precision tradeoff on rare classes**: Many rare classes show high precision but lower recall (e.g., `self_correction`: P=0.821, R=0.575; `technical_abbreviation`: P=0.900, R=0.321), suggesting the model learns narrow prototypes but misses variation.
 - **Span-level only**: Requires pre-extracted spans. Does not detect boundaries.
 - **128-token context window**: Longer spans are truncated.
 |-------|------|---------|-----|
 | [`HavelockAI/bert-marker-category`](https://huggingface.co/HavelockAI/bert-marker-category) | Binary (oral/literate) | 2 | 0.875 |
 | [`HavelockAI/bert-marker-type`](https://huggingface.co/HavelockAI/bert-marker-type) | Functional type | 18 | 0.583 |
+| **This model** | Fine-grained subtype | 71 | 0.493 |
 | [`HavelockAI/bert-orality-regressor`](https://huggingface.co/HavelockAI/bert-orality-regressor) | Document-level score | Regression | MAE 0.079 |
 | [`HavelockAI/bert-token-classifier`](https://huggingface.co/HavelockAI/bert-token-classifier) | Span detection (BIO) | 145 | 0.500 |
 ## References
 - Ong, Walter J. *Orality and Literacy: The Technologizing of the Word*. Routledge, 1982.
+- Lee, C. et al. "Mixout: Effective Regularization to Finetune Large-scale Pretrained Language Models." ICLR 2020.
+- Warner, B. et al. "Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference." 2024.
 ---
+*Trained: February 2026*

config.json CHANGED Viewed

@@ -1,16 +1,23 @@
 {
-  "add_cross_attention": false,
   "architectures": [
-    "BertForSequenceClassification"
   ],
-  "attention_probs_dropout_prob": 0.1,
-  "bos_token_id": null,
-  "classifier_dropout": null,
   "dtype": "float32",
-  "eos_token_id": null,
   "gradient_checkpointing": false,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
   "id2label": {
     "0": "LABEL_0",
@@ -85,9 +92,9 @@
     "69": "LABEL_69",
     "70": "LABEL_70"
   },
   "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "is_decoder": false,
   "label2id": {
     "LABEL_0": 0,
     "LABEL_1": 1,
@@ -161,16 +168,57 @@
     "LABEL_8": 8,
     "LABEL_9": 9
   },
-  "layer_norm_eps": 1e-12,
-  "max_position_embeddings": 512,
-  "model_type": "bert",
   "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "pad_token_id": 0,
   "position_embedding_type": "absolute",
   "tie_word_embeddings": true,
   "transformers_version": "5.0.0",
-  "type_vocab_size": 2,
-  "use_cache": true,
-  "vocab_size": 30522
 }

 {
   "architectures": [
+    "ModernBertForSequenceClassification"
   ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
   "dtype": "float32",
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
   "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
   "hidden_size": 768,
   "id2label": {
     "0": "LABEL_0",
     "69": "LABEL_69",
     "70": "LABEL_70"
   },
+  "initializer_cutoff_factor": 2.0,
   "initializer_range": 0.02,
+  "intermediate_size": 1152,
   "label2id": {
     "LABEL_0": 0,
     "LABEL_1": 1,
     "LABEL_8": 8,
     "LABEL_9": 9
   },
+  "layer_norm_eps": 1e-05,
+  "layer_types": [
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention"
+  ],
+  "local_attention": 128,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
   "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
   "position_embedding_type": "absolute",
+  "repad_logits_with_grad": false,
+  "rope_parameters": {
+    "full_attention": {
+      "rope_theta": 160000.0,
+      "rope_type": "default"
+    },
+    "sliding_attention": {
+      "rope_theta": 10000.0,
+      "rope_type": "default"
+    }
+  },
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
   "tie_word_embeddings": true,
   "transformers_version": "5.0.0",
+  "vocab_size": 50368
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1ff78a23e1f73a3c2b1b41f7b253d652d236d03395d41483f87deba0000c9124
-size 780277732

 version https://git-lfs.github.com/spec/v1
+oid sha256:3bd7dd251812991ee67b82e0da1ecbef1e9597bab1478f3f23ee327741272840
+size 1039849764

tokenizer.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json CHANGED Viewed

@@ -1,14 +1,16 @@
 {
   "backend": "tokenizers",
   "cls_token": "[CLS]",
-  "do_lower_case": true,
   "is_local": false,
   "mask_token": "[MASK]",
-  "model_max_length": 512,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
-  "strip_accents": null,
-  "tokenize_chinese_chars": true,
-  "tokenizer_class": "BertTokenizer",
   "unk_token": "[UNK]"
 }

 {
   "backend": "tokenizers",
+  "clean_up_tokenization_spaces": true,
   "cls_token": "[CLS]",
   "is_local": false,
   "mask_token": "[MASK]",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 8192,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
+  "tokenizer_class": "TokenizersBackend",
   "unk_token": "[UNK]"
 }