Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

README.md +92 -0
config.json +63 -0
label_config.json +30 -0
model.safetensors +3 -0
tokenizer.json +0 -0
tokenizer_config.json +14 -0

README.md ADDED Viewed

	@@ -0,0 +1,92 @@

+---
+license: mit
+language:
+  - en
+tags:
+  - legal
+  - glacier
+  - distillation
+  - sequence-classification
+pipeline_tag: text-classification
+datasets:
+  - glacier-legal/legal-distillation-data
+base_model: nlpaueb/legal-bert-base-uncased
+---
+# GLACIER Document Classifier (`glacier-document-classifier`)
+**Distilled legal AI model** for the [GLACIER pipeline](https://github.com/OrionDevPartners/glacier-legal-mcp) — Gated Legal Analysis, Citation Intelligence, Evidence Routing.
+## Model Description
+This model is distilled from Claude Opus 4.6 (via AWS Bedrock) into a lightweight transformer for fast, local inference. It handles **legal document type classification (complaint, motion, brief, etc.)** as part of the GLACIER 6-stage legal document production pipeline.
+- **Base model:** [nlpaueb/legal-bert-base-uncased](https://huggingface.co/nlpaueb/legal-bert-base-uncased)
+- **Task:** sequence-classification
+- **Labels:** 12 classes
+- **Max length:** 512 tokens
+## Labels
+- `complaint`
+- `answer`
+- `motion`
+- `brief`
+- `order`
+- `opinion`
+- `notice`
+- `subpoena`
+- `affidavit`
+- `demand_letter`
+- `bar_complaint`
+- `other`
+## Usage
+```python
+from glacier_distill.inference import GlacierPipeline
+pipeline = GlacierPipeline()
+result = pipeline.classify_document("your legal text here")
+print(result)
+```
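+Or use directly with transformers:
+```python
+from transformers import pipeline
+classifier = pipeline("text-classification", model="glacier-legal/glacier-document-classifier")
+# Inputs longer than the 512-token limit must be truncated (or chunked) before classification.
+result = classifier("your legal text here", truncation=True)
+```
+If the returned `label` is a generic placeholder such as `LABEL_3`, look up index `3` in the `id2label` map of `label_config.json` (shipped in this repository) to get the class name (here, `brief`).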
+## Training
+- **Teacher:** Claude Opus 4.6 (AWS Bedrock)
+- **Method:** Knowledge distillation (Hinton et al., 2015) with temperature=4.0, alpha=0.7
+- **Data:** CourtListener case law + synthetic labeled examples
+- **Framework:** HuggingFace Transformers + custom DistillationLoss
+## GLACIER Pipeline
+This model (`document-classifier`) runs in Stage 1 (QUERY) of the six-stage GLACIER pipeline:
+```
+Stage 1: QUERY    -> jurisdiction-router + document-classifier
+Stage 2: RESEARCH -> legal-ner (entity extraction)
+Stage 3: WDC #1   -> (full model review)
+Stage 4: DRAFT    -> legal-ner + citation-classifier
+Stage 5: WDC #2   -> hallucination-detector + citation-classifier
+Stage 6: FINAL    -> (human review)
+```
+## Limitations
+- Distilled models are optimized for US legal text (federal + state)
+- Not a substitute for full model review in GLACIER Stages 3/5
+- Citation hallucination detection (performed by the separate `hallucination-detector` model, not by this classifier) is a pre-filter, not a replacement for external verification
+- Jurisdiction coverage: Florida, Mississippi, Federal (primary); other states (limited)
+## License
+MIT — Part of the GLACIER Legal AI Framework by Orion Dev Partners, LLC.

config.json ADDED Viewed

	@@ -0,0 +1,63 @@

+{
+  "add_cross_attention": false,
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "dtype": "float32",
+  "eos_token_id": null,
+  "eos_token_ids": 0,
+  "finetuning_task": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "complaint",
+    "1": "answer",
+    "2": "motion",
+    "3": "brief",
+    "4": "order",
+    "5": "opinion",
+    "6": "notice",
+    "7": "subpoena",
+    "8": "affidavit",
+    "9": "demand_letter",
+    "10": "bar_complaint",
+    "11": "other"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "is_decoder": false,
+  "label2id": {
+    "affidavit": 8,
+    "answer": 1,
+    "bar_complaint": 10,
+    "brief": 3,
+    "complaint": 0,
+    "demand_letter": 9,
+    "motion": 2,
+    "notice": 6,
+    "opinion": 5,
+    "order": 4,
+    "other": 11,
+    "subpoena": 7
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "output_past": true,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "pruned_heads": {},
+  "tie_word_embeddings": true,
+  "torchscript": false,
+  "transformers_version": "5.5.3",
+  "type_vocab_size": 2,
+  "use_bfloat16": false,
+  "use_cache": false,
+  "vocab_size": 30522
+}

label_config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "label2id": {
+    "complaint": 0,
+    "answer": 1,
+    "motion": 2,
+    "brief": 3,
+    "order": 4,
+    "opinion": 5,
+    "notice": 6,
+    "subpoena": 7,
+    "affidavit": 8,
+    "demand_letter": 9,
+    "bar_complaint": 10,
+    "other": 11
+  },
+  "id2label": {
+    "0": "complaint",
+    "1": "answer",
+    "2": "motion",
+    "3": "brief",
+    "4": "order",
+    "5": "opinion",
+    "6": "notice",
+    "7": "subpoena",
+    "8": "affidavit",
+    "9": "demand_letter",
+    "10": "bar_complaint",
+    "11": "other"
+  }
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1fc7e80124768afd7dbd78f29fac3c84cee9c3ffc849c70c5326aa5a5463f60c
+size 437989384

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "backend": "tokenizers",
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "is_local": false,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}