Spaces:
Sleeping
Sleeping
"""
DistilBERT action item classifier.

Fine-tuned on the AMI Meeting Corpus to classify sentences into:
- action_item
- decision
- open_question
- general

For the hackathon demo we ship a rule-augmented zero-shot baseline
using a pre-trained NLI model, which performs well out of the box.
Swap `MODEL_NAME` to your fine-tuned checkpoint after training.
"""
| from transformers import pipeline | |
| from typing import Optional | |
| import os | |
| MODEL_NAME = "cross-encoder/nli-deberta-v3-small" | |
| _classifier = None | |
| _classifier_type = "zero-shot" | |
| LABELS = ["action item", "decision", "open question", "general discussion"] | |
| LABEL_MAP = { | |
| "action item": "action_item", | |
| "decision": "decision", | |
| "open question": "open_question", | |
| "general discussion": "general", | |
| "action_item": "action_item", | |
| "open_question": "open_question" | |
| } | |
def load_classifier():
    """Return the shared classifier pipeline, creating it on first call.

    Prefers a fine-tuned checkpoint at ``../scripts/focusflow-classifier``
    (relative to this file); otherwise falls back to zero-shot NLI
    classification with ``MODEL_NAME``. Also sets the module-level
    ``_classifier_type`` flag so ``classify_sentence`` knows which output
    format to expect.

    Returns:
        A transformers pipeline (text-classification or
        zero-shot-classification), cached across calls.
    """
    global _classifier, _classifier_type
    if _classifier is None:
        model_path = os.path.join(
            os.path.dirname(os.path.dirname(__file__)),
            "scripts",
            "focusflow-classifier",
        )
        if os.path.exists(model_path):
            print(f"Loading fine-tuned classifier from {model_path}")
            _classifier = pipeline(
                "text-classification",
                model=model_path,
                tokenizer=model_path,
                device=-1,  # CPU
            )
            _classifier_type = "fine-tuned"
        else:
            print("Fine-tuned model not found, falling back to zero-shot-classification.")
            # Fix: previously hard-coded "typeform/distilbert-base-uncased-mnli",
            # leaving MODEL_NAME dead and contradicting the module docstring's
            # "swap MODEL_NAME" instruction.
            _classifier = pipeline(
                "zero-shot-classification",
                model=MODEL_NAME,
                device=-1,  # CPU
            )
            _classifier_type = "zero-shot"
    return _classifier
def classify_sentence(sentence: str, classifier) -> dict:
    """Classify one sentence into a canonical label with a confidence score.

    Very short inputs (fewer than 8 characters after stripping whitespace)
    are assumed to be "general" chatter and skip the model entirely.

    Args:
        sentence: The sentence to classify.
        classifier: Pipeline returned by ``load_classifier``.

    Returns:
        dict with keys ``label`` (canonical snake_case category) and
        ``confidence`` (score rounded to 3 decimals).
    """
    if len(sentence.strip()) < 8:
        # Too short to carry a meaningful action, decision, or question.
        return {"label": "general", "confidence": 1.0}

    if _classifier_type == "fine-tuned":
        prediction = classifier(sentence)[0]
        raw_label, score = prediction["label"], prediction["score"]
    else:
        prediction = classifier(sentence, candidate_labels=LABELS, multi_label=False)
        raw_label, score = prediction["labels"][0], prediction["scores"][0]

    return {
        "label": LABEL_MAP.get(raw_label, "general"),
        "confidence": round(score, 3),
    }
def classify_batch(sentences: list[str], classifier) -> list[dict]:
    """Classify each sentence in order; returns one result dict per input."""
    results = []
    for text in sentences:
        results.append(classify_sentence(text, classifier))
    return results
def extract_structured_items(
    sentences: list[str],
    timestamps: list[float],
    speaker_labels: list[str],
    classifier,
    confidence_threshold: float = 0.6,
) -> dict:
    """Classify each sentence and bucket confident hits by category.

    Sentences scoring below ``confidence_threshold`` are dropped, as are
    "general" classifications. Timestamps and speakers are aligned by index;
    missing entries default to 0 and "Unknown" respectively.

    Args:
        sentences: Sentences to classify.
        timestamps: Per-sentence timestamps (may be shorter than sentences).
        speaker_labels: Per-sentence speaker names (may be shorter).
        classifier: Pipeline returned by ``load_classifier``.
        confidence_threshold: Minimum confidence to keep a classification.

    Returns:
        dict with keys "action_items", "decisions", "open_questions",
        each a list of {text, confidence, timestamp, speaker} entries.
    """
    buckets = {"action_item": [], "decision": [], "open_question": []}

    for index, text in enumerate(sentences):
        verdict = classify_sentence(text, classifier)
        if verdict["confidence"] < confidence_threshold:
            continue
        label = verdict["label"]
        if label not in buckets:
            # "general" (and anything unmapped) is not surfaced.
            continue
        buckets[label].append(
            {
                "text": text,
                "confidence": verdict["confidence"],
                "timestamp": timestamps[index] if index < len(timestamps) else 0,
                "speaker": speaker_labels[index] if index < len(speaker_labels) else "Unknown",
            }
        )

    return {
        "action_items": buckets["action_item"],
        "decisions": buckets["decision"],
        "open_questions": buckets["open_question"],
    }