Spaces:

Reality8081
/

DemoApp

Sleeping

App Files Files Community

Reality8081 committed 23 days ago

Commit

fdcfd15

1 Parent(s): 846a1b0

Update SRC baseline and baseline_extractive

Browse files

Files changed (5) hide show

app.py +155 -117
src/model/baseline_extractive_model.py +45 -0
src/model/baseline_model.py +87 -0
src/preprocessing/edu_sentences.py +178 -0
src/utils/get_model.py +35 -0

app.py CHANGED Viewed

@@ -1,94 +1,132 @@
 import gradio as gr
-# ====================== TIỀN XỬ LÝ ======================
-def tien_xu_ly_theo_cau(van_ban: str) -> str:
-    """
-    Hàm tiền xử lý theo câu (Skeleton function).
-    Bạn có thể thay thế bằng code thật (ví dụ: tách câu bằng NLTK, spaCy, hoặc regex).
-    Hiện tại là mock: giữ nguyên văn bản gốc để demo.
-    """
-    # TODO: Thay code thật ở đây
-    # Ví dụ:
-    # import nltk
-    # nltk.download('punkt', quiet=True)
-    # cau = nltk.sent_tokenize(van_ban)
-    # return " ".join(cau[:10])  # giữ 10 câu đầu làm ví dụ
-    return van_ban
-def tien_xu_ly_theo_ngu_nghia(van_ban: str) -> str:
-    """
-    Hàm tiền xử lý theo ngữ nghĩa (Skeleton function).
-    Bạn có thể thay thế bằng code thật (ví dụ: cleaning, embedding, loại bỏ stopword, v.v.).
-    Hiện tại là mock: giữ nguyên văn bản gốc để demo.
-    """
-    # TODO: Thay code thật ở đây
-    # Ví dụ: dùng transformers hoặc spaCy để embedding hoặc semantic cleaning
-    return van_ban
-# ====================== MÔ HÌNH TÓM TẮT ======================
-def mo_hinh_baseline(van_ban_da_xu_ly: str) -> str:
-    """
-    Mô hình tóm tắt Baseline - PLACEHOLDER.
-    # TODO: Thay thế bằng mô hình thật (ví dụ: transformers pipeline)
-    # Ví dụ code thật:
-    # from transformers import pipeline
-    # summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-    # return summarizer(van_ban_da_xu_ly, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
-    """
-    # Mock để demo
-    return f"🔹 TÓM TẮT BASELINE:\n{van_ban_da_xu_ly[:250]}..."
-def mo_hinh_baseline_extractive(van_ban_da_xu_ly: str) -> str:
-    """
-    Mô hình Baseline + Extractive - PLACEHOLDER.
-    # TODO: Thay thế bằng code thật (extractive trước rồi abstractive)
-    """
-    # Mock để demo
-    return f"🔹 TÓM TẮT BASELINE + EXTRACTIVE:\n{van_ban_da_xu_ly[:220]}..."
-def mo_hinh_extractive_abstract(van_ban_da_xu_ly: str) -> str:
-    """
-    Mô hình Extractive + Abstract - PLACEHOLDER.
-    # TODO: Thay thế bằng code thật (thường dùng 2 bước: extractive → abstractive)
-    """
-    # Mock để demo
-    return f"🔹 TÓM TẮT EXTRACTIVE + ABSTRACT:\n{van_ban_da_xu_ly[:200]}..."
-# ====================== HÀM CHÍNH ======================
-def tom_tat_van_ban(
-    van_ban: str,
-    phuong_phap_tien_xu_ly: str,
-    mo_hinh: str
 ) -> str:
-    """
-    Hàm chính kết nối toàn bộ luồng:
-    Văn bản gốc → Tiền xử lý → Mô hình → Kết quả tóm tắt.
-    """
-    # Bước 1: Tiền xử lý
-    if phuong_phap_tien_xu_ly == "Tiền xử lý theo câu":
-        van_ban_da_xu_ly = tien_xu_ly_theo_cau(van_ban)
     else:
-        van_ban_da_xu_ly = tien_xu_ly_theo_ngu_nghia(van_ban)
-    # Bước 2: Chọn mô hình tóm tắt
-    if mo_hinh == "Mô hình baseline":
-        ket_qua = mo_hinh_baseline(van_ban_da_xu_ly)
-    elif mo_hinh == "Mô hình baseline có Extractive":
-        ket_qua = mo_hinh_baseline_extractive(van_ban_da_xu_ly)
-    else:  # Mô hình Extractive và Abstract
-        ket_qua = mo_hinh_extractive_abstract(van_ban_da_xu_ly)
-    return ket_qua
-# ====================== GIAO DIỆN GRADIO ======================
 with gr.Blocks(
-    title="Hệ thống Tóm tắt Văn bản Tự động",
     theme=gr.themes.Soft(),
     css="""
     .gradio-container {max-width: 1200px; margin: auto;}
@@ -97,88 +135,88 @@ with gr.Blocks(
 ) as demo:
     gr.Markdown(
         """
-        # 🚀 Hệ thống Tóm tắt Văn bản Tự động
-        **Nhập văn bản cần tóm tắt → Chọn phương pháp & mô hình → Nhận kết quả ngay lập tức**
         """
     )
     with gr.Row():
         with gr.Column(scale=3):
             input_text = gr.Textbox(
-                label="📝 Văn bản cần tóm tắt",
-                placeholder="Dán đoạn văn bản dài vào đây (có thể vài nghìn từ)...",
                 lines=12,
                 max_lines=30,
                 show_copy_button=True
             )
         with gr.Column(scale=1):
-            gr.Markdown("### ⚙️ Cài đặt")
-            phuong_phap = gr.Radio(
                 choices=[
-                    "Tiền xử lý theo câu",
-                    "Tiền xử lý theo ngữ nghĩa"
                 ],
-                value="Tiền xử lý theo câu",
-                label="Phương pháp Tiền xử lý",
-                info="Chọn cách làm sạch văn bản trước khi tóm tắt"
             )
-            mo_hinh = gr.Radio(
                 choices=[
-                    "Mô hình baseline",
-                    "Mô hình baseline có Extractive",
-                    "Mô hình Extractive và Abstract"
                 ],
-                value="Mô hình baseline",
-                label="Mô hình Tóm tắt",
-                info="Chọn mô hình bạn muốn sử dụng"
             )
     with gr.Row():
         btn_tom_tat = gr.Button(
-            "🔍 Tóm tắt ngay",
             variant="primary",
             size="large"
         )
     output_text = gr.Textbox(
-        label="📄 Kết quả tóm tắt",
         lines=10,
-        placeholder="Kết quả sẽ hiển thị ở đây...",
         show_copy_button=True
     )
-    # Kết nối nút bấm
     btn_tom_tat.click(
-        fn=tom_tat_van_ban,
-        inputs=[input_text, phuong_phap, mo_hinh],
         outputs=output_text
     )
-    # Ví dụ minh họa
     gr.Examples(
         examples=[
-            ["Hà Nội là thủ đô của Việt Nam. Thành phố này có lịch sử hơn 1000 năm. Dân số khoảng 8 triệu người. Đây là trung tâm chính trị, kinh tế và văn hóa của cả nước."],
-            ["Trí tuệ nhân tạo (AI) đang thay đổi thế giới. Nhiều công ty lớn đang đầu tư mạnh vào lĩnh vực này. Gradio giúp lập trình viên nhanh chóng xây dựng giao diện web cho các mô hình AI."],
         ],
         inputs=input_text,
-        label="📌 Ví dụ thử nghiệm"
     )
     gr.Markdown(
         """
         ---
-        💡 **Hướng dẫn sử dụng**:
-        1. Dán văn bản cần tóm tắt vào ô bên trái.
-        2. Chọn phương pháp tiền xử lý và mô hình.
-        3. Nhấn **Tóm tắt ngay**.
-        Kết quả sẽ xuất hiện ngay lập tức (hiện đang dùng mock để demo).
         """
     )
-# Khởi chạy ứng dụng
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
+import torch
+from transformers import BartForConditionalGeneration, BartTokenizer
+import re
+import numpy as np
+import networkx as nx
+from typing import List, Dict
+from src.utils.get_model import get_summarizer
+from src.preprocessing.edu_sentences import preprocess_external_text
+from src.utils.get_model import get_extractive_model
+from src.model.baseline_extractive_model import get_trigrams
+# Hugging Face Hub repo ids handed to the model loaders; the "_edu" variants
+# are used when the preprocessing dict reports segmentation_method == "edu".
+REPO_ID_baseline_model = "Reality8081/bart-base"
+REPO_ID_baseline_model_edu = "Reality8081/bart-base-edu"
+REPO_ID_baseline_extractive_model = "Reality8081/bart-extractive"
+REPO_ID_baseline_extractive_model_edu = "Reality8081/bart-extractive-edu"
+# Use the GPU when CUDA is available, otherwise fall back to the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+def model_baseline(prepro_dict: Dict) -> str:
+    """Summarize the preprocessed article with the baseline BART summarizer.
+
+    Args:
+        prepro_dict: Preprocessing output. Reads "article" (the text to
+            summarize) and "segmentation_method" ("edu" selects the EDU
+            checkpoint, any other value the default one).
+
+    Returns:
+        The generated summary, or an error message when the article is
+        missing or blank.
+    """
+    article = prepro_dict.get("article", "")
+    # Guard clause: nothing to summarize, so no model is loaded at all
+    if not article.strip():
+        return "Lỗi: Không tìm thấy văn bản để tóm tắt."
+    use_edu = prepro_dict.get("segmentation_method") == "edu"
+    repo_id = REPO_ID_baseline_model_edu if use_edu else REPO_ID_baseline_model
+    return get_summarizer(repo_id).summarize(article)
+def model_baseline_extractive(prepro_dict: Dict, top_n: int = 5) -> str:
+    """Build an extractive summary from the highest-scoring segments.
+
+    Every token receives a saliency probability from the extractive model;
+    a segment's score is the mean probability of its tokens. Segments are
+    visited in descending score order, any segment sharing a trigram with
+    an already selected one is skipped, and the survivors are emitted in
+    their original document order.
+
+    Args:
+        prepro_dict: Preprocessing output. Reads "segments", "input_ids",
+            "attention_mask" and "segmentation_method".
+        top_n: Maximum number of segments in the summary.
+
+    Returns:
+        The selected segments joined by single spaces, or an error message
+        when the text could not be split into segments.
+    """
+    segments = prepro_dict["segments"]
+    # Check before loading any tokenizer/model so empty input fails fast
+    if not segments:
+        return "Không thể phân tách văn bản thành các câu/EDU."
+    if prepro_dict.get("segmentation_method") == "edu":
+        repo_id = REPO_ID_baseline_extractive_model_edu
+    else:
+        repo_id = REPO_ID_baseline_extractive_model
+    # get_summarizer() only accepts repo_id and returns a seq2seq BartSummarizer;
+    # the token-level classifier is loaded by get_extractive_model().
+    model = get_extractive_model(repo_id=repo_id, base_model_name="facebook/bart-large", device=device)
+    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
+    input_ids = torch.tensor([prepro_dict["input_ids"]]).to(device)
+    attention_mask = torch.tensor([prepro_dict["attention_mask"]]).to(device)
+    with torch.no_grad():
+        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+        # Sigmoid maps the logits into (0, 1) so they can be read as probabilities
+        probs = torch.sigmoid(outputs['logits']).squeeze(0).cpu().numpy()
+    segment_scores = []
+    current_idx = 1 # Skip the special <s> token at the start of the sequence
+    for seg in segments:
+        # The segment's token span is approximated by tokenizing it on its own
+        seg_len = len(tokenizer.encode(seg, add_special_tokens=False))
+        end_idx = min(current_idx + seg_len, len(probs))
+        if current_idx < end_idx:
+            # Segment score = mean probability of the tokens inside it
+            seg_score = float(np.mean(probs[current_idx:end_idx]))
+        else:
+            # Truncated away or zero tokens: score 0 instead of a NaN mean,
+            # which would otherwise be ranked first by the reversed argsort
+            seg_score = 0.0
+        segment_scores.append(seg_score)
+        current_idx += seg_len
+    ranked_indices = np.argsort(segment_scores)[::-1] # Descending by score
+    selected_indices = []
+    selected_trigrams = set()
+    for idx in ranked_indices:
+        # Stop as soon as the requested number of segments is collected
+        if len(selected_indices) >= top_n:
+            break
+        candidate_trigrams = get_trigrams(segments[idx])
+        # Trigram blocking: keep a segment only if it repeats no trigram already selected
+        if not candidate_trigrams.intersection(selected_trigrams):
+            selected_indices.append(int(idx))
+            selected_trigrams.update(candidate_trigrams)
+    # Restore document order so the summary reads coherently
+    selected_indices.sort()
+    return " ".join(segments[i] for i in selected_indices)
+def model_extractive_abstract(prepro_dict: Dict) -> str:
+    # Placeholder: no summarization happens here, the preprocessing dict is handed back unchanged.
+    # NOTE(review): the return annotation says str but a dict is returned — confirm the intended output once this model is implemented.
+    return prepro_dict
+# ====================== MAIN FUNCTION ======================
+def ATS(
+    text: str,
+    segmentation_method: str = None,
+    model: str = None,
+    reference_summary: str = None
 ) -> str:
+    """Main workflow: Raw Text → Preprocessing → Model.
+
+    Args:
+        text: Raw text to summarize.
+        segmentation_method: UI label of the preprocessing method;
+            "Sentence-based Preprocessing" selects sentence segmentation,
+            any other value selects EDU segmentation.
+        model: UI label of the summarization model.
+        reference_summary: Optional reference summary forwarded to the
+            preprocessing step.
+
+    Returns:
+        The output of the selected model.
+    """
+    # Step 1: Preprocessing
+    if segmentation_method == "Sentence-based Preprocessing":
+        prepro_dict = preprocess_external_text(text, reference_summary, segmentation_method='sentence')
     else:
+        prepro_dict = preprocess_external_text(text, reference_summary, segmentation_method='edu')
+    # Step 2: Pick the model. The radio button sends "Baseline Model"; the longer
+    # label is still accepted so existing callers keep working.
+    if model in ("Baseline Model", "Baseline Model: TextRank + Vanilla BART"):
+        result = model_baseline(prepro_dict)
+    elif model == "Baseline Model with Extractive":
+        result = model_baseline_extractive(prepro_dict)
+    else:
+        result = model_extractive_abstract(prepro_dict)
+    return result
+# ====================== GRADIO INTERFACE ======================
 with gr.Blocks(
+    title="Automated Text Summarization System",
     theme=gr.themes.Soft(),
     css="""
     .gradio-container {max-width: 1200px; margin: auto;}
 ) as demo:
     gr.Markdown(
         """
+        # 🚀 Automated Text Summarization System
+        **Input text → Select method & model → Get results instantly**
         """
     )
     with gr.Row():
         with gr.Column(scale=3):
             input_text = gr.Textbox(
+                label="📝 Text to Summarize",
+                placeholder="Paste your long text here (up to several thousand words)...",
                 lines=12,
                 max_lines=30,
                 show_copy_button=True
             )
         with gr.Column(scale=1):
+            gr.Markdown("### ⚙️ Settings")
+            method = gr.Radio(
                 choices=[
+                    "Sentence-based Preprocessing",
+                    "Semantic-based Preprocessing (EDU)"
                 ],
+                value="Sentence-based Preprocessing",
+                label="Preprocessing Method",
+                info="Choose how to clean the text before summarization"
             )
+            model = gr.Radio(
                 choices=[
+                    "Baseline Model",
+                    "Baseline Model with Extractive",
+                    "Extractive and Abstractive Model"
                 ],
+                value="Baseline Model",
+                label="Summarization Model",
+                info="Select the model you want to use"
             )
     with gr.Row():
         btn_tom_tat = gr.Button(
+            "🔍 Summarize Now",
             variant="primary",
             size="large"
         )
     output_text = gr.Textbox(
+        label="📄 Summary Result",
         lines=10,
+        placeholder="The result will appear here...",
         show_copy_button=True
     )
+    # Connect button click
     btn_tom_tat.click(
+        fn=ATS,
+        inputs=[input_text, method, model],
         outputs=output_text
     )
+    # Examples
     gr.Examples(
         examples=[
+            ["Hanoi is the capital of Vietnam. The city has a history of over 1000 years. The population is about 8 million people. It is the political, economic, and cultural center of the country."],
+            ["Artificial Intelligence (AI) is changing the world. Many large companies are investing heavily in this field. Gradio helps developers quickly build web interfaces for AI models."],
         ],
         inputs=input_text,
+        label="📌 Test Examples"
     )
     gr.Markdown(
         """
         ---
+        💡 **User Guide**:
+        1. Paste the text to be summarized into the left box.
+        2. Select the preprocessing method and model.
+        3. Click **Summarize Now**.
+        The result will appear instantly (currently using mock data for demo).
         """
     )
+# Launch the app
 if __name__ == "__main__":
     demo.launch()

src/model/baseline_extractive_model.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import torch
+import torch.nn as nn
+import numpy as np
+from transformers import BartModel, BartTokenizer
+class BartExtractiveSummarizer(nn.Module):
+    """BART encoder followed by a linear head that emits one logit per token."""
+    def __init__(self, model_name="facebook/bart-large"):
+        """Load the encoder of ``model_name`` and attach a 1-unit linear classifier."""
+        super().__init__()
+        bart_encoder = BartModel.from_pretrained(model_name).encoder
+        self.encoder = bart_encoder
+        self.classifier = nn.Linear(bart_encoder.config.hidden_size, 1)
+    def forward(self, input_ids, attention_mask, saliency_mask=None, **kwargs):
+        """Score every token and, when labels are given, compute the training loss.
+
+        Args:
+            input_ids: Token ids fed to the encoder.
+            attention_mask: 1 for real tokens; only those positions enter the loss.
+            saliency_mask: Optional 0/1 token labels; when given, a weighted
+                binary cross-entropy loss is returned as well.
+            **kwargs: Accepted and ignored.
+
+        Returns:
+            {"logits": ...} without labels, {"loss": ..., "logits": ...} with labels.
+        """
+        hidden_states = self.encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask
+        ).last_hidden_state
+        logits = self.classifier(hidden_states).squeeze(-1)
+        if saliency_mask is None:
+            return {"logits": logits}
+        # Flatten and drop the positions masked out by attention_mask
+        keep = attention_mask.view(-1) == 1
+        kept_logits = logits.view(-1)[keep]
+        kept_labels = saliency_mask.view(-1)[keep].float()
+        # Per-batch class weight: negatives / positives, capped at 10
+        positives = kept_labels.sum()
+        negatives = kept_labels.size(0) - positives
+        if positives > 0:
+            pos_weight = torch.clamp(negatives / positives, max=10.0)
+        else:
+            pos_weight = torch.tensor(1.0).to(logits.device)
+        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
+        return {"loss": criterion(kept_logits, kept_labels), "logits": logits}
+def get_trigrams(text: str):
+    """Return the set of lowercase 3-word sequences in ``text`` (used for trigram blocking)."""
+    tokens = text.lower().split()
+    # Zipping three staggered views yields every run of three consecutive words
+    return set(zip(tokens, tokens[1:], tokens[2:]))

src/model/baseline_model.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import networkx as nx
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import torch
+from transformers import BartForConditionalGeneration, BartTokenizer
+def textrank_summarize(sentences, top_n=3):
+    """Extract the most central sentences with TextRank over TF-IDF similarity.
+
+    Args:
+        sentences: List of sentences (or EDUs) of a single document.
+        top_n: Number of sentences to keep.
+
+    Returns:
+        The selected sentences joined by spaces, in document order. When
+        the document has at most ``top_n`` sentences all of them are
+        returned; when the graph cannot be built the first ``top_n``
+        sentences are returned instead.
+    """
+    # Document too short to rank
+    if len(sentences) <= top_n:
+        return " ".join(sentences)
+    try:
+        # Step 1: TF-IDF vectors for the sentences of this document
+        tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform(sentences)
+        # Step 2: pairwise cosine similarity
+        similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
+        # Step 3: similarity graph and PageRank scores
+        scores = nx.pagerank(nx.from_numpy_array(similarity_matrix))
+    except Exception:
+        # Deliberate best-effort fallback to Lead-N when the graph cannot be
+        # built (e.g. empty sentences or an empty vocabulary)
+        return " ".join(sentences[:top_n])
+    # Step 4: rank sentence *indices* rather than sentence texts, so duplicated
+    # sentences are not collapsed onto the index of their first occurrence
+    ranked = sorted(
+        range(len(sentences)),
+        key=lambda i: (scores[i], sentences[i]),
+        reverse=True,
+    )
+    # Restore document order so the summary stays readable
+    top_indices = sorted(ranked[:max(top_n, 0)])
+    return " ".join(sentences[i] for i in top_indices)
+class BartSummarizer:
+    """Thin wrapper around a BART seq2seq checkpoint that generates summaries."""
+    def __init__(self, model_path="facebook/bart-base"):
+        """
+        Load the tokenizer and the model.
+        model_path may be a Hugging Face repo or a local directory holding the weights.
+        """
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Loading BART model from '{model_path}' onto {self.device}...")
+        self.tokenizer = BartTokenizer.from_pretrained(model_path)
+        self.model = BartForConditionalGeneration.from_pretrained(model_path)
+        self.model.to(self.device)
+        self.model.eval() # Inference mode from the start
+    def summarize(self, text, max_input_length=512, max_output_length=128, min_output_length=30):
+        """
+        Generate a summary for one input text.
+
+        Args:
+            text: Text to summarize.
+            max_input_length: Inputs are truncated to this many tokens.
+            max_output_length: Upper bound on the generated length.
+            min_output_length: Lower bound on the generated length.
+
+        Returns:
+            The decoded summary with special tokens removed.
+        """
+        # Truncate the input to bound memory use, in line with training
+        tokenizer_options = dict(
+            max_length=max_input_length,
+            truncation=True,
+            padding=True,
+            return_tensors="pt",
+        )
+        generation_options = dict(
+            max_length=max_output_length,
+            min_length=min_output_length,
+            num_beams=4,
+            length_penalty=2.0,       # Positive values favor longer beams
+            no_repeat_ngram_size=3,   # Never repeat the same 3-gram
+            early_stopping=True,
+        )
+        with torch.no_grad():
+            encoded = self.tokenizer(text, **tokenizer_options).to(self.device)
+            generated = self.model.generate(
+                input_ids=encoded["input_ids"],
+                attention_mask=encoded["attention_mask"],
+                **generation_options,
+            )
+        # Decode the best sequence back to text
+        return self.tokenizer.decode(generated[0], skip_special_tokens=True)
+# How to call it from the app:
+# summarizer = BartSummarizer("path_to_your_model_on_huggingface")
+# result = summarizer.summarize("Text to summarize...")

src/preprocessing/edu_sentences.py ADDED Viewed

	@@ -0,0 +1,178 @@

+# ========================== edu_sentences.py ==========================
+import re
+import nltk
+import numpy as np
+import spacy
+from transformers import BartTokenizer
+from rouge_score import rouge_scorer
+from typing import List, Dict, Optional, Union
+# Fetch the NLTK sentence-tokenizer data at import time (quiet: no console output)
+nltk.download('punkt', quiet=True)
+nltk.download('punkt_tab', quiet=True)
+# Load spaCy exactly once (kept light by disabling the components that are not needed)
+# NOTE(review): segment_text() reads token.pos_ while "attribute_ruler" and "tok2vec" are disabled here — confirm POS tags are still populated with this pipeline.
+nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer", "attribute_ruler", "tok2vec"])
+# Shared BART tokenizer used by preprocess_external_text()
+tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
+# Shared ROUGE-1/2/L scorer (with stemming) used by greedy_rouge_selection()
+scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
+def clean_text(text: str) -> str:
+    """Clean a text (shared by every pipeline).
+
+    Removes URLs, e-mail addresses and @handles, drops every character
+    outside the kept set, then collapses whitespace runs into single spaces.
+
+    Args:
+        text: Raw text; any non-string value yields "".
+
+    Returns:
+        The cleaned, stripped text.
+    """
+    if not isinstance(text, str):
+        return ""
+    # Remove URLs ("http\S+" already covers https), e-mails and @handles
+    text = re.sub(r'http\S+|www\S+', '', text)
+    text = re.sub(r'\S+@\S+', '', text)
+    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
+    # Keep word characters, whitespace and ! " # $ % & ' ( ) * + , - . / : ; < = > ?
+    # The previous class spelled part of this set as the accidental range `"-?`;
+    # it is written out here with an escaped hyphen. The kept set is unchanged
+    # on purpose so the cleaning stays identical to what was used so far.
+    text = re.sub(r"[^\w\s!\"#$%&'()*+,\-./:;<=>?]", '', text)
+    # Normalize whitespace
+    text = re.sub(r'\s+', ' ', text).strip()
+    return text
+def segment_text(text: str, method: str = 'sentence') -> tuple[List[str], str]:
+    """
+    Split the text with the selected method.
+
+    Args:
+        text: Raw text; it is cleaned with clean_text() before splitting.
+        method: 'sentence' for NLTK sentences, 'edu' for clause-like units.
+
+    Returns: (list_segments, cleaned_text)
+
+    Raises:
+        ValueError: If method is neither 'sentence' nor 'edu'.
+    """
+    cleaned = clean_text(text)
+    if method == 'sentence':
+        segments = nltk.sent_tokenize(cleaned)
+        return segments, cleaned
+    elif method == 'edu':
+        # Same logic as the EDU notebook (split into sentences first → EDUs with spaCy)
+        sentences = nltk.sent_tokenize(cleaned)
+        processed_docs = list(nlp.pipe(sentences, batch_size=500))
+        all_edus = []
+        for doc in processed_docs:
+            temp_edus, current_segment = [], []
+            for token in doc:
+                current_segment.append(token.text_with_ws)
+                # Close the unit right after a conjunction, comma or semicolon,
+                # but only once it already holds more than 3 tokens
+                if (token.pos_ in ["SCONJ", "CCONJ"] or token.text in [",", ";"]) and len(current_segment) > 3:
+                    temp_edus.append("".join(current_segment).strip())
+                    current_segment = []
+            # Whatever is left after the last split point forms the final unit
+            if current_segment:
+                temp_edus.append("".join(current_segment).strip())
+            # A sentence that produced no unit is kept whole
+            all_edus.extend(temp_edus if temp_edus else [doc.text])
+        return all_edus, cleaned
+    else:
+        raise ValueError("method phải là 'sentence' hoặc 'edu'")
+def greedy_rouge_selection(segments: List[str], reference_summary: str, top_k: int = 3) -> List[int]:
+    """Greedy ROUGE selection (shared by every pipeline).
+
+    Repeatedly adds the segment that raises the mean of the ROUGE-1,
+    ROUGE-2 and ROUGE-L F-measures the most, until ``top_k`` segments are
+    chosen or no remaining segment improves the score.
+
+    Returns:
+        One 0/1 label per segment (1 = selected); [] when there are no segments.
+    """
+    if not segments:
+        return []
+    chosen = []
+    best_rouge = 0.0
+    budget = min(top_k, len(segments))
+    while len(chosen) < budget:
+        round_best_idx, round_best_score = -1, best_rouge
+        prefix = [segments[j] for j in chosen]
+        for idx, seg in enumerate(segments):
+            if idx in chosen:
+                continue
+            rouge = scorer.score(reference_summary, " ".join(prefix + [seg]))
+            mean_f = (rouge['rouge1'].fmeasure +
+                      rouge['rouge2'].fmeasure +
+                      rouge['rougeL'].fmeasure) / 3.0
+            # Strict ">" keeps the earliest segment on ties
+            if mean_f > round_best_score:
+                round_best_idx, round_best_score = idx, mean_f
+        # No remaining segment improves the score: stop early
+        if round_best_idx == -1:
+            break
+        chosen.append(round_best_idx)
+        best_rouge = round_best_score
+    chosen_set = set(chosen)
+    return [1 if idx in chosen_set else 0 for idx in range(len(segments))]
+def create_saliency_mask(input_ids: List[int], segments: List[str],
+                        ext_labels: List[int], tokenizer) -> List[int]:
+    """Project segment-level labels down to a token-level saliency mask.
+
+    Args:
+        input_ids: Token ids of the encoded article.
+        segments: Segments of the article, in document order.
+        ext_labels: 0/1 label per segment (1 = salient).
+        tokenizer: Object exposing ``eos_token_id`` and
+            ``encode(text, add_special_tokens=False)``.
+
+    Returns:
+        A list of 0/1 values, one per entry of ``input_ids``; [] when
+        ``input_ids`` is empty.
+    """
+    # Empty input: nothing to mark (indexing position 0 would raise IndexError)
+    if len(input_ids) == 0:
+        return []
+    mask = np.zeros(len(input_ids), dtype=int)
+    # The first position is always marked (presumably the <s> token — confirm),
+    # and so is the last one when it is the EOS token
+    mask[0] = 1
+    if input_ids[-1] == tokenizer.eos_token_id:
+        mask[-1] = 1
+    current_idx = 1
+    for seg_idx, segment in enumerate(segments):
+        # Stop once the cursor reaches the final position
+        if current_idx >= len(input_ids) - 1:
+            break
+        # A segment's span is approximated by tokenizing it on its own
+        token_len = len(tokenizer.encode(segment, add_special_tokens=False))
+        if seg_idx < len(ext_labels) and ext_labels[seg_idx] == 1:
+            # Clamp so the final position is never overwritten by a segment
+            end_idx = min(current_idx + token_len, len(input_ids) - 1)
+            mask[current_idx:end_idx] = 1
+        current_idx += token_len
+    return mask.tolist()
+def preprocess_external_text(
+    text: str,
+    reference_summary: Optional[str] = None,
+    segmentation_method: str = 'sentence',
+    top_k: int = 3,
+    max_length: int = 1024
+) -> Dict:
+    """Clean, segment and tokenize one text; add training targets when a reference is given.
+
+    Args:
+        text: Raw article text.
+        reference_summary: Optional reference summary; when present the
+            extractive labels, saliency mask and target ids are added.
+        segmentation_method: 'sentence' or 'edu', forwarded to segment_text().
+        top_k: Number of segments picked by the greedy ROUGE selection.
+        max_length: Truncation length for the article tokens.
+
+    Returns:
+        Dict with "article", "segments", "segmentation_method", "input_ids"
+        and "attention_mask", plus "extractive_labels", "saliency_mask",
+        "labels" and "reference_summary" when a reference is given.
+    """
+    segments, cleaned_article = segment_text(text, method=segmentation_method)
+    encoded = tokenizer(cleaned_article, max_length=max_length, truncation=True, padding=False)
+    result = {
+        "article": cleaned_article,
+        "segments": segments,                    # list of sentences or list of EDUs
+        "segmentation_method": segmentation_method,
+        "input_ids": encoded["input_ids"],
+        "attention_mask": encoded["attention_mask"],
+    }
+    # Inference path: no reference summary, nothing more to compute
+    if reference_summary is None:
+        return result
+    # A reference summary is available → derive the extractive labels
+    ref_clean = clean_text(reference_summary)
+    extractive_labels = greedy_rouge_selection(segments, ref_clean, top_k=top_k)
+    saliency_mask = create_saliency_mask(encoded["input_ids"], segments, extractive_labels, tokenizer)
+    targets = tokenizer(ref_clean, max_length=128, truncation=True, padding=False)
+    result["extractive_labels"] = extractive_labels
+    result["saliency_mask"] = saliency_mask
+    result["labels"] = targets["input_ids"]      # for the abstractive part
+    result["reference_summary"] = ref_clean
+    return result
+def preprocess_batch(
+    texts: List[str],
+    reference_summaries: Optional[List[str]] = None,
+    segmentation_method: str = 'sentence',
+    top_k: int = 3
+) -> List[Dict]:
+    """Preprocess several texts at once (used for the batch demo).
+
+    Args:
+        texts: Raw article texts.
+        reference_summaries: Optional references, one per text.
+        segmentation_method: 'sentence' or 'edu'.
+        top_k: Number of segments picked per text.
+
+    Returns:
+        One preprocessing dict per text, in input order.
+
+    Raises:
+        ValueError: If the number of references differs from the number of texts.
+    """
+    refs = [None] * len(texts) if reference_summaries is None else reference_summaries
+    if len(refs) != len(texts):
+        raise ValueError("Số lượng reference_summaries phải bằng số lượng texts")
+    results = []
+    for txt, ref in zip(texts, refs):
+        results.append(preprocess_external_text(txt, ref, segmentation_method, top_k))
+    return results

src/utils/get_model.py ADDED Viewed

	@@ -0,0 +1,35 @@

+from src.model.baseline_model import BartSummarizer
+from src.model.baseline_extractive_model import BartExtractiveSummarizer
+loaded_summarizers = {}
+import torch
+from huggingface_hub import hf_hub_download
+def get_summarizer(repo_id: str):
+    """Return the BartSummarizer for ``repo_id``, building and caching it on first use."""
+    if repo_id in loaded_summarizers:
+        return loaded_summarizers[repo_id]
+    summarizer = BartSummarizer(model_path=repo_id)
+    loaded_summarizers[repo_id] = summarizer
+    return summarizer
+def get_extractive_model(repo_id: str, base_model_name: str = "facebook/bart-large", device: torch.device = "cpu"):
+    """Load and cache the custom extractive model from the Hugging Face Hub.
+
+    The cache is the module-level ``loaded_summarizers`` dict, shared with
+    get_summarizer() and keyed by ``repo_id`` only, so ``base_model_name``
+    and ``device`` take effect on the first load of a repo.
+
+    Args:
+        repo_id: Hub repo holding the "model_state.bin" weights file.
+        base_model_name: BART checkpoint used to build the architecture.
+        device: Device the weights are mapped to and the model is moved to.
+
+    Returns:
+        The BartExtractiveSummarizer in eval mode.
+    """
+    if repo_id in loaded_summarizers:
+        return loaded_summarizers[repo_id]
+    print(f"Đang tải mô hình Extractive từ repo: {repo_id}...")
+    # Build the architecture; the checkpoint weights are loaded into it below
+    extractive_model = BartExtractiveSummarizer(model_name=base_model_name)
+    # hf_hub_download pulls the weights file into the local cache and returns its path
+    weights_path = hf_hub_download(repo_id=repo_id, filename="model_state.bin")
+    # NOTE(review): torch.load unpickles the file — only point this at trusted repos
+    state_dict = torch.load(weights_path, map_location=device)
+    extractive_model.load_state_dict(state_dict)
+    extractive_model.to(device)
+    extractive_model.eval() # Inference mode
+    loaded_summarizers[repo_id] = extractive_model
+    return extractive_model