Spaces:
Runtime error
Runtime error
"""
DFK Content Classification — HuggingFace Spaces (CPU Basic — free tier)
=======================================================================
Model : ggapar/Ministral-3-8B-Base-2512-DFK (LoRA adapter)
Base  : mistralai/Ministral-3-8B-Base-2512 (float32, CPU)
GPU   : none — CPU Basic (free tier, no GPU)
Note  : inference is slow (~2-5 minutes per request) because it runs on CPU only
"""
| import os | |
| import re | |
| import gc | |
| import torch | |
| import numpy as np | |
| import gradio as gr | |
| from collections import Counter | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, Mistral3ForConditionalGeneration | |
| from peft import PeftModel | |
# ================================================================
# CONFIGURATION
# ================================================================
BASE_MODEL = "mistralai/Ministral-3-8B-Base-2512"
ADAPTER_REPO = "ggapar/Ministral-3-8B-Base-2512-DFK"
# Hub access token taken from the environment; empty string when unset.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# System message sent with every classification prompt (runtime text, kept
# in Indonesian because the model is prompted in Indonesian).
SYSTEM_PROMPT = (
    "Anda adalah sistem deteksi konten DFK (Disinformasi, Fitnah, Kebencian). "
    "Klasifikasikan teks ke dalam: Fakta, Disinformasi, Fitnah, atau Ujaran Kebencian. "
    "Berikan label dan penjelasan yang jelas."
)

# Presentation data per label key:
#   (emoji, background colour, text colour, short description).
# The emoji were mis-encoded in the previous revision; they are restored to
# coloured circles matching each entry's colour scheme.
LABEL_INFO = {
    "fakta": ("🟢", "#dcfce7", "#166534", "Konten yang sesuai dengan fakta"),
    "disinformasi": ("🔴", "#fee2e2", "#991b1b", "Informasi yang menyesatkan"),
    "fitnah": ("🟠", "#ffedd5", "#9a3412", "Tuduhan tanpa bukti"),
    "ujaran_kebencian": ("⚫", "#f1f5f9", "#1e293b", "Konten menyerang kelompok tertentu"),
    "unknown": ("⚪", "#f8fafc", "#64748b", "Label tidak terdeteksi"),
}
# ================================================================
# MODEL LOADING — tokenizer, base model and LoRA adapter are all loaded
# onto the CPU once, at import time. No GPU is requested anywhere below.
# ================================================================
# An unset token is stored as ""; the loaders are given None in that case.
_hub_token = HF_TOKEN or None

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    ADAPTER_REPO,
    trust_remote_code=True,
    token=_hub_token,
)
# Batched tokenisation below uses padding=True, which needs a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loading base model (CPU, float32)...")
# Ministral-3-8B uses the Mistral3 architecture (a VLM), so it has to be
# loaded with Mistral3ForConditionalGeneration rather than AutoModelForCausalLM.
# NOTE(review): float32 weights for an 8B-parameter model need on the order
# of 32 GB of RAM — confirm the Space hardware can hold them.
base_model = Mistral3ForConditionalGeneration.from_pretrained(
    BASE_MODEL,
    dtype=torch.float32,  # CPU inference is run in float32
    device_map="cpu",
    trust_remote_code=True,
    token=_hub_token,
    low_cpu_mem_usage=True,
)

print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(
    base_model,
    ADAPTER_REPO,
    token=_hub_token,
)
model.eval()
print("✅ Model loaded ke CPU — siap inference (estimasi 2-5 menit/request)")
# ================================================================
# HELPER FUNCTIONS
# ================================================================
# Label phrases in priority order; a hit is reported with spaces turned into
# underscores (e.g. "ujaran kebencian" -> "ujaran_kebencian").
_LABEL_KEYWORDS = ("ujaran kebencian", "disinformasi", "fitnah", "fakta")

# Explicit "label: <value>" field. The value runs until a line break or
# punctuation, so a field ending at a newline or at end-of-text is honoured
# too. Optional markdown "**" around the value is skipped.
_LABEL_FIELD_RE = re.compile(r"label\s*:\s*\**\s*([^\n.,:;*]+)")


def _first_keyword(fragment: str):
    """Return the label key of the highest-priority keyword in *fragment*, or None."""
    for keyword in _LABEL_KEYWORDS:
        if keyword in fragment:
            return keyword.replace(" ", "_")
    return None


def extract_label(text: str) -> str:
    """Extract the predicted label key from a model completion.

    Lookup order:
      1. an explicit ``Label: ...`` field, if present and recognisable;
      2. a keyword within the first 80 characters;
      3. a keyword anywhere in the text.

    The explicit field is consulted first so that a label mentioned later in
    the explanation (e.g. "... bukan disinformasi") cannot override it.
    Matching is case-insensitive and treats ``_`` as a space, so both
    "ujaran kebencian" and "ujaran_kebencian" are recognised.

    Args:
        text: Raw generated text. ``None`` or empty yields ``"unknown"``.

    Returns:
        One of ``"fakta"``, ``"disinformasi"``, ``"fitnah"``,
        ``"ujaran_kebencian"`` or ``"unknown"``.
    """
    normalized = (text or "").lower().strip().replace("_", " ")

    field = _LABEL_FIELD_RE.search(normalized)
    if field:
        found = _first_keyword(field.group(1))
        if found:
            return found

    # No usable explicit field: prefer the opening of the answer, then fall
    # back to the whole text.
    for fragment in (normalized[:80], normalized):
        found = _first_keyword(fragment)
        if found:
            return found
    return "unknown"
def extract_reasoning(text: str) -> str:
    """Pull the explanation part out of a model completion.

    If the text contains a ``penjelasan:`` marker (any letter case), everything
    after it is returned, stripped. Otherwise the first line is treated as the
    label line and the remaining lines are joined with single spaces; a
    one-line text is returned stripped as-is.
    """
    marker = re.search(r'penjelasan\s*:\s*(.*)', text, re.IGNORECASE | re.DOTALL)
    if marker is not None:
        return marker.group(1).strip()

    stripped = text.strip()
    _label_line, *remainder = stripped.split('\n')
    if not remainder:
        return stripped
    return ' '.join(remainder).strip()
def compute_mtla_confidence(scores_list, gen_ids, K: int = 10) -> float:
    """Turn the probabilities of the first generated tokens into a 0-1 score.

    For each of the first ``K`` generation steps, the probability assigned to
    the token that was actually generated is read from the softmax of that
    step's scores. The mean log-probability is then squashed with a shifted,
    scaled logistic function and rounded to four decimals.

    Args:
        scores_list: Per-step score tensors; row 0 of each tensor is used, so
            each is expected to be 2-D with a leading dimension of 1.
        gen_ids: Generated token ids, one per step (tensor elements or ints).
        K: Maximum number of leading steps to average over.

    Returns:
        Confidence in ``[0, 1]``. ``0.0`` when there is nothing to score
        (no steps, no ids, or ``K <= 0``) instead of propagating NaN.
    """
    steps = min(K, len(scores_list), len(gen_ids))
    if steps <= 0:
        # np.mean([]) would yield NaN, which formats as "nan%" downstream and
        # compares False against every threshold.
        return 0.0

    log_probs = []
    for step_scores, token_id in zip(scores_list[:steps], gen_ids[:steps]):
        probs = torch.softmax(step_scores, dim=-1)
        tok_prob = probs[0, int(token_id)].item()
        # Floor the probability so a zero does not produce log(0) = -inf.
        log_probs.append(np.log(max(tok_prob, 1e-10)))

    avg_lp = float(np.mean(log_probs))
    # Logistic squash centred at an average log-prob of -2.5, slope 1.5.
    return round(float(1.0 / (1.0 + np.exp(-(avg_lp + 2.5) * 1.5))), 4)
# ================================================================
# INFERENCE — runs on the CPU; no GPU decorator is applied here
# ================================================================
def classify_dfk(text: str, num_trials: int, temperature: float):
    """Classify *text* by sampling several completions and majority-voting.

    The chat prompt is replicated ``num_trials`` times and generated as one
    sampled batch. Each completion is parsed into a label, a reasoning string
    and a confidence score; the most frequent label wins.

    Args:
        text: Content to classify. Blank input returns placeholders
            without touching the model.
        num_trials: Number of sampled completions. Coerced to ``int`` and
            raised to at least 1.
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        An 8-tuple, in this order: label text, confidence percentage,
        consistency ("wins/trials"), ambiguity status, label description,
        reasoning of the most confident winning trial, per-trial table rows,
        and the HTML badge for the label.
    """
    if not text or not text.strip():
        return ("—", "0%", "—", "—", "—",
                "Masukkan teks yang ingin diklasifikasi.", [], "")

    # Sliders deliver floats and API callers may send 0 or a negative value;
    # the vote below needs at least one trial.
    n_trials = max(1, int(num_trials))
    device = "cpu"

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Klasifikasikan konten berikut:\n{text}"},
    ]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Every row is the same prompt, so the batch needs no actual padding.
    # NOTE(review): truncation at 1900 tokens may cut the generation prompt
    # off the end of a very long input — confirm the truncation side.
    inputs = tokenizer(
        [prompt] * n_trials,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1900,
    ).to(device)

    with torch.inference_mode():
        out = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=float(temperature),
            do_sample=True,
            return_dict_in_generate=True,
            output_scores=True,
            use_cache=True,
        )

    # Collect one result per trial; generated tokens start after the prompt.
    prompt_len = inputs.input_ids.shape[1]
    trials = []
    for i in range(n_trials):
        gen_ids = out.sequences[i][prompt_len:]
        gen_text = tokenizer.decode(gen_ids, skip_special_tokens=True)
        scores_i = [s[i:i + 1] for s in out.scores]  # keep a leading batch dim of 1
        trials.append({
            "label": extract_label(gen_text),
            "reasoning": extract_reasoning(gen_text),
            "confidence": compute_mtla_confidence(scores_i, gen_ids, K=10),
        })

    # Majority vote over labels; statistics come from the winning trials only.
    vote = Counter(t["label"] for t in trials)
    best_label, count = vote.most_common(1)[0]
    winners = [t for t in trials if t["label"] == best_label]
    avg_conf = float(np.mean([t["confidence"] for t in winners]))
    best_reason = max(winners, key=lambda x: x["confidence"])["reasoning"]

    # A lone vote only signals disagreement when more than one trial ran;
    # with a single trial only the confidence threshold applies.
    # "not (>=)" also classifies a NaN confidence as ambiguous.
    no_agreement = n_trials > 1 and count == 1
    is_ambiguous = no_agreement or not (avg_conf >= 0.45)

    emoji, bg, fg, desc = LABEL_INFO.get(best_label, LABEL_INFO["unknown"])
    label_display = f"{emoji} {best_label.upper().replace('_', ' ')}"
    conf_pct = f"{avg_conf * 100:.1f}%"
    consistency = f"{count}/{n_trials}"
    ambig_status = "⚠️ Ambigu — model ragu-ragu" if is_ambiguous else "✅ Model yakin"

    # best_label is always one of the fixed label keys, so it is safe to
    # interpolate into HTML.
    label_html = f"""
    <div style="
        background:{bg}; color:{fg};
        padding:12px 24px; border-radius:12px;
        font-size:1.4rem; font-weight:700;
        text-align:center; display:inline-block;
        border: 2px solid {fg}30; margin:8px 0;
    ">
        {label_display}
    </div>
    """

    trial_data = [
        [
            f"Trial {i + 1}",
            f"{LABEL_INFO.get(t['label'], LABEL_INFO['unknown'])[0]} "
            f"{t['label'].upper().replace('_', ' ')}",
            f"{t['confidence'] * 100:.1f}%",
            # Keep table cells short; the full text of the winner is returned separately.
            (t['reasoning'][:150] + "...") if len(t['reasoning']) > 150 else t['reasoning'],
        ]
        for i, t in enumerate(trials)
    ]

    # Release generation buffers before the next request.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return (
        label_display, conf_pct, consistency,
        ambig_status, desc, best_reason,
        trial_data, label_html,
    )
# ================================================================
# GRADIO UI
# ================================================================
# NOTE(review): several icons in the previous revision were mis-encoded.
# Those that could be decoded are restored; the others use a best-guess
# replacement — adjust freely.
css = """
.gradio-container { max-width: 900px !important; margin: auto; }
footer { display: none !important; }
"""

# Page header: title, subtitle, CPU-mode notice and the label legend.
HEADER_HTML = """
<div style="text-align:center;padding:1.5rem 0 0.5rem">
  <h1 style="font-size:2rem;font-weight:800;color:#1e293b;margin:0">
    🛡️ DFK Content Classifier
  </h1>
  <p style="color:#64748b;margin:8px 0 4px">
    Deteksi Disinformasi · Fitnah · Ujaran Kebencian · Fakta
  </p>
  <p style="color:#94a3b8;font-size:0.85rem;margin:0">
    Model: <b>Ministral-3-8B</b> + LoRA Fine-tuning · Bahasa Indonesia
  </p>
  <div style="background:#fef9c3;color:#854d0e;padding:6px 16px;border-radius:8px;font-size:0.82rem;margin:8px auto;display:inline-block">
    ⏳ CPU Mode — estimasi waktu inference: 2-5 menit per request
  </div>
  <div style="display:flex;justify-content:center;gap:8px;margin:12px 0;flex-wrap:wrap">
    <span style="background:#dcfce7;color:#166534;padding:3px 12px;border-radius:20px;font-size:0.82rem">🟢 Fakta</span>
    <span style="background:#fee2e2;color:#991b1b;padding:3px 12px;border-radius:20px;font-size:0.82rem">🔴 Disinformasi</span>
    <span style="background:#ffedd5;color:#9a3412;padding:3px 12px;border-radius:20px;font-size:0.82rem">🟠 Fitnah</span>
    <span style="background:#f1f5f9;color:#1e293b;padding:3px 12px;border-radius:20px;font-size:0.82rem">⚫ Ujaran Kebencian</span>
  </div>
</div>
"""

# Markdown for the "API usage" accordion. Kept flush-left so that no line is
# rendered as an indented code block.
API_USAGE_MD = """
### Python
```python
from gradio_client import Client
client = Client("ggapar/dfk-classifier")
result = client.predict(
    text = "Teks yang ingin dicek",
    num_trials = 3,
    temperature = 0.7,
    api_name = "/classify_dfk"
)
# result[0] = Label, result[1] = Trust Score, result[5] = Reasoning
print(result[0], result[1], result[5])
```
### Install
```bash
pip install gradio_client
```
"""

# Page footer with a link to the adapter repository.
FOOTER_HTML = """
<div style="text-align:center;color:#94a3b8;font-size:0.78rem;margin-top:1rem">
  Model: <a href="https://huggingface.co/ggapar/Ministral-3-8B-Base-2512-DFK"
  target="_blank">ggapar/Ministral-3-8B-Base-2512-DFK</a> ·
  AITF Team 2025
</div>
"""

with gr.Blocks(
    title="DFK Content Classifier",
    theme=gr.themes.Soft(primary_hue="red", neutral_hue="slate"),
    css=css,
) as demo:
    gr.HTML(HEADER_HTML)

    # --- Input area: text + sampling controls on the left, examples on the right
    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="📝 Teks yang ingin diklasifikasi",
                placeholder="Masukkan klaim, berita, atau konten...",
                lines=5,
            )
            with gr.Row():
                num_trials = gr.Slider(
                    minimum=1, maximum=5, value=3, step=1,
                    label="Jumlah Trial",
                    info="Lebih banyak = lebih akurat tapi lebih lambat",
                )
                temperature = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.7, step=0.1,
                    label="Temperature",
                    info="0.1 = deterministik, 1.0 = kreatif",
                )
            classify_btn = gr.Button(
                "🔍 Klasifikasi Sekarang",
                variant="primary",
                size="lg",
            )
        with gr.Column(scale=1):
            gr.Examples(
                examples=[
                    ["Air rebusan bawang putih bisa menyembuhkan virus COVID dalam 24 jam!"],
                    ["BPOM mengkonfirmasi vaksin COVID-19 sudah melalui uji klinis tiga fase sesuai standar WHO."],
                    ["Gubernur X terbukti korupsi dana bansos, ada bukti transfer ke rekening keluarganya."],
                    ["Orang dari suku X itu memang tidak bisa dipercaya dan selalu bikin masalah."],
                ],
                inputs=text_input,
                label="💡 Contoh Teks",
            )

    gr.HTML("<hr style='margin:1rem 0;border-color:#e2e8f0'>")

    # --- Result area
    label_html_out = gr.HTML()
    with gr.Row():
        label_out = gr.Textbox(
            label="🏷️ Label", interactive=False,
        )
        conf_out = gr.Textbox(
            label="🎯 Trust Score (MTLA)", interactive=False,
            info="Keyakinan model via Multi-Token Logit Averaging",
        )
        consistency_out = gr.Textbox(
            label="🗳️ Konsistensi", interactive=False,
        )
        ambig_out = gr.Textbox(
            label="📊 Status", interactive=False,
        )
    desc_out = gr.Textbox(
        label="📋 Deskripsi Label", interactive=False,
    )
    reasoning_out = gr.Textbox(
        label="💬 Reasoning Model", lines=4, interactive=False,
        info="Penjelasan model tentang keputusannya",
    )
    with gr.Accordion("🔬 Detail Per Trial", open=False):
        trial_table = gr.Dataframe(
            headers=["Trial", "Label", "Trust Score", "Reasoning"],
            wrap=True,
        )
    with gr.Accordion("🔌 Cara Pakai via API", open=False):
        gr.Markdown(API_USAGE_MD)

    gr.HTML(FOOTER_HTML)

    # --- Wiring: the order of `outputs` must match classify_dfk's return tuple.
    outputs = [
        label_out, conf_out, consistency_out,
        ambig_out, desc_out, reasoning_out,
        trial_table, label_html_out,
    ]
    event_inputs = [text_input, num_trials, temperature]
    # Pin the endpoint name that the API usage snippet refers to.
    classify_btn.click(
        fn=classify_dfk,
        inputs=event_inputs,
        outputs=outputs,
        api_name="classify_dfk",
    )
    # Pressing Enter in the text box triggers the same classification.
    text_input.submit(
        fn=classify_dfk,
        inputs=event_inputs,
        outputs=outputs,
    )

if __name__ == "__main__":
    demo.launch()