AndreyForty commited on
Commit
ed5c425
·
verified ·
1 Parent(s): 9eb9bb1

Upload 5 files

Browse files
Files changed (5) hide show
  1. README.md +43 -20
  2. app.py +446 -0
  3. paper_classifier.py +73 -0
  4. requirements.txt +7 -3
  5. train_distilbert.py +229 -0
README.md CHANGED
@@ -1,20 +1,43 @@
1
- ---
2
- title: SHAD Homework
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
- pinned: false
11
- short_description: Задание шад Моисейкин Андрей
12
- license: mit
13
- ---
14
-
15
- # Welcome to Streamlit!
16
-
17
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
18
-
19
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
20
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Реализация задания из ноутбука через `streamlit` и `finetune` модели
2
+ `distilbert/distilbert-base-cased` для классификации научных статей
3
+
4
+ ## Что за файлики
5
+
6
+ - `train_distilbert.py` - обучение модели на датасете архива `arxivData.json` из Kaggle.
7
+ - `app.py` - веб-интерфейс на streamlit, который загружает уже обученный чекпоинт
8
+ - `paper_classifier.py` - общие константы, примеры
9
+
10
+
11
+ Используются поля:
12
+
13
+ - `title`
14
+ - `summary`
15
+ - `tag`
16
+
17
+
18
+ ## Обучение
19
+
20
+ ```bash
21
+ conda activate main
22
+ pip install -r requirements.txt
23
+ python train_distilbert.py
24
+ ```
25
+
26
+ По умолчанию checkpoint будет сохранён в `artifacts/distilbert-arxiv`.
27
+
28
+ ## Запуск streamlit
29
+
30
+ После обучения:
31
+
32
+ ```bash
33
+ conda activate main
34
+ streamlit run app.py --server.port 8080
35
+ ```
36
+
37
+ После запуска откройте `http://localhost:8080`.
38
+
39
+ ## Как работает инференс
40
+
41
+ - модель читает `title` и `abstract`
42
+ - если `abstract` пустой, используется только название статьи
43
+ - сервис показывает только те классы, которые суммарно набирают `95%` вероятности по категориям
app.py ADDED
@@ -0,0 +1,446 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from pathlib import Path
5
+
6
+ import streamlit as st
7
+ import torch
8
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
9
+
10
+ from paper_classifier import (
11
+ BASE_MODEL_NAME,
12
+ DEFAULT_MODEL_DIR,
13
+ EXAMPLES,
14
+ EXPECTED_ARXIV_CATEGORIES,
15
+ MAX_LENGTH,
16
+ TOP_P_THRESHOLD,
17
+ format_input_text,
18
+ take_top_p,
19
+ )
20
+
21
# Checkpoint directory; can be overridden via the ARXIV_MODEL_DIR environment variable.
MODEL_DIR = Path(os.environ.get("ARXIV_MODEL_DIR", DEFAULT_MODEL_DIR))
22
+
23
+
24
@st.cache_resource(show_spinner=False)
def load_model_bundle() -> tuple[AutoTokenizer, AutoModelForSequenceClassification]:
    """Load the fine-tuned tokenizer/model pair from MODEL_DIR.

    Cached by Streamlit for the process lifetime, so the checkpoint is
    read from disk only once.

    Raises:
        FileNotFoundError: if no checkpoint (config.json) exists in MODEL_DIR.
    """
    if not (MODEL_DIR / "config.json").exists():
        raise FileNotFoundError(
            f"Не найден fine-tuned checkpoint в {MODEL_DIR}. Сначала обучите модель через train_distilbert.py."
        )

    checkpoint = MODEL_DIR.as_posix()
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    # Inference only: switch off dropout etc.
    model.eval()
    return tokenizer, model
36
+
37
+
38
def predict_topics(title: str, abstract: str) -> list[dict[str, float]]:
    """Classify an article and return the smallest top-p set of label records.

    Each record is {"label": str, "score": float}; records come back sorted
    by descending probability and truncated at TOP_P_THRESHOLD cumulative mass.

    Raises:
        ValueError: if both title and abstract are effectively empty.
    """
    article_text = format_input_text(title, abstract)
    if not article_text:
        raise ValueError("Введите хотя бы название статьи или abstract.")

    tokenizer, model = load_model_bundle()
    device = next(model.parameters()).device
    encoded = tokenizer(
        article_text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.inference_mode():
        raw_logits = model(**encoded).logits[0]
        probabilities = torch.softmax(raw_logits, dim=-1).cpu().tolist()

    # Fall back to synthetic names if the checkpoint lacks an id2label mapping.
    id2label = getattr(model.config, "id2label", None) or {
        position: f"Label {position}" for position in range(len(probabilities))
    }

    records: list[dict[str, float]] = []
    for position, probability in enumerate(probabilities):
        records.append(
            {
                "label": str(id2label.get(position, f"Label {position}")),
                "score": float(probability),
            }
        )
    records.sort(key=lambda record: record["score"], reverse=True)
    return take_top_p(records, TOP_P_THRESHOLD)
69
+
70
+
71
def apply_styles() -> None:
    """Inject the app's global CSS theme.

    Dark palette via CSS custom properties, Google fonts, and overrides for
    Streamlit's built-in widgets plus the custom hero/info/result cards.
    """
    st.markdown(
        """
        <style>
        @import url('https://fonts.googleapis.com/css2?family=Manrope:wght@400;600;700;800&family=IBM+Plex+Mono:wght@400;500&display=swap');

        :root {
            --paper: rgba(22, 27, 34, 0.92);
            --card: rgba(30, 36, 46, 0.88);
            --ink: #e6edf3;
            --muted: #8b9cb3;
            --accent: #2dd4bf;
            --accent-dim: rgba(45, 212, 191, 0.14);
            --accent-2: #fb923c;
            --border: rgba(230, 237, 243, 0.09);
            --shadow: 0 24px 80px rgba(0, 0, 0, 0.45);
            --surface-0: #0d1117;
            --surface-1: #161b22;
            --surface-input: #21262d;
        }

        .stApp {
            background:
                radial-gradient(circle at 12% 8%, rgba(45, 212, 191, 0.09), transparent 32%),
                radial-gradient(circle at 88% 4%, rgba(251, 146, 60, 0.07), transparent 28%),
                linear-gradient(180deg, #0d1117 0%, #0a0e14 100%);
            color: var(--ink);
            font-family: "Manrope", sans-serif;
        }

        [data-testid="stAppViewContainer"],
        [data-testid="stHeader"] {
            background: transparent;
        }

        [data-testid="stSidebar"] {
            background: linear-gradient(180deg, var(--surface-1) 0%, #121820 100%);
            border-right: 1px solid var(--border);
        }

        [data-testid="stSidebar"] .stMarkdown,
        [data-testid="stSidebar"] label,
        [data-testid="stSidebar"] span {
            color: var(--ink) !important;
        }

        .block-container {
            padding-top: 2.2rem;
            padding-bottom: 2.2rem;
            max-width: 1100px;
        }

        section.main [data-testid="stMarkdownContainer"] p,
        section.main [data-testid="stMarkdownContainer"] li,
        section.main label,
        .stSubheader {
            color: var(--ink) !important;
        }

        .stTextInput label,
        .stTextArea label {
            color: var(--muted) !important;
        }

        .stTextInput input,
        .stTextArea textarea {
            background-color: var(--surface-input) !important;
            color: var(--ink) !important;
            border: 1px solid var(--border) !important;
            border-radius: 12px !important;
        }

        .stTextInput input:focus,
        .stTextArea textarea:focus {
            border-color: rgba(45, 212, 191, 0.45) !important;
            box-shadow: 0 0 0 1px rgba(45, 212, 191, 0.25);
        }

        div[data-baseweb="select"] > div {
            background-color: var(--surface-input) !important;
            border-color: var(--border) !important;
            color: var(--ink) !important;
        }

        .stButton > button {
            background: linear-gradient(135deg, #0d9488 0%, #0f766e 100%) !important;
            color: #f0fdfa !important;
            border: none !important;
            font-weight: 700 !important;
            border-radius: 12px !important;
        }

        .stButton > button:hover {
            background: linear-gradient(135deg, #14b8a6 0%, #0d9488 100%) !important;
            color: #fff !important;
        }

        [data-testid="stExpander"] {
            background: var(--paper);
            border: 1px solid var(--border);
            border-radius: 14px;
        }

        [data-testid="stExpander"] summary {
            color: var(--ink) !important;
        }

        .stProgress > div > div {
            background-color: rgba(45, 212, 191, 0.35) !important;
        }

        .stProgress > div > div > div {
            background: linear-gradient(90deg, #2dd4bf, #14b8a6) !important;
        }

        .hero {
            padding: 2rem 2.2rem;
            border-radius: 28px;
            background: linear-gradient(145deg, rgba(30, 36, 46, 0.95), rgba(22, 27, 34, 0.88));
            border: 1px solid var(--border);
            box-shadow: var(--shadow);
            backdrop-filter: blur(12px);
            margin-bottom: 1.2rem;
        }

        .hero-kicker {
            font-size: 0.82rem;
            text-transform: uppercase;
            letter-spacing: 0.18em;
            color: var(--accent);
            font-weight: 800;
            margin-bottom: 0.65rem;
        }

        .hero h1 {
            font-size: clamp(2rem, 3.5vw, 3.7rem);
            line-height: 0.98;
            margin: 0;
            max-width: 11ch;
            color: var(--ink);
        }

        .hero p {
            max-width: 56rem;
            color: var(--muted);
            font-size: 1.02rem;
            line-height: 1.65;
            margin-top: 0.95rem;
            margin-bottom: 0;
        }

        .info-strip {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(190px, 1fr));
            gap: 0.8rem;
            margin: 1rem 0 1.25rem;
        }

        .info-card {
            padding: 1rem 1.05rem;
            border-radius: 20px;
            background: var(--paper);
            border: 1px solid var(--border);
        }

        .info-label {
            color: var(--muted);
            font-size: 0.84rem;
            margin-bottom: 0.3rem;
        }

        .info-value {
            font-weight: 700;
            color: var(--ink);
            word-break: break-word;
        }

        .result-card {
            padding: 1rem 1.1rem 1.1rem;
            border-radius: 22px;
            background: var(--card);
            border: 1px solid var(--border);
            margin-bottom: 0.9rem;
        }

        .result-rank {
            display: inline-block;
            padding: 0.2rem 0.55rem;
            margin-bottom: 0.65rem;
            border-radius: 999px;
            background: var(--accent-dim);
            color: var(--accent);
            font-size: 0.8rem;
            font-weight: 800;
            letter-spacing: 0.06em;
            text-transform: uppercase;
        }

        .result-title {
            font-size: 1.12rem;
            font-weight: 800;
            margin-bottom: 0.35rem;
            color: var(--ink);
        }

        .result-score {
            color: var(--accent-2);
            font-family: "IBM Plex Mono", monospace;
            font-size: 0.92rem;
            margin-bottom: 0.75rem;
        }

        .caption-note {
            color: var(--muted);
            font-size: 0.92rem;
        }

        [data-testid="stSidebar"] pre,
        [data-testid="stSidebar"] code {
            background-color: var(--surface-input) !important;
            color: #a5f3fc !important;
            border: 1px solid var(--border) !important;
            border-radius: 10px !important;
        }

        [data-testid="stSidebar"] [data-testid="stMarkdownContainer"] a {
            color: var(--accent) !important;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )
303
+
304
+
305
def render_hero() -> None:
    """Render the page header card and an info strip with model metadata."""
    st.markdown(
        """
        <section class="hero">
            <div class="hero-kicker">Моисейин Андрей Денисович</div>
            <h1>Классификатор научных статей</h1>
            <p>
                Вот не зря я учил веб разработку 4 года, чтобы писать на html, css и js. Эх, был бы реакт.
            </p>
        </section>
        """,
        unsafe_allow_html=True,
    )

    # Info strip: base model id, checkpoint path, tokenizer truncation length.
    st.markdown(
        f"""
        <div class="info-strip">
            <div class="info-card">
                <div class="info-label">Базовая модель</div>
                <div class="info-value">{BASE_MODEL_NAME}</div>
            </div>
            <div class="info-card">
                <div class="info-label">Checkpoint</div>
                <div class="info-value">{MODEL_DIR}</div>
            </div>
            <div class="info-card">
                <div class="info-label">Макс. длина</div>
                <div class="info-value">{MAX_LENGTH} токенов</div>
            </div>
        </div>
        """,
        unsafe_allow_html=True,
    )
338
+
339
+
340
def render_results(records: list[dict[str, float]]) -> None:
    """Render one ranked card per predicted label plus a cumulative caption.

    Args:
        records: label/score dicts, already sorted by descending score and
            truncated by take_top_p.
    """
    st.subheader("Ответ")
    st.caption("Классы отсортированы по убыванию вероятности. Показаны только те, которые набрали 95%.")

    for index, record in enumerate(records, start=1):
        st.markdown(
            f"""
            <div class="result-card">
                <div class="result-rank">#{index}</div>
                <div class="result-title">{record["label"]}</div>
                <div class="result-score">p = {record["score"]:.2%}</div>
            </div>
            """,
            unsafe_allow_html=True,
        )
        # Clamp to [0, 1] so st.progress never raises on rounding artifacts.
        st.progress(min(max(record["score"], 0.0), 1.0))

    st.caption(
        f"Суммарная вероятность показанных тем: {sum(record['score'] for record in records):.2%}"
    )
360
+
361
+
362
def render_sidebar() -> None:
    """Sidebar preset picker that fills the title/abstract fields from EXAMPLES.

    Session-state keys are initialised on first run; switching the preset
    overwrites the form fields, and choosing "Свой текст" clears them.
    """
    defaults = {
        "selected_preset": "Свой текст",
        "article_title": "",
        "article_abstract": "",
    }
    for state_key, initial_value in defaults.items():
        if state_key not in st.session_state:
            st.session_state[state_key] = initial_value

    st.sidebar.markdown("### Быстрый старт")
    preset_name = st.sidebar.selectbox(
        "Пример статьи",
        options=["Свой текст"] + list(EXAMPLES.keys()),
    )

    # Nothing changed since the last run — keep the fields as they are.
    if preset_name == st.session_state.selected_preset:
        return

    if preset_name == "Свой текст":
        st.session_state.article_title = ""
        st.session_state.article_abstract = ""
    else:
        preset = EXAMPLES[preset_name]
        st.session_state.article_title = preset["title"]
        st.session_state.article_abstract = preset["abstract"]
    st.session_state.selected_preset = preset_name
384
+
385
+
386
+
387
+
388
def main() -> None:
    """Wire the Streamlit page together: styles, hero, sidebar, form, results."""
    st.set_page_config(
        page_title="Article Topic Classifier",
        layout="wide",
    )
    apply_styles()
    render_hero()
    render_sidebar()

    left_col, right_col = st.columns([1.15, 0.85], gap="large")

    with left_col:
        # Form batches the two inputs so the model runs only on submit.
        with st.form("classifier-form", clear_on_submit=False):
            title = st.text_input(
                "Название статьи",
                key="article_title",
                placeholder="Например: Attention is all you need",
            )
            abstract = st.text_area(
                "Абстракт",
                key="article_abstract",
                height=280,
                placeholder="Вставьте абстракт статьи. Если не вставишь, ну и фиг с ним.",
            )
            submitted = st.form_submit_button("Крутить барабан (трансформер)", use_container_width=True)

        st.markdown(
            """
            <div class="caption-note">
                Если abstract пустой, классификация идёт только по названию. ОТВЕТ СНИЗУ.
            </div>
            """,
            unsafe_allow_html=True,
        )

    # Nothing to do until the form is submitted.
    if not submitted:
        return

    with st.spinner("Кручу барабан (трансформер)..."):
        results = predict_topics(title, abstract)

    render_results(results)
430
+
431
+
432
if __name__ == "__main__":

    # Convenience: lets `python app.py` relaunch itself under `streamlit run`.
    from streamlit.runtime.scriptrunner_utils.script_run_context import get_script_run_ctx

    # No script-run context means we were started as a plain Python process,
    # not by Streamlit — re-exec through the Streamlit CLI and exit with its code.
    if get_script_run_ctx(suppress_warning=True) is None:
        import subprocess
        import sys

        raise SystemExit(
            subprocess.call(
                [sys.executable, "-m", "streamlit", "run", Path(__file__).resolve().as_posix(), *sys.argv[1:]]
            )
        )
    main()
paper_classifier.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Iterable
4
+
5
# Hugging Face Hub id of the pretrained encoder to fine-tune.
BASE_MODEL_NAME = "distilbert/distilbert-base-cased"
# Where train_distilbert.py writes (and app.py reads) the checkpoint.
DEFAULT_MODEL_DIR = "artifacts/distilbert-arxiv"
# Tokenizer truncation length shared by training and inference.
MAX_LENGTH = 256
# Cumulative probability mass shown to the user (top-p cutoff).
TOP_P_THRESHOLD = 0.95
# Coarse top-level arXiv subject areas the classifier is expected to cover.
EXPECTED_ARXIV_CATEGORIES = [
    "Computer Science",
    "Physics",
    "Mathematics",
    "Statistics",
    "Quantitative Biology",
    "Quantitative Finance",
    "Economics",
    "Electrical Engineering and Systems Science",
]
# Preset demo articles offered in the Streamlit sidebar.
EXAMPLES = {
    "Graph Neural Networks": {
        "title": "Message Passing Neural Networks for Molecular Property Prediction",
        "abstract": (
            "We introduce a graph-based neural architecture for supervised learning on "
            "molecular graphs. The model propagates messages between atoms, aggregates "
            "node states into a graph embedding, and predicts physical and chemical "
            "properties with competitive accuracy."
        ),
    },
    "Physics": {
        "title": "Topological phase transitions in two-dimensional quantum materials",
        "abstract": (
            "We study a lattice model with strong spin-orbit coupling and show how "
            "interactions modify the phase diagram. Using numerical simulations we "
            "characterize edge states, quantify transport signatures, and discuss "
            "observable consequences for low-temperature experiments."
        ),
    },
    "Bioinformatics": {
        "title": "Transformer models for protein function annotation from sequence",
        "abstract": (
            "We pretrain a transformer encoder on amino acid sequences and finetune it "
            "for protein function prediction. The approach improves annotation quality "
            "for underrepresented families and reveals biologically meaningful sequence "
            "patterns."
        ),
    },
}
48
+
49
+
50
def format_input_text(title: str, abstract: str) -> str:
    """Build the model input string from a title and an abstract.

    Both pieces are whitespace-trimmed; empty pieces are dropped, and the
    remaining ones are joined with a blank line. Returns "" when both
    inputs are blank.
    """
    sections: list[str] = []

    cleaned_title = title.strip()
    if cleaned_title:
        # NOTE(review): the title is repeated as "Title summary" — presumably to
        # weight it more heavily; training and inference both use this format,
        # so it must not be changed in only one place.
        sections.append(f"Title: {cleaned_title}\nTitle summary: {cleaned_title}")

    cleaned_abstract = abstract.strip()
    if cleaned_abstract:
        sections.append(f"Abstract: {cleaned_abstract}")

    return "\n\n".join(sections)
61
+
62
+
63
def take_top_p(records: Iterable[dict[str, float]], threshold: float) -> list[dict[str, float]]:
    """Return the shortest prefix of records whose scores sum to >= threshold.

    Records are consumed in the given order (callers pass them sorted by
    descending score); the record that crosses the threshold is included.
    If the threshold is never reached, every record is returned.
    """
    chosen: list[dict[str, float]] = []
    total = 0.0

    for record in records:
        chosen.append(record)
        total += record["score"]
        if total >= threshold:
            return chosen

    # Threshold never reached (e.g. empty input or scores summing below it).
    return chosen
requirements.txt CHANGED
@@ -1,3 +1,7 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
1
+ streamlit
2
+ transformers
3
+ torch
4
+ safetensors
5
+ datasets
6
+ accelerate
7
+ scikit-learn
train_distilbert.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import ast
4
+ import json
5
+ from collections import Counter
6
+ from functools import partial
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+ from datasets import Dataset, DatasetDict
11
+ from sklearn.metrics import accuracy_score, f1_score
12
+ from transformers import (
13
+ AutoModelForSequenceClassification,
14
+ AutoTokenizer,
15
+ DataCollatorWithPadding,
16
+ Trainer,
17
+ TrainingArguments,
18
+ set_seed,
19
+ )
20
+
21
+ from paper_classifier import BASE_MODEL_NAME, DEFAULT_MODEL_DIR, MAX_LENGTH, format_input_text
22
+
23
# Input dataset (arXiv metadata dump, JSON list) and output locations.
DATA_PATH = Path("arxivData.json")
OUTPUT_DIR = Path(DEFAULT_MODEL_DIR)
HF_CACHE_DIR = Path("/tmp/huggingface")

# Field names inside each JSON record of arxivData.json.
TITLE_FIELD = "title"
ABSTRACT_FIELD = "summary"
TAG_FIELD = "tag"

# Split / training hyperparameters.
VALIDATION_SIZE = 0.1
NUM_TRAIN_EPOCHS = 4
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
PER_DEVICE_TRAIN_BATCH_SIZE = 16
PER_DEVICE_EVAL_BATCH_SIZE = 32
LOGGING_STEPS = 50
SEED = 42

# Maps an arXiv tag prefix (the part before the first ".") to the coarse
# top-level subject label used as the classification target.
PREFIX_TO_LABEL = {
    "adap-org": "Quantitative Biology",
    "astro-ph": "Physics",
    "cmp-lg": "Computer Science",
    "cond-mat": "Physics",
    "cs": "Computer Science",
    "econ": "Economics",
    "eess": "Electrical Engineering and Systems Science",
    "gr-qc": "Physics",
    "hep-ex": "Physics",
    "hep-lat": "Physics",
    "hep-ph": "Physics",
    "hep-th": "Physics",
    "math": "Mathematics",
    "nlin": "Physics",
    "nucl-th": "Physics",
    "physics": "Physics",
    "q-bio": "Quantitative Biology",
    "q-fin": "Quantitative Finance",
    "quant-ph": "Physics",
    "stat": "Statistics",
}
+ }
62
+
63
+
64
def normalize_text(value):
    """Collapse runs of whitespace and stringify; falsy values become ""."""
    safe = value or ""
    return " ".join(str(safe).split())
66
+
67
+
68
def parse_top_level_label(raw_tag):
    """Map a raw arXiv tag payload to a coarse subject label, or None.

    raw_tag is expected to hold the string repr of a list of dicts with a
    "term" key (as stored in arxivData.json). The first term whose prefix
    (text before the first ".") is known in PREFIX_TO_LABEL decides the
    label; anything unparseable yields None.
    """
    if not raw_tag:
        return None

    try:
        tags = ast.literal_eval(str(raw_tag))
    except (SyntaxError, ValueError):
        # Not a Python literal — treat the record as unlabelled.
        return None

    if not isinstance(tags, list):
        return None

    for entry in tags:
        if not isinstance(entry, dict):
            continue
        term = entry.get("term")
        if not term:
            continue
        # e.g. "cs.LG" -> "cs"; the top-level prefix picks the category.
        label = PREFIX_TO_LABEL.get(str(term).split(".")[0])
        if label:
            return label

    return None
92
+
93
+
94
def build_records():
    """Load DATA_PATH and turn it into {"text", "label"} training records.

    Records whose tag cannot be mapped to a known top-level label, or whose
    title+abstract normalise to an empty string, are skipped and tallied in
    a Counter. Keeping them (the previous behaviour: label=None slipped
    through and the `skipped` Counter was never used) breaks the later
    `sorted({record["label"] ...})` in main() — None is not orderable
    against str — and would feed invalid labels to the Trainer.

    Returns:
        list[dict[str, str]]: prepared records with non-empty text and a
        valid coarse label.
    """
    with DATA_PATH.open("r", encoding="utf-8") as file:
        raw_records = json.load(file)

    prepared_records: list[dict[str, str]] = []
    skipped = Counter()

    for item in raw_records:
        title = normalize_text(item.get(TITLE_FIELD))
        abstract = normalize_text(item.get(ABSTRACT_FIELD))
        label = parse_top_level_label(item.get(TAG_FIELD))
        text = format_input_text(title, abstract)

        if label is None:
            skipped["unparseable_label"] += 1
            continue
        if not text:
            skipped["empty_text"] += 1
            continue

        prepared_records.append(
            {
                "text": text,
                "label": label,
            }
        )

    print(f"Loaded {len(prepared_records)}")
    if skipped:
        print("Skipped:", dict(skipped))

    label_distribution = Counter(record["label"] for record in prepared_records)
    print("Label distribution:", dict(label_distribution))
    return prepared_records
118
+
119
+
120
def build_splits(records):
    """Split prepared records into a seeded train/validation DatasetDict."""
    full_dataset = Dataset.from_list(records)
    parts = full_dataset.train_test_split(test_size=VALIDATION_SIZE, seed=SEED)
    return DatasetDict(train=parts["train"], validation=parts["test"])
124
+
125
+
126
def preprocess(batch, *, tokenizer, label2id):
    """Tokenize a batch of texts and attach integer class labels."""
    encoded = tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH)
    encoded["labels"] = [label2id[name] for name in batch["label"]]
    return encoded
130
+
131
+
132
def compute_metrics(eval_prediction):
    """Compute accuracy and macro-F1 from a (logits, labels) pair."""
    logits, labels = eval_prediction
    predicted = np.argmax(logits, axis=-1)
    metrics = {
        "accuracy": accuracy_score(labels, predicted),
        "macro_f1": f1_score(labels, predicted, average="macro"),
    }
    return metrics
139
+
140
+
141
def main() -> None:
    """Fine-tune DistilBERT on the arXiv records and save checkpoint + summary."""
    if not DATA_PATH.exists():
        raise FileNotFoundError(f"Dataset file not found: {DATA_PATH}")

    set_seed(SEED)
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    HF_CACHE_DIR.mkdir(parents=True, exist_ok=True)

    records = build_records()
    raw_splits = build_splits(records)

    # Deterministic label ordering -> stable label2id mapping across runs.
    label_names = sorted({record["label"] for record in records})
    label2id = {label: index for index, label in enumerate(label_names)}
    id2label = {index: label for label, index in label2id.items()}

    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL_NAME,
        cache_dir=HF_CACHE_DIR.as_posix(),
    )

    # Tokenize both splits; drop the raw text/label columns afterwards.
    tokenized_splits = raw_splits.map(
        partial(preprocess, tokenizer=tokenizer, label2id=label2id),
        batched=True,
        remove_columns=raw_splits["train"].column_names,
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL_NAME,
        cache_dir=HF_CACHE_DIR.as_posix(),
        num_labels=len(label_names),
        id2label=id2label,
        label2id=label2id,
    )
    # Dynamic per-batch padding instead of padding everything to MAX_LENGTH.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR.as_posix(),
        do_train=True,
        do_eval=True,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="steps",
        logging_steps=LOGGING_STEPS,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        num_train_epochs=NUM_TRAIN_EPOCHS,
        per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
        load_best_model_at_end=True,  # restore the epoch with the best macro_f1
        metric_for_best_model="macro_f1",
        greater_is_better=True,
        save_total_limit=2,
        report_to=[],  # disable wandb/tensorboard reporting
        seed=SEED,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_splits["train"],
        eval_dataset=tokenized_splits["validation"],
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    metrics = trainer.evaluate()
    # Save model + tokenizer so app.py can load both from OUTPUT_DIR.
    trainer.save_model(OUTPUT_DIR.as_posix())
    tokenizer.save_pretrained(OUTPUT_DIR.as_posix())

    # Persist a JSON run summary next to the checkpoint for reproducibility.
    summary_path = OUTPUT_DIR / "training_summary.json"
    summary = {
        "base_model": BASE_MODEL_NAME,
        "data_path": DATA_PATH.as_posix(),
        "output_dir": OUTPUT_DIR.as_posix(),
        "title_field": TITLE_FIELD,
        "abstract_field": ABSTRACT_FIELD,
        "tag_field": TAG_FIELD,
        "labels": label_names,
        "metrics": metrics,
    }
    summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")

    print(json.dumps(summary, indent=2))
226
+
227
+
228
if __name__ == "__main__":
    # Script entry point: run the full fine-tuning pipeline.
    main()