AndreyForty committed on
Commit
af53d97
·
verified ·
1 Parent(s): 2f856a9

Upload 2 files

Browse files
Files changed (2) hide show
  1. src/paper_classifier.py +73 -0
  2. src/train_distilbert.py +229 -0
src/paper_classifier.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Iterable
4
+
5
# Hugging Face checkpoint used as the finetuning base.
BASE_MODEL_NAME = "distilbert/distilbert-base-cased"
# Default location of the finetuned classifier artifacts.
DEFAULT_MODEL_DIR = "artifacts/distilbert-arxiv"
# Token budget for each title+abstract input.
MAX_LENGTH = 256
# Cumulative-probability cutoff for trimming ranked predictions.
TOP_P_THRESHOLD = 0.95

# The eight top-level arXiv subject groups the classifier distinguishes.
EXPECTED_ARXIV_CATEGORIES = [
    "Computer Science",
    "Physics",
    "Mathematics",
    "Statistics",
    "Quantitative Biology",
    "Quantitative Finance",
    "Economics",
    "Electrical Engineering and Systems Science",
]

# Demo papers (title + abstract) for quick manual sanity checks.
EXAMPLES = {
    "Graph Neural Networks": {
        "title": "Message Passing Neural Networks for Molecular Property Prediction",
        "abstract": (
            "We introduce a graph-based neural architecture for supervised learning on "
            "molecular graphs. The model propagates messages between atoms, aggregates "
            "node states into a graph embedding, and predicts physical and chemical "
            "properties with competitive accuracy."
        ),
    },
    "Physics": {
        "title": "Topological phase transitions in two-dimensional quantum materials",
        "abstract": (
            "We study a lattice model with strong spin-orbit coupling and show how "
            "interactions modify the phase diagram. Using numerical simulations we "
            "characterize edge states, quantify transport signatures, and discuss "
            "observable consequences for low-temperature experiments."
        ),
    },
    "Bioinformatics": {
        "title": "Transformer models for protein function annotation from sequence",
        "abstract": (
            "We pretrain a transformer encoder on amino acid sequences and finetune it "
            "for protein function prediction. The approach improves annotation quality "
            "for underrepresented families and reveals biologically meaningful sequence "
            "patterns."
        ),
    },
}
48
+
49
+
50
def format_input_text(title: str, abstract: str) -> str:
    """Build the model input string from a paper title and abstract.

    Leading/trailing whitespace is stripped and empty fields are omitted.
    The two sections are joined with a blank line.

    NOTE(review): the title is deliberately repeated on a "Title summary"
    line — presumably the format the model was trained with; keep it in
    sync with training-time formatting.
    """
    clean_title = title.strip()
    clean_abstract = abstract.strip()

    sections: list[str] = []
    if clean_title:
        sections.append(f"Title: {clean_title}\nTitle summary: {clean_title}")
    if clean_abstract:
        sections.append(f"Abstract: {clean_abstract}")

    return "\n\n".join(sections)
61
+
62
+
63
+ def take_top_p(records: Iterable[dict[str, float]], threshold: float) -> list[dict[str, float]]:
64
+ selected: list[dict[str, float]] = []
65
+ cumulative = 0.0
66
+
67
+ for record in records:
68
+ selected.append(record)
69
+ cumulative += record["score"]
70
+ if cumulative >= threshold:
71
+ break
72
+
73
+ return selected
src/train_distilbert.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import ast
4
+ import json
5
+ from collections import Counter
6
+ from functools import partial
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+ from datasets import Dataset, DatasetDict
11
+ from sklearn.metrics import accuracy_score, f1_score
12
+ from transformers import (
13
+ AutoModelForSequenceClassification,
14
+ AutoTokenizer,
15
+ DataCollatorWithPadding,
16
+ Trainer,
17
+ TrainingArguments,
18
+ set_seed,
19
+ )
20
+
21
+ from paper_classifier import BASE_MODEL_NAME, DEFAULT_MODEL_DIR, MAX_LENGTH, format_input_text
22
+
23
# Raw arXiv metadata dump (JSON list of paper dicts).
DATA_PATH = Path("arxivData.json")
# Where the finetuned model, tokenizer and summary are written.
OUTPUT_DIR = Path(DEFAULT_MODEL_DIR)
# Hugging Face download cache (writable in containerized environments).
HF_CACHE_DIR = Path("/tmp/huggingface")

# Field names inside each record of the raw JSON dump.
TITLE_FIELD = "title"
ABSTRACT_FIELD = "summary"
TAG_FIELD = "tag"

# Training hyperparameters.
VALIDATION_SIZE = 0.1
NUM_TRAIN_EPOCHS = 4
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
PER_DEVICE_TRAIN_BATCH_SIZE = 16
PER_DEVICE_EVAL_BATCH_SIZE = 32
LOGGING_STEPS = 50
SEED = 42

# Maps the arXiv category prefix (text before the dot in a tag term,
# e.g. "cs" in "cs.LG") to a top-level subject label.
PREFIX_TO_LABEL = {
    "adap-org": "Quantitative Biology",
    "astro-ph": "Physics",
    "cmp-lg": "Computer Science",
    "cond-mat": "Physics",
    "cs": "Computer Science",
    "econ": "Economics",
    "eess": "Electrical Engineering and Systems Science",
    "gr-qc": "Physics",
    "hep-ex": "Physics",
    "hep-lat": "Physics",
    "hep-ph": "Physics",
    "hep-th": "Physics",
    "math": "Mathematics",
    "nlin": "Physics",
    "nucl-th": "Physics",
    "physics": "Physics",
    "q-bio": "Quantitative Biology",
    "q-fin": "Quantitative Finance",
    "quant-ph": "Physics",
    "stat": "Statistics",
}
62
+
63
+
64
def normalize_text(value):
    """Collapse runs of whitespace in *value* to single spaces.

    Falsy inputs (None, "", 0, ...) yield the empty string; anything else
    is stringified first.
    """
    text = str(value or "")
    return " ".join(text.split())
66
+
67
+
68
def parse_top_level_label(raw_tag):
    """Resolve an arXiv tag payload to one of the top-level subject labels.

    *raw_tag* is expected to be the string repr of a list of dicts with a
    "term" key (e.g. "[{'term': 'cs.LG', ...}]"). The first term whose
    prefix (text before the dot) is found in PREFIX_TO_LABEL determines
    the label. Returns None when the payload is missing, unparsable, not
    a list, or no term matches.
    """
    if not raw_tag:
        return None

    try:
        tags = ast.literal_eval(str(raw_tag))
    except (SyntaxError, ValueError):
        return None

    if not isinstance(tags, list):
        return None

    for entry in tags:
        if not isinstance(entry, dict):
            continue
        term = entry.get("term")
        if not term:
            continue
        label = PREFIX_TO_LABEL.get(str(term).split(".")[0])
        if label:
            return label

    return None
92
+
93
+
94
def build_records():
    """Load the raw arXiv dump and return training records.

    Returns a list of {"text": <formatted title+abstract>, "label": <top-level
    category>} dicts. Records whose tag does not resolve to a known category,
    or whose formatted text is empty, are skipped and tallied.

    Bug fix: previously records with ``label=None`` (unmapped tags) were kept,
    which made ``sorted({record["label"] ...})`` in ``main`` raise TypeError
    (None is not orderable against str) and would KeyError in ``label2id``.
    The ``skipped`` counter was also created but never used; it now reports
    why records were dropped.
    """
    with DATA_PATH.open("r", encoding="utf-8") as file:
        raw_records = json.load(file)

    prepared_records: list[dict[str, str]] = []
    skipped = Counter()

    for item in raw_records:
        title = normalize_text(item.get(TITLE_FIELD))
        abstract = normalize_text(item.get(ABSTRACT_FIELD))
        label = parse_top_level_label(item.get(TAG_FIELD))
        text = format_input_text(title, abstract)

        if label is None:
            skipped["unknown_label"] += 1
            continue
        if not text:
            skipped["empty_text"] += 1
            continue

        prepared_records.append(
            {
                "text": text,
                "label": label,
            }
        )

    print(f"Loaded {len(prepared_records)}")
    if skipped:
        print("Skipped:", dict(skipped))

    label_distribution = Counter(record["label"] for record in prepared_records)
    print("Label distribution:", dict(label_distribution))
    return prepared_records
118
+
119
+
120
def build_splits(records):
    """Shuffle *records* into a seeded train/validation DatasetDict split."""
    full_dataset = Dataset.from_list(records)
    parts = full_dataset.train_test_split(test_size=VALIDATION_SIZE, seed=SEED)
    return DatasetDict(train=parts["train"], validation=parts["test"])
124
+
125
+
126
def preprocess(batch, *, tokenizer, label2id):
    """Tokenize a batch of texts (truncated to MAX_LENGTH) and attach integer labels."""
    encoded = tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH)
    encoded["labels"] = [label2id[name] for name in batch["label"]]
    return encoded
130
+
131
+
132
def compute_metrics(eval_prediction):
    """Compute accuracy and macro-F1 from a (logits, labels) eval tuple."""
    logits, labels = eval_prediction
    predicted = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predicted),
        "macro_f1": f1_score(labels, predicted, average="macro"),
    }
139
+
140
+
141
def main() -> None:
    """End-to-end finetuning: load data, train DistilBERT, save artifacts + summary."""
    if not DATA_PATH.exists():
        raise FileNotFoundError(f"Dataset file not found: {DATA_PATH}")

    set_seed(SEED)
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    HF_CACHE_DIR.mkdir(parents=True, exist_ok=True)

    records = build_records()
    raw_splits = build_splits(records)

    # Deterministic label vocabulary, shared with the model config.
    label_names = sorted({record["label"] for record in records})
    label2id = {name: idx for idx, name in enumerate(label_names)}
    id2label = {idx: name for name, idx in label2id.items()}

    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL_NAME,
        cache_dir=HF_CACHE_DIR.as_posix(),
    )

    # Tokenize both splits; drop the raw text/label columns afterwards.
    tokenized_splits = raw_splits.map(
        partial(preprocess, tokenizer=tokenizer, label2id=label2id),
        batched=True,
        remove_columns=raw_splits["train"].column_names,
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL_NAME,
        cache_dir=HF_CACHE_DIR.as_posix(),
        num_labels=len(label_names),
        id2label=id2label,
        label2id=label2id,
    )
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR.as_posix(),
        do_train=True,
        do_eval=True,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="steps",
        logging_steps=LOGGING_STEPS,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        num_train_epochs=NUM_TRAIN_EPOCHS,
        per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
        # Reload the checkpoint with the best macro-F1 at the end of training.
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        greater_is_better=True,
        save_total_limit=2,
        report_to=[],
        seed=SEED,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_splits["train"],
        eval_dataset=tokenized_splits["validation"],
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    eval_metrics = trainer.evaluate()
    trainer.save_model(OUTPUT_DIR.as_posix())
    tokenizer.save_pretrained(OUTPUT_DIR.as_posix())

    # Persist a machine-readable run summary next to the model weights.
    summary_path = OUTPUT_DIR / "training_summary.json"
    summary = {
        "base_model": BASE_MODEL_NAME,
        "data_path": DATA_PATH.as_posix(),
        "output_dir": OUTPUT_DIR.as_posix(),
        "title_field": TITLE_FIELD,
        "abstract_field": ABSTRACT_FIELD,
        "tag_field": TAG_FIELD,
        "labels": label_names,
        "metrics": eval_metrics,
    }
    summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")

    print(json.dumps(summary, indent=2))


if __name__ == "__main__":
    main()