# paper_labeling/src/streamlit_app.py
# Streamlit app: predicts arXiv categories from a paper's title and abstract.
# (Hosting-UI header residue converted to comments — author: sastsy, commit 68b3862.)
import json
import re
from pathlib import Path
import html
import unicodedata
import numpy as np
import streamlit as st
import torch
from transformers import AutoTokenizer, T5ForSequenceClassification
BASE_DIR = Path(__file__).parent
MODEL_DIR = BASE_DIR / "best_model"
LABEL_ENCODER = BASE_DIR / "label_encoder.json"
MAX_LENGTH = 256
TOP_PROB = 0.95
DEVICE = torch.device("cpu")
MIN_CHARS = 20
MAX_CHARS = 5000
def clean_text(text: str) -> str:
    """Normalize raw title/abstract text before tokenization.

    Unescapes HTML entities, applies NFKC Unicode normalization, strips
    inline TeX math (``$...$``) and simple LaTeX commands (``\\cmd{...}``),
    then collapses all whitespace runs to single spaces.
    """
    unescaped = unicodedata.normalize("NFKC", html.unescape(text))
    without_math = re.sub(r"\$.*?\$", "", unescaped)
    without_commands = re.sub(r"\\[a-zA-Z]+\{.*?\}", "", without_math)
    return re.sub(r"\s+", " ", without_commands).strip()
def validate(title: str, abstract: str):
    """Check field lengths; return an error message string or None if OK.

    Checks run in a fixed order (title short, abstract short, title long,
    abstract long) and the first failure wins.
    """
    checks = (
        (len(title.strip()) < MIN_CHARS,
         f"Title too short (at least {MIN_CHARS} characters)"),
        (len(abstract.strip()) < MIN_CHARS,
         f"Abstract too short (at least {MIN_CHARS} characters)"),
        (len(title) > MAX_CHARS,
         f"Title too long (max {MAX_CHARS} characters)"),
        (len(abstract) > MAX_CHARS,
         f"Abstract too long (max {MAX_CHARS} characters)"),
    )
    for failed, message in checks:
        if failed:
            return message
    return None
@st.cache_resource(show_spinner="Loading model…")
def load_model():
    """Load tokenizer, classifier and label map once per process (cached).

    Returns:
        (tokenizer, model, id2label) where ``id2label`` maps integer class
        index -> label name from ``label_encoder.json``.

    Any load failure is surfaced in the UI via ``st.error`` and the script
    run is halted with ``st.stop()``.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
        model = T5ForSequenceClassification.from_pretrained(MODEL_DIR)
        model.to(DEVICE)
        model.eval()
        # FIX: read the JSON explicitly as UTF-8 — the platform default
        # encoding is not guaranteed, and labels may contain non-ASCII.
        # JSON object keys are strings, but the model emits integer class
        # ids, so re-key the mapping with int().
        with open(LABEL_ENCODER, encoding="utf-8") as f:
            id2label = {int(k): v for k, v in json.load(f).items()}
        return tokenizer, model, id2label
    except FileNotFoundError as e:
        st.error(f"Model files not found: {e}")
        st.stop()
    except Exception as e:
        st.error(f"Failed to load model: {e}")
        st.stop()
@st.cache_data(show_spinner=False)
def predict(title: str, abstract: str):
    """Classify a paper and return its top-probability label set.

    Builds a ``classify: …`` prompt from whichever of title/abstract is
    non-empty (abstract clipped to 1000 cleaned characters), runs the
    cached model on CPU, and returns the smallest prefix of labels (by
    descending probability) whose cumulative probability reaches TOP_PROB.

    Returns:
        list of {"label": str, "probability": float} dicts, most likely first.
    """
    tokenizer, model, id2label = load_model()

    cleaned_title = clean_text(title)
    cleaned_abstract = clean_text(abstract)[:1000]
    if title and abstract:
        text = f"classify: {cleaned_title} [SEP] {cleaned_abstract}"
    elif title:
        text = f"classify: {cleaned_title}"
    else:
        text = f"classify: {cleaned_abstract}"

    encoded = tokenizer(text, max_length=MAX_LENGTH, truncation=True, return_tensors="pt")
    token_ids = encoded["input_ids"][0].tolist()
    # If more than one EOS token slipped in, drop every EOS from the body
    # and force a single EOS at the end of the sequence.
    eos_id = tokenizer.eos_token_id
    if eos_id is not None and token_ids.count(eos_id) > 1:
        token_ids = [t for t in token_ids[:-1] if t != eos_id] + [eos_id]

    input_ids = torch.tensor([token_ids], dtype=torch.long).to(DEVICE)
    attention_mask = torch.ones_like(input_ids)  # single sequence, no padding
    try:
        with torch.no_grad():
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
    except Exception as e:
        st.error(f"Model inference failed: {e}")
        st.stop()

    probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
    order = np.argsort(probs)[::-1]
    cumulative = np.cumsum(probs[order])
    # Smallest k such that the top-k probabilities sum to >= TOP_PROB.
    keep = int(np.searchsorted(cumulative, TOP_PROB)) + 1
    return [
        {"label": id2label.get(int(idx), f"class_{idx}"), "probability": float(probs[idx])}
        for idx in order[:keep]
    ]
st.set_page_config(page_title="Paper Classifier", page_icon="📄", layout="centered")
st.title("📄 Research Paper Classifier")
st.caption("Predicts arXiv categories from a paper's title and abstract.")
# Warm the cached model at startup so the first classification is snappy.
load_model()


def _clear_inputs() -> None:
    """Reset both text widgets; runs as the Clear button's on_click callback."""
    st.session_state["paper_title"] = ""
    st.session_state["paper_abstract"] = ""


# Widgets need explicit keys so the Clear callback can reset them via
# st.session_state.
title = st.text_input(
    "Title *",
    placeholder="e.g. Attention Is All You Need",
    max_chars=MAX_CHARS,
    key="paper_title",
)
abstract = st.text_area(
    "Abstract *",
    placeholder="Paste the abstract here…",
    height=200,
    max_chars=MAX_CHARS,
    key="paper_abstract",
)
col1, col2 = st.columns([1, 4])
classify_btn = col1.button("Classify", type="primary", use_container_width=True)
# FIX: the Clear button previously used `on_click=lambda: None`, a no-op
# that never cleared the inputs.
col2.button("Clear", on_click=_clear_inputs, use_container_width=True)
if classify_btn:
    # FIX: the original showed a validation warning but then fell through
    # to the empty-text check and could still run classification on invalid
    # input. Use an elif chain so only one branch executes.
    error = validate(title, abstract)
    if error:
        st.warning(error)
    elif not clean_text(title) and not clean_text(abstract):
        # Both fields clean down to nothing (e.g. pure LaTeX/markup).
        st.warning("Please enter at least a title or an abstract.")
    else:
        with st.spinner("Classifying…"):
            results = predict(title, abstract)
        st.divider()
        st.subheader("Predicted categories")
        st.caption(
            f"Top-95% probability set ({len(results)} labels) "
            f"combined probability {sum(r['probability'] for r in results):.1%}"
        )
        for r in results:
            col1, col2 = st.columns([3, 1])
            col1.markdown(f"**{r['label']}**")
            col2.markdown(f"`{r['probability']:.1%}`")
            st.progress(float(r["probability"]))