import os
import sys
import json
import time
import pandas as pd
import numpy as np
import streamlit as st
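# Put the project root on sys.path so `from src.stage4_inference import …`
# resolves when the app is launched with `streamlit run`.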
_ROOT = os.path.dirname(os.path.abspath(__file__))
if _ROOT not in sys.path:
sys.path.insert(0, _ROOT)
# ── Page config ──────────────────────────────────────────────────────────────
st.set_page_config(
page_title="TruthLens · Fake News Detector",
page_icon="🔍",
layout="wide",
initial_sidebar_state="collapsed",
)
# ── Global CSS ───────────────────────────────────────────────────────────────
st.markdown("""
""", unsafe_allow_html=True)
# ── Cached inference loader ──────────────────────────────────────────────────
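# Heavy model dependencies are imported lazily inside the cached loader so the
# app starts quickly; st.cache_resource keeps them loaded for the life of the process.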
@st.cache_resource(show_spinner=False)
def load_pipeline():
from src.stage4_inference import predict_article, ModelNotTrainedError
return predict_article, ModelNotTrainedError
# ── Session state ────────────────────────────────────────────────────────────
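# Defaults: "analyzed" toggles landing vs. results view, "last_result" holds the
# prediction dict, "last_input" keeps the submitted article text.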
for k, v in [("analyzed", False), ("last_result", None), ("last_input", "")]:
if k not in st.session_state:
st.session_state[k] = v
# =============================================================================
# LANDING PAGE (shown before any analysis)
# =============================================================================
if not st.session_state["analyzed"]:
# ── Hero section ──
st.markdown("""
🔍
TruthLens
Paste any news article or drop a URL — our AI will tell you
if it's real, fake, or outdated in seconds.
""", unsafe_allow_html=True)
# ── How it works ──
st.markdown("""
📋
Paste or Link
Drop in the article text or a URL. We'll extract everything automatically.
⚡
Instant Analysis
Our AI analyzes language patterns, checks freshness, and searches live sources.
✅
Get Your Verdict
See a clear REAL / FAKE / OUTDATED verdict with a confidence score and explanation.
""", unsafe_allow_html=True)
# ── Input area ──
input_tab = st.radio("How would you like to provide the article?",
["✍️ Write or paste text", "🔗 Paste a URL"],
horizontal=True, label_visibility="visible")
input_text, input_title, input_url, input_date, input_domain = "", "", "", "", ""
if input_tab == "✍️ Write or paste text":
input_title = st.text_input("Headline (optional)",
placeholder="e.g. Breaking: Scientists discover high-speed interstellar travel")
input_text = st.text_area("Article content",
height=180,
placeholder="Paste the full article body here…")
# ── Auto-extract title from pasted text if headline field is empty ──
if not input_title.strip() and input_text.strip():
if input_text.lower().startswith("title:"):
# "Title: … / Body: …" format: strip the prefixes case-insensitively, and only at the start
first_line, _, rest = input_text.partition("\n")
input_title = first_line[len("title:"):].strip()
rest = rest.strip()
if rest.lower().startswith("body:"):
rest = rest[len("body:"):].strip()
input_text = rest
else:
# Fallback: treat the first sentence as the headline
input_title = input_text.split(".", 1)[0].strip()
else:
input_url = st.text_input("Article URL",
placeholder="https://www.example.com/news/breaking-story")
st.caption("We'll automatically extract the title, body, and publish date.")
# ── Analysis mode (kept minimal — user doesn't need to understand internals)
speed = st.select_slider("Analysis depth",
options=["Quick", "Standard", "Deep"],
value="Deep",
help="Quick ≈ 2 sec · Standard ≈ 10 sec · Deep ≈ 30 sec (most accurate)")
speed_map = {"Quick": "fast", "Standard": "balanced", "Deep": "full"}
selected_mode = speed_map[speed]
# ── Predict button ──
predict_clicked = st.button("🔍 Check this article", use_container_width=True, type="primary")
# ── Verdict legend ──
st.markdown("""
🟢 Verified True
🔴 Likely Fake
🟡 Outdated
🟠 Needs Review
""", unsafe_allow_html=True)
# ── Execute prediction ──
if predict_clicked:
# Validate
if input_tab == "✍️ Write or paste text":
if not input_text or len(input_text.split()) < 10:
st.warning("⚠️ Please paste at least a few sentences so we can analyze it properly.")
st.stop()
else:
if not input_url:
st.warning("⚠️ Please enter a URL first.")
st.stop()
try:
import newspaper
from urllib.parse import urlparse
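# newspaper3k downloads and parses the page; publish dates are frequently
# missing, hence the empty-string fallbacks below.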
art = newspaper.Article(input_url)
art.download()
art.parse()
input_title = art.title or ""
input_text = art.text or ""
input_date = art.publish_date.isoformat() if art.publish_date else ""
input_domain = urlparse(input_url).netloc
if len(input_text.split()) < 10:
st.warning("⚠️ Couldn't extract enough text from that URL. Try pasting the article directly.")
st.stop()
except Exception:
st.error("❌ Couldn't fetch that URL. Please check the link or paste the text directly.")
st.stop()
predict_article, ModelNotTrainedError = load_pipeline()
with st.status("🔍 Analyzing article…", expanded=True) as status:
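# The step messages below are cosmetic progress cues; the real work happens
# in the single predict_article() call.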
st.write("📖 Reading article…")
time.sleep(0.3)
st.write("🧠 Running AI analysis…")
try:
result = predict_article(
title=input_title,
text=input_text,
source_domain=input_domain,
published_date=input_date,
mode=selected_mode,
)
st.write("🕐 Checking article freshness…")
st.write("🌐 Searching live sources…")
status.update(label="✅ Done!", state="complete")
st.session_state["last_result"] = result
st.session_state["last_input"] = input_text
st.session_state["analyzed"] = True
st.rerun()
except ModelNotTrainedError:
status.update(label="❌ Setup required", state="error")
st.error("The AI models haven't been trained yet.")
st.info("Ask your administrator to run: `python run_pipeline.py --stage 1 2 3`")
st.stop()
except Exception as e:
status.update(label="❌ Error", state="error")
st.error(f"Something went wrong: {e}")
st.stop()
# =============================================================================
# RESULTS PAGE (shown after analysis)
# =============================================================================
else:
res = st.session_state["last_result"]
verdict = res.get("verdict", "UNKNOWN")
final_score = res.get("final_score", 0.0)
scores = res.get("scores", {})
confidence = res.get("confidence", "MEDIUM")
action = res.get("recommended_action", "Flag for review")
top_reasons = res.get("top_reasons", [])
missing_signals = res.get("missing_signals", [])
adv_flags = res.get("adversarial_flags", [])
wc = res.get("word_count", 0)
probas = res.get("base_model_probas", {})
votes = res.get("base_model_votes", {})
fresh_case = res.get("freshness_case", "B")
fresh_signals = res.get("freshness_signals_found", [])
deductions = res.get("deductions_applied", [])
entities = res.get("entities_found", [])
# ── Map verdict to display ──
V = {
"TRUE": {"bg":"#f0fdf4", "bdr":"#86efac", "icon":"🟢", "label":"This appears to be true", "color":"#15803d",
"explain":"Source, claims, language, and AI models all align with credible journalism."},
"UNCERTAIN": {"bg":"#fff7ed", "bdr":"#fdba74", "icon":"🟠", "label":"Uncertain — needs review", "color":"#c2410c",
"explain":"Mixed signals detected. We recommend verifying the sources yourself before sharing."},
"LIKELY FALSE": {"bg":"#fef2f2", "bdr":"#fca5a5", "icon":"🔴", "label":"Likely false", "color":"#b91c1c",
"explain":"Multiple signals indicate this content may be fabricated or misleading."},
"FALSE": {"bg":"#fef2f2", "bdr":"#fca5a5", "icon":"⛔", "label":"This looks fake", "color":"#991b1b",
"explain":"Strong evidence of misinformation. Do not share without independent verification."},
}
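# Any verdict not listed above (e.g. the "UNKNOWN" default) falls back to a neutral grey card.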
vc = V.get(verdict, {"bg":"#f8fafc","bdr":"#cbd5e1","icon":"⚪","label":verdict,"color":"#475569",
"explain":"Analysis complete."})
# ── Verdict banner ──
score_pct = final_score * 100
st.markdown(f"""
{vc['icon']}
{vc['label']}
Score: {score_pct:.0f}% · Confidence: {confidence}
{vc['explain']}
""", unsafe_allow_html=True)
# ── Recommended action badge ──
action_colors = {
"Publish": ("#f0fdf4", "#15803d"),
"Flag for review": ("#fff7ed", "#c2410c"),
"Suppress": ("#fef2f2", "#b91c1c"),
"Escalate": ("#fef2f2", "#991b1b"),
}
abg, acol = action_colors.get(action, ("#f8fafc", "#475569"))
st.markdown(f"""
Recommended: {action}
""", unsafe_allow_html=True)
# ── Tabs ──
tab_why, tab_fresh, tab_sources, tab_details = st.tabs(
["🧠 Why this verdict?", "🕐 Freshness", "🌐 Live sources", "📋 Details"]
)
# ── TAB 1: Why this verdict ──────────────────────────────────────────
with tab_why:
# ── 5-Signal Score Breakdown ──
st.markdown("#### Signal Breakdown")
SIGNAL_INFO = [
("Source", "source", "Is the outlet known and accountable?"),
("Claims", "claim", "Are facts verifiable with named entities?"),
("Language", "linguistic", "Is the writing neutral and attributed?"),
("Freshness", "freshness", "How recent is the content?"),
("AI Models", "model_vote", "What do the AI models think?"),
]
WEIGHTS = {"source": "30%", "claim": "30%", "linguistic": "20%", "freshness": "10%", "model_vote": "10%"}
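# Display-only labels; assumed to mirror the weighting applied by the stage-4
# scoring pipeline (30/30/20/10/10), not recomputed here.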
cols = st.columns(5)
for i, (label, key, desc) in enumerate(SIGNAL_INFO):
val = scores.get(key, 0.0)
pct = val * 100
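# Colour-code each signal card: green from 70%, amber from 50%, red below that.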
if pct >= 70:
col_hex = "#15803d"
elif pct >= 50:
col_hex = "#ca8a04"
else:
col_hex = "#b91c1c"
with cols[i]:
st.markdown(f"""
{pct:.0f}%
{label}
Weight: {WEIGHTS[key]}
""", unsafe_allow_html=True)
st.markdown("")
# ── Progress bars for each signal ──
for label, key, desc in SIGNAL_INFO:
val = scores.get(key, 0.0)
st.caption(f"**{label}** — {desc}")
st.progress(max(0.0, min(val, 1.0)))  # clamp to [0, 1] so st.progress never raises
st.markdown("---")
# ── Top Reasons ──
if top_reasons:
st.markdown("#### Key Factors")
NEG_HINTS = ["fake", "false", "unknown", "not", "manipulation", "adversarial",
"sensationalism", "reduces", "could not", "inconsistent", "missing"]
for r in top_reasons:
if any(neg in r.lower() for neg in NEG_HINTS):
st.markdown(f"🔴 {r}")
else:
st.markdown(f"🟢 {r}")
st.markdown("---")
# ── What did each AI model think? ──
st.markdown("#### AI Model Votes")
MODEL_NAMES = [
("Statistical", "logistic", "lr_proba"),
("Language", "lstm", "lstm_proba"),
("Deep A", "distilbert", "distilbert_proba"),
("Deep B", "roberta", "roberta_proba"),
]
mcols = st.columns(len(MODEL_NAMES))
for i, (nice_name, vote_key, pk) in enumerate(MODEL_NAMES):
vote_val = votes.get(vote_key)
prob_val = probas.get(pk)
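# A missing or NaN probability means this model did not run for this request,
# so show "Skipped" rather than a misleading number.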
with mcols[i]:
if vote_val is None or prob_val is None or np.isnan(prob_val):
st.metric(nice_name, "Skipped")
else:
lbl = "Real" if int(vote_val) == 1 else "Fake"
st.metric(nice_name, lbl, f"{prob_val*100:.0f}%")
if res.get("short_text_warning"):
st.warning("⚠️ Short article (under 50 words) — confidence is dampened.")
st.caption(f"Article length: {wc} words")
# ── TAB 2: Freshness ─────────────────────────────────────────────────
with tab_fresh:
fresh_val = scores.get("freshness", 0.5)
bar_pct = int(fresh_val * 100)
if fresh_val >= 0.70:
fbg, flbl, fdesc = "#f0fdf4", "🟢 Fresh", "This article appears to be recent."
fbar = "#16a34a"
elif fresh_val >= 0.40:
fbg, flbl, fdesc = "#fefce8", "🟡 Moderate", "Article may not be very recent."
fbar = "#ca8a04"
else:
fbg, flbl, fdesc = "#fef2f2", "🔴 Outdated", "This article appears to be old."
fbar = "#dc2626"
st.markdown(f"""
{flbl}
{fdesc}
Freshness: {fresh_val:.0%}
""", unsafe_allow_html=True)
# Case indicator
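# Case A: a publish date was available, so freshness comes from the date.
# Case B: no date, so the article text was scanned for temporal cues.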
case_label = "📅 Date-based scoring" if fresh_case == "A" else "🔎 Contextual signal scanning (no date found)"
st.markdown(f"""
Method: {case_label}
""", unsafe_allow_html=True)
# Signals found (Case B)
if fresh_case == "B" and fresh_signals:
st.markdown("**Signals detected:**")
for sig in fresh_signals:
st.markdown(f"✅ {sig}")
elif fresh_case == "B":
st.caption("No contextual freshness signals were found in the article text.")
# ── TAB 3: Live sources ──────────────────────────────────────────────
with tab_sources:
rag_data = res.get("rag_results")
source_list = []
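# rag_results may arrive either as a dict wrapping a "data" list or as a bare
# list, depending on the pipeline version; normalise both shapes.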
if isinstance(rag_data, dict):
source_list = rag_data.get("data", [])
elif isinstance(rag_data, list):
source_list = rag_data
if not source_list:
st.markdown("""
Live source check was not triggered
Live source verification runs when freshness is ambiguous.
This analysis relied on the 5-signal scoring framework instead.
""", unsafe_allow_html=True)
else:
st.caption(f"Compared against {len(source_list)} live web results.")
for item in source_list:
snippet = item.get("snippet", "")
sim = item.get("similarity", 0.0)
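# Similarity buckets: above 0.65 counts as supporting, below 0.30 as conflicting, anything between as neutral.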
if sim > 0.65:
sc_col, sc_tag = "#16a34a", "Supports"
elif sim < 0.30:
sc_col, sc_tag = "#dc2626", "Conflicts"
else:
sc_col, sc_tag = "#ca8a04", "Neutral"
st.markdown(f"""
""", unsafe_allow_html=True)
# ── TAB 4: Details ───────────────────────────────────────────────────
with tab_details:
# ── Missing Signals ──
if missing_signals:
st.markdown("#### ⚠️ Missing Signals")
for ms in missing_signals:
st.markdown(f"- {ms}")
st.markdown("")
# ── Adversarial Flags ──
if adv_flags:
st.markdown("#### 🚩 Adversarial Flags Triggered")
for af in adv_flags:
st.error(f"🚩 {af}")
st.caption("Adversarial flags cap the final score at 25% maximum.")
st.markdown("")
# ── Linguistic Deductions ──
if deductions:
st.markdown("#### 📝 Linguistic Deductions")
for d in deductions:
st.markdown(f"- {d}")
st.markdown("")
# ── Named Entities Found ──
if entities:
st.markdown("#### 🏷️ Entities Detected")
st.markdown(", ".join([f"`{e}`" for e in entities]))
q_attr = res.get("quotes_attributed", 0)
q_total = res.get("quotes_total", 0)
if q_total > 0:
st.caption(f"Quotes: {q_attr}/{q_total} attributed")
st.markdown("")
# ── Summary Table ──
st.markdown("#### Analysis Summary")
rows = [
("Verdict", vc["label"]),
("Final Score", f"{score_pct:.1f}%"),
("Confidence", confidence),
("Action", action),
("Word Count", str(wc)),
("Freshness", f"{scores.get('freshness', 0):.0%} (Case {fresh_case})"),
]
df_rep = pd.DataFrame(rows, columns=["Field", "Value"])
st.dataframe(df_rep, use_container_width=True, hide_index=True, height=240)
with st.expander("🔧 Raw JSON (for developers)"):
st.code(json.dumps(res, indent=2, default=str), language="json")
# ── Analyze another ──
st.markdown("---")
if st.button("← Analyze another article", use_container_width=True):
st.session_state["analyzed"] = False
st.session_state["last_result"] = None
st.rerun()