| import gradio as gr |
| import pandas as pd |
| import os |
| import json |
| import base64 |
| import plotly.express as px |
| import plotly.graph_objects as go |
| from core_logic import ( |
| query_bm25_index, |
| lift_at_k, |
| lift_ci, |
| compute_keyword_similarity, |
| warmup_bm25 |
| ) |
|
|
def resolve_existing_path(candidates):
    """Return the first candidate path that exists on disk.

    If none of the candidates exist, fall back to the first truthy
    candidate (so error messages still show a meaningful path); return
    "" when every candidate is falsy (e.g. unset env vars).
    """
    existing = [p for p in candidates if p and os.path.exists(p)]
    if existing:
        return existing[0]
    for fallback in candidates:
        if fallback:
            return fallback
    return ""
|
|
|
|
# Directory holding the pre-built BM25 indexes, one subdirectory per
# topic/demographic pair. The env var takes priority; the rest are known
# deployment locations (first existing one wins, see resolve_existing_path).
INDEX_DIR = resolve_existing_path([
    os.getenv('BM25_INDEX_DIR'),
    '/data/bm25_indexes',
    '/data/bm25_indexes/gt_reranking_sets',
    '/data/ecaplan/splits-storage/bm25_indexes',
    '/homes/ecaplan/gscratch1/bm25_indexes/gt_reranking_sets'
])

# Pre-computed PSLP dataset (JSON Lines) used for the background
# "hypothesis space" scatter plot; same env-var-first resolution scheme.
TH3_DATA_PATH = resolve_existing_path([
    os.getenv('PSLP_DATA_PATH'),
    '/data/keyword_similarities_with_lifts_4.1.jsonl',
    '/data/keyword_similarities_with_lifts_4.1_sample.jsonl',
    '/homes/ecaplan/gscratch2/demo_dataset/keyword_similarities_with_lifts_4.1.jsonl'
])

# Log resolved locations at startup so misconfiguration is visible
# immediately in the server output.
print(f"Resolved INDEX_DIR: {INDEX_DIR}")
print(f"Resolved TH3_DATA_PATH: {TH3_DATA_PATH}")
|
|
# Load the pre-computed PSLP dataset. Any failure degrades to an empty
# DataFrame so the app can still start without the data file present.
try:
    if os.path.exists(TH3_DATA_PATH):
        PSLP_DF = pd.read_json(TH3_DATA_PATH, lines=True)
    else:
        PSLP_DF = pd.DataFrame()
except Exception as e:
    print(f"Failed to load PSLP data from {TH3_DATA_PATH}: {e}")
    PSLP_DF = pd.DataFrame()

# Lexicons used by the "Auto-fill Creative" button. Falls back to the
# PSLP dataset when the dedicated file is unavailable.
CREATIVE_LEXICON_PATH = "/homes/ecaplan/subspace/keyword_similarities.jsonl"
try:
    CREATIVE_LEXICON_DF = pd.read_json(CREATIVE_LEXICON_PATH, lines=True)
except Exception as e:
    print(f"Failed to load creative lexicon data from {CREATIVE_LEXICON_PATH}: {e}")
    CREATIVE_LEXICON_DF = PSLP_DF
|
|
|
|
# Canned "obvious/definitional" lexicons per demographic, used by the
# "Auto-fill Trivial" button so users can see what a high-triviality
# hypothesis looks like. Keys match the demographic dropdown values.
TRIVIAL_LEXICONS = {
    "black": "I am black, as a black person, being black, black history, African American, my identity as black, black community, black people, Afro-American, black heritage, growing up black, black culture, black pride, our black community, identify as black, black experiences, black lives, black voices, black families, black neighborhoods",
    "jewish": "I am Jewish, as a Jew, being Jewish, Judaism, Jewish people, Jewish history, Jewish community, my Jewish identity, growing up Jewish, Jewish heritage, synagogue, Torah, Rabbi, Yom Kippur, Hanukkah, Passover, Kosher, Jewish traditions, Shabbat, Jewish culture",
    "catholic": "I am Catholic, as a Catholic, being Catholic, Catholicism, Catholic church, Catholic faith, Catholic community, my Catholic identity, growing up Catholic, Catholic heritage, going to Mass, Eucharist, Vatican, Pope, Catholic priest, Rosary, Catholic traditions, Catholic school, catechism, Catholic teachings",
    "hindu_jain_sikh": "I am Hindu, I am Jain, I am Sikh, as a Hindu, as a Sikh, as a Jain, Hinduism, Jainism, Sikhism, my Hindu faith, my Sikh identity, my Jain beliefs, Hindu community, Sikh community, Jain community, growing up Hindu, growing up Sikh, growing up Jain, puja, Gurdwara",
    "construction": "I work in construction, as a construction worker, in the construction industry, being in construction, construction site, construction jobs, construction projects, building trades, general contractor, construction crew, working in construction, construction management, heavy equipment, hard hat, construction materials, building site, construction field, construction experience, construction company, building contractor",
    "teacher": "I am a teacher, as an educator, being a teacher, teaching profession, working in education, in my classroom, my students, teaching experience, school teacher, education field, my teaching career, lesson plans, classroom management, as a schoolteacher, public school teacher, teaching jobs, educator experience, teaching staff, special education, elementary teacher"
}

# Triviality scores at or above this value are treated as "definitional"
# (too obvious) by generate_verdict_banner and the plots below.
TRIVIALITY_THRESHOLD = 0.65
|
|
# Optional cat icon shown next to the page title: env var first, then a
# copy sitting next to this file, then a known deployment path.
CAT_ICON_PATH = resolve_existing_path([
    os.getenv('CAT_SPLITS_ICON_PATH'),
    os.path.join(os.path.dirname(__file__), 'cat_splits.png'),
    '/homes/ecaplan/gscratch2/splits_demo_app/cat_splits.png'
])
|
|
def load_icon_data_uri(path):
    """Return *path*'s contents as a ``data:image/png`` base64 URI.

    Returns "" when the path is falsy, missing, or unreadable — the
    title then simply renders without an icon.
    """
    if not path or not os.path.exists(path):
        return ""
    try:
        with open(path, "rb") as handle:
            payload = handle.read()
    except Exception as e:
        print(f"Failed to load title icon from {path}: {e}")
        return ""
    encoded = base64.b64encode(payload).decode("ascii")
    return "data:image/png;base64," + encoded
|
|
# Embed the icon directly in the page (data URI) so no static file route
# is needed; TITLE_ICON_HTML is empty when no icon could be loaded.
CAT_ICON_SRC = load_icon_data_uri(CAT_ICON_PATH)
TITLE_ICON_HTML = (
    f"<img src='{CAT_ICON_SRC}' alt='Splits cat icon' style='height:1em; width:auto; display:inline-block; flex:0 0 auto;'/>"
    if CAT_ICON_SRC else ""
)

# Custom CSS: stretch the three lexicon helper buttons to equal height
# within their row (Gradio buttons don't fill vertically by default).
BUTTON_FILL_CSS = """
.lexicon-action-row {
    align-items: stretch !important;
}

#quick-triv-btn,
#quick-creat-btn,
#clear-btn {
    display: flex;
    align-self: stretch !important;
}

#quick-triv-btn button,
#quick-creat-btn button,
#clear-btn button {
    width: 100%;
    min-height: 44px;
    height: 100% !important;
}
"""
|
|
def get_topics_for_demos(target, contrast, index_dir=None):
    """List BM25 index choices for a (target, contrast) demographic pair.

    Scans *index_dir* (defaults to the module-level ``INDEX_DIR``) for
    subdirectories named ``<topic>_<demoA>-<demoB>`` (separator may be
    ``-`` or ``_``) where the two demographics are exactly *target* and
    *contrast* in either order.

    Returns a list of ``(display_name, dir_name)`` tuples sorted by
    display name, or a single placeholder tuple when nothing matches.
    """
    import re

    base_dir = INDEX_DIR if index_dir is None else index_dir
    if not os.path.exists(base_dir):
        return [("No indices found", "")]

    dirs = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]

    # re.escape guards against regex metacharacters in demographic names.
    demo_alt = f"{re.escape(target)}|{re.escape(contrast)}"
    pair_regex = re.compile(fr"(_({demo_alt})[-_]({demo_alt}))")

    choices = []
    for d in dirs:
        # Skip sharded/chunked index folders like "..._0-1".
        if re.search(r"_[0-9]+-[0-9]+$", d):
            continue

        match = pair_regex.search(d)
        if not match:
            continue

        left_demo = match.group(2)
        right_demo = match.group(3)
        # Require two *distinct* demographics matching the requested pair.
        if left_demo == right_demo:
            continue
        if {left_demo, right_demo} != {target, contrast}:
            continue

        # Everything before the demographic pair is the topic slug.
        topic_part = d[:match.start()]
        display_name = topic_part.replace('_', ' ').replace('-', '/')
        choices.append((display_name, d))

    choices.sort(key=lambda x: x[0])
    return choices if choices else [("No indices found for this pair", "")]
|
|
def load_seed_words():
    """Load per-demographic seed-word lists from ``demo_seed_words.json``.

    The file is resolved against the current working directory; any
    failure (missing file, bad JSON) yields an empty dict.
    """
    try:
        with open("demo_seed_words.json", "r") as handle:
            return json.load(handle)
    except Exception:
        return {}
|
|
def compute_triviality(lexicon: list, target_demo: str):
    """Score how "definitional" *lexicon* is for *target_demo*.

    Computes semantic similarity between the demographic's seed words
    and the candidate lexicon; returns the recall over the seed set
    (how much obvious vocabulary the lexicon covers). 0.0 when either
    side is empty.
    """
    seeds = load_seed_words().get(target_demo, [])
    if not (seeds and lexicon):
        return 0.0
    metrics = compute_keyword_similarity(seeds, lexicon, device='cpu')
    return metrics['Recall']
|
|
def generate_verdict_banner(lift, pval, triviality):
    """Render the HTML verdict banner for a tested lexicon.

    Four quadrants based on (trivial?, significant?): trivial means
    ``triviality >= TRIVIALITY_THRESHOLD``; significant means
    ``lift > 1.0`` with ``pval < 0.05``.
    """
    trivial = triviality >= TRIVIALITY_THRESHOLD
    significant = lift > 1.0 and pval < 0.05

    base_style = "padding:20px; border-radius:12px; text-align:center; box-shadow: 0 4px 6px rgba(0,0,0,0.1); margin-bottom: 15px;"

    if trivial:
        if significant:
            # Yellow: data supports it, but it's definitional.
            return f"<div style='{base_style} background-color: rgba(255, 193, 7, 0.2); border: 1px solid #ffe69c;'><h2 style='margin:0; color: #d39e00;'>🟡 Supported, but Trivial</h2><p style='margin-top:10px; font-size:16px;'>This lexicon successfully isolates the target demographic, but it is likely <strong>too obvious/definitional</strong> to be of interest.</p></div>"
        # Red: definitional AND no lift.
        return f"<div style='{base_style} background-color: rgba(220, 53, 69, 0.2); border: 1px solid #f5c6cb;'><h2 style='margin:0; color: #c82333;'>🔴 Trivial & Unsupported</h2><p style='margin-top:10px; font-size:16px;'>This lexicon is <strong>definitional</strong> to the target demographic, and also failed to provide significant lift for the target demographic.</p></div>"
    if significant:
        # Green: the interesting quadrant — supported and unexpected.
        return f"<div style='{base_style} background-color: rgba(40, 167, 69, 0.2); border: 1px solid #c3e6cb;'><h2 style='margin:0; color: #218838;'>🟢 Promising PSLP!</h2><p style='margin-top:10px; font-size:16px;'>This hypothesis is <strong>supported by the data</strong> (high lift) AND is <strong>unexpected</strong> (low triviality). Worthy of further study.</p></div>"
    # Gray: unexpected, but the data doesn't back it.
    return f"<div style='{base_style} background-color: rgba(108, 117, 125, 0.2); border: 1px solid #d6d8db;'><h2 style='margin:0; color: var(--body-text-color);'>⚪ Unsupported Hypothesis</h2><p style='margin-top:10px; font-size:16px;'>This lexicon is non-trivial, but the data <strong>does not support</strong> the hypothesis (it distinguishes the demographics no better than random).</p></div>"
|
|
def format_demo(name):
    """Pretty-print a demographic slug, e.g. "hindu_jain_sikh" -> "Hindu/Jain/Sikh".

    Falsy inputs (empty string, None) are returned unchanged.
    """
    if not name:
        return name
    return "/".join(map(str.capitalize, name.split('_')))
|
|
def run_evaluation(index_name, target_demo, contrast_demo, generated_words_str):
    """Evaluate a candidate lexicon against the selected BM25 index.

    Returns a 5-tuple matching the Gradio outputs: (verdict HTML,
    lift-card HTML, triviality-card HTML, plotly figure, top-hits
    DataFrame). On missing input or any error, returns an error banner
    plus empty/placeholder values so the UI never crashes.
    """
    if not index_name or not generated_words_str:
        return "<div style='color:red;'>Please select a topic and enter a lexicon.</div>", "", "", px.scatter(title="Waiting for input..."), pd.DataFrame()

    try:
        # Parse the comma-separated lexicon, dropping empty entries.
        generated_words = [w.strip() for w in generated_words_str.split(",") if w.strip()]
        index_path = os.path.join(INDEX_DIR, index_name)

        target_fmt = format_demo(target_demo)
        contrast_fmt = format_demo(contrast_demo)

        # Rank all posts in the index by BM25 score against the lexicon.
        df_results = query_bm25_index(index_path, generated_words)

        # Lift@0.5%: over-representation of the target demographic in the
        # top 0.5% of ranked posts; lift_ci supplies the p-value.
        lift_0_5_percent = lift_at_k(df_results, target_demo, k=0.005)
        pval_0_5, _, _ = lift_ci(df_results, target_demo, k=0.005)

        card_style = "background-color: var(--block-background-fill); padding:20px; border-radius:12px; border: 1px solid var(--border-color-primary); box-shadow: 0 2px 4px rgba(0,0,0,0.05); height: 100%;"

        lift_text = f"<div style='{card_style}'><h3 style='margin:0; font-size:24px; color: var(--body-text-color);'>Lift@0.5%: <span style='color:#007bff;'>{lift_0_5_percent:.2f}x</span></h3><p style='margin:8px 0; color: var(--body-text-color-subdued);'><strong style='color: inherit;'>p-value: {pval_0_5:.4f}</strong></p><p style='margin:0; font-size:14px; color: var(--body-text-color-subdued);'>This lexicon pulled <b style='color: inherit;'>{target_fmt}</b> posts to the top {lift_0_5_percent:.2f}x more than random compared to <b style='color: inherit;'>{contrast_fmt}</b>.</p></div>"

        # Triviality: semantic similarity to the target's seed words.
        triviality = compute_triviality(generated_words, target_demo)
        triv_text = f"<div style='{card_style}'><h3 style='margin:0; font-size:24px; color: var(--body-text-color);'>Triviality Score: <span style='color:#6f42c1;'>{triviality:.3f}</span></h3><p style='margin:8px 0; color: var(--body-text-color-subdued);'><strong style='color: inherit;'>Threshold: < {TRIVIALITY_THRESHOLD}</strong></p><p style='margin:0; font-size:14px; color: var(--body-text-color-subdued);'>Measures semantic similarity to the seed words of <b style='color: inherit;'>{target_fmt}</b>. Lower scores indicate a likely more unexpected PSLP.</p></div>"

        verdict = generate_verdict_banner(lift_0_5_percent, pval_0_5, triviality)

        # Overlay this result onto a copy of the cached hypothesis-space
        # plot (deepcopy so the shared figure stays clean between requests).
        import copy
        global tab3_plot
        if tab3_plot is not None:
            fig = copy.deepcopy(tab3_plot)
        else:
            fig = go.Figure()

        # Green rectangle marks the "promising" region: non-trivial (left
        # of the threshold) with lift above 1.0.
        y_axis_cap = max(3.0, float(lift_0_5_percent) * 1.2)
        fig.add_shape(type="rect", x0=0, y0=1.0, x1=TRIVIALITY_THRESHOLD, y1=y_axis_cap, fillcolor="LightGreen", opacity=0.3, layer="below", line_width=0)
        fig.add_hline(y=1.0, line_dash="dash", line_color="gray")
        fig.add_vline(x=TRIVIALITY_THRESHOLD, line_dash="dash", line_color="gray")

        # Red star = the lexicon just evaluated.
        fig.add_trace(go.Scattergl(x=[triviality], y=[lift_0_5_percent], mode='markers+text', text=['Current PSLP'], textposition="top right", showlegend=False, marker=dict(size=20, symbol='star', color='red', line=dict(width=2, color='DarkSlateGrey'))))

        fig.update_layout(title="PSLP Filtration Map", showlegend=False, xaxis_title="Triviality (Lower = More Unexpected)", yaxis_title="Lift@0.5% (Higher = Stronger Data Support)", xaxis=dict(range=[0.0, 1.0]), yaxis=dict(range=[0.0, y_axis_cap]), margin=dict(l=40, r=40, t=40, b=40))

        # Five highest-scoring posts for qualitative inspection.
        # NOTE(review): assumes query_bm25_index returns these columns —
        # confirm against core_logic.
        top_hits_df = df_results.head(5)[['id', 'score', 'demographic', 'content']]

        return verdict, lift_text, triv_text, fig, top_hits_df
    except Exception as e:
        return f"<div style='color:red; padding:10px; border:1px solid red; border-radius:5px;'>Error evaluating hypothesis: {str(e)}</div>", "", "", px.scatter(title="Error"), pd.DataFrame()
|
|
def load_tab3_data():
    """Build the background "hypothesis space" scatter of pre-computed PSLPs.

    Downsamples to roughly 3000 points (stratified by keyword_type when
    present) and draws the lift/triviality reference lines plus the green
    "promising" region. Returns a placeholder figure on any failure so
    the UI always has something to show.
    """
    try:
        if PSLP_DF.empty:
            return px.scatter(title=f"Could not find PSLP Space data.")
        df = PSLP_DF.copy()
        # Tolerate older data files that lack the expected columns.
        if "lift@0.5" not in df.columns:
            df["lift@0.5"] = df.get("lift@1.0", 0)
        if "demo1_recall" not in df.columns:
            df["demo1_recall"] = 0.5

        # Stratified downsample: roughly equal representation per
        # keyword_type, ~3000 points total, to keep the plot responsive.
        # NOTE(review): sample() has no random_state, so the background
        # plot differs between restarts — confirm this is intended.
        if 'keyword_type' in df.columns:
            types = df['keyword_type'].unique()
            if len(types) > 0:
                n_per_group = max(1, 3000 // len(types))
                sampled_dfs = []
                for t in types:
                    group = df[df['keyword_type'] == t]
                    sampled_dfs.append(group.sample(min(len(group), n_per_group)))
                df = pd.concat(sampled_dfs)

        fig = px.scatter(
            df,
            x="demo1_recall",
            y="lift@0.5",
            hover_data=["topic", "demo1", "demo2", "keyword_type", "keywords"],
            title="The Hypothesis Space (Pre-computed PSLPs)",
            labels={"demo1_recall": "Triviality", "lift@0.5": "Lift@0.5%"},
            opacity=0.4,
            color_discrete_sequence=['#4a90e2']
        )
        fig.update_traces(marker=dict(size=5, line=dict(width=0.5, color='white')))
        # Green rectangle = the "promising" quadrant (non-trivial, lift > 1).
        max_limit = df['lift@0.5'].max() if not df.empty else 2.0
        fig.add_shape(type="rect", x0=0, y0=1.0, x1=TRIVIALITY_THRESHOLD, y1=max_limit, fillcolor="LightGreen", opacity=0.3, layer="below", line_width=0)
        fig.add_hline(y=1.0, line_dash="dash", line_color="gray")
        fig.add_vline(x=TRIVIALITY_THRESHOLD, line_dash="dash", line_color="gray")
        fig.update_layout(margin=dict(l=40, r=40, t=40, b=40), showlegend=False)
        return fig
    except Exception as e:
        return px.scatter(title=f"Could not load data for the Hypothesis Space: {e}")
|
|
| print("Warming up BERT model...") |
| try: |
| warmup_choices = get_topics_for_demos("black", "teacher") |
| warmup_index_name = next((idx for _, idx in warmup_choices if idx), None) |
| if warmup_index_name: |
| warmup_index_path = os.path.join(INDEX_DIR, warmup_index_name) |
| print(f"Warming up BM25 index: {warmup_index_name}") |
| warmup_bm25(warmup_index_path, warmup_keyword="community", warmup_k=1) |
| else: |
| print("BM25 warmup skipped: no default index found") |
| except Exception as e: |
| print(f"BM25 warmup failed: {e}") |
|
|
| try: |
| compute_keyword_similarity(["test"], ["warmup"], device="cpu") |
| except Exception as e: |
| print(f"BERT warmup failed: {e}") |
|
|
| tab3_plot = load_tab3_data() |
|
|
| |
| with gr.Blocks(title="Splits! Sandbox", theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="blue"), css=BUTTON_FILL_CSS) as demo: |
| |
| gr.Markdown(f""" |
| <h1 style='display:flex; align-items:center; gap:0.35em; margin:0 0 0.2em 0;'> |
| {TITLE_ICON_HTML} |
| <span>Splits! Language & Culture Sandbox</span> |
| </h1> |
| |
| Welcome to the companion demo for **Splits!**, created by **[Eylon Caplan](https://eyloncaplan.github.io/)**. Explore how different sociocultural groups use entirely different vocabularies to discuss the *same topics*. |
| |
| <div style="margin-top: 16px; margin-bottom: 8px;"> |
| <a href="https://arxiv.org/abs/2504.04640" target="_blank" |
| style="display: inline-flex; align-items: center; gap: 8px; background-color: var(--button-secondary-background-fill); border: 1px solid var(--border-color-primary); border-radius: 999px; padding: 8px 18px; text-decoration: none; color: var(--body-text-color); font-weight: 500; font-size: 0.95em; box-shadow: 0 1px 3px rgba(0,0,0,0.05);"> |
| 📄 <b>Read the paper:</b> Splits! Flexible Sociocultural Linguistic Investigation at Scale |
| <span style="color: var(--body-text-color-subdued); margin-left: 4px;">↗</span> |
| </a> |
| </div> |
| """) |
|
|
| with gr.Accordion("📖 What is this & How to use it?", open=False): |
| gr.Markdown(""" |
| ### 🤔 What is this? |
| The way we speak is heavily influenced by our background and culture. This tool lets you explore how different groups of people (like teachers, construction workers, or people of different faiths) use entirely different vocabularies to discuss the *same topics*. |
| |
| ### 🛠️ What can it do? |
| Think of it as a search tool for testing cultural language trends. You can test your own guesses (hypotheses) about how people talk. For example, if both Jewish and Catholic people are talking about "Healthcare", do they focus on different things? |
| |
| You pick the groups, pick a topic, and type in some words. The tool will then crunch the data and tell you two things: |
| 1. 📊 **Is it true? (Lift):** Does your chosen group *actually* use these words more than the other group? |
| 2. 💡 **Is it interesting? (Triviality):** Are these words an unexpected, deep cultural insight? Or are they just boring, obvious terms (like a Catholic person using the word "church")? |
| |
| ### ⚙️ How to use |
| 1. Select a **Target** group, a **Contrast** group, and a **Discussion Topic**. |
| 2. Provide a **Candidate Lexicon** (a list of words you guess the target group uses *more* than the contrast group for this topic). |
| 3. Click **Test Hypothesis** to see if the data supports your idea! |
| |
| """) |
|
|
| |
| with gr.Row(): |
| with gr.Column(scale=1): |
| gr.Markdown("### ⚙️ Step 1: Set the Context") |
| with gr.Group(): |
| with gr.Row(): |
| target_demo = gr.Dropdown(choices=["black", "jewish", "catholic", "hindu_jain_sikh", "construction", "teacher"], label="🎯 Target Demographic", value="black", scale=2) |
| swap_btn = gr.Button("🔄 Swap", scale=1, min_width=60) |
| contrast_demo = gr.Dropdown(choices=["black", "jewish", "catholic", "hindu_jain_sikh", "construction", "teacher"], label="⚖️ Contrast Demographic", value="teacher", scale=2) |
|
|
| choices = get_topics_for_demos("black", "teacher") |
| default_idx = choices[0][1] if choices and choices[0][1] != "" else None |
| index_dropdown = gr.Dropdown(choices=choices, label="💬 Discussion Topic", value=default_idx) |
|
|
| with gr.Column(scale=1): |
| gr.Markdown("### 📝 Step 2: Define Lexicon") |
| with gr.Group(): |
| lexicon_input = gr.Textbox( |
| show_label=False, |
| placeholder="e.g. word1, phrase two, word3...", |
| info="Enter a comma-separated list of words/phrases you hypothesize the Target uses more than the Contrast in the selected Topic.", |
| lines=3 |
| ) |
| with gr.Row(elem_classes=["lexicon-action-row"]): |
| quick_triv = gr.Button("🟡 Auto-fill Trivial (Obvious)", elem_id="quick-triv-btn", scale=1) |
| quick_creat = gr.Button("✨ Auto-fill Creative (LLM-Generated)", elem_id="quick-creat-btn", scale=1) |
| clear_btn = gr.Button("🗑️ Clear", variant="secondary", elem_id="clear-btn", scale=1) |
|
|
| test_btn = gr.Button("🚀 Test Hypothesis!", variant="primary", size="lg") |
|
|
| gr.Markdown("---") |
|
|
| verdict_out = gr.HTML() |
|
|
| with gr.Row(): |
| lift_out = gr.HTML() |
| triv_out = gr.HTML() |
|
|
| plot_out = gr.Plot(value=tab3_plot) |
|
|
| with gr.Accordion("🔍 View Top Retrieved Posts (Contextualize the Lexicon)", open=False): |
| posts_out = gr.Dataframe(headers=["ID", "BM25 Score", "Demographic", "Content"], interactive=False) |
| |
| |
    def swap_demos(t, c):
        """Exchange the target and contrast dropdown selections."""
        return c, t

    swap_btn.click(fn=swap_demos, inputs=[target_demo, contrast_demo], outputs=[target_demo, contrast_demo])

    def update_idx_choices(t, c):
        """Re-scan available topic indexes whenever either demographic changes."""
        opts = get_topics_for_demos(t, c)
        # Placeholder entries carry an empty index name; select None then.
        default_val = opts[0][1] if opts and opts[0][1] != "" else None
        return gr.update(choices=opts, value=default_val)

    target_demo.change(fn=update_idx_choices, inputs=[target_demo, contrast_demo], outputs=[index_dropdown])
    contrast_demo.change(fn=update_idx_choices, inputs=[target_demo, contrast_demo], outputs=[index_dropdown])
| |
| def fill_creative_lexicon(target, contrast, index_path): |
| import re |
| if CREATIVE_LEXICON_DF.empty: |
| return "No data to sample from" |
| |
| folder_name = os.path.basename(str(index_path)) if index_path else "" |
| match = re.search(fr"(_({target}|{contrast})-({target}|{contrast}))", folder_name) |
| topic_display = "" |
| if match: |
| topic_part = folder_name[:match.start()] |
| topic_display = topic_part.replace('_', ' ').replace('-', '/') |
| |
| subset = CREATIVE_LEXICON_DF[ |
| (CREATIVE_LEXICON_DF['demo1'] == target) & (CREATIVE_LEXICON_DF['demo2'] == contrast) |
| ] |
| |
| import difflib |
| possible_topics = subset['topic'].unique() |
| matches = difflib.get_close_matches(topic_display, possible_topics, n=1, cutoff=0.4) |
| |
| if matches: |
| topic_subset = subset[subset['topic'] == matches[0]] |
| if not topic_subset.empty: |
| subset = topic_subset |
| |
| if subset.empty: |
| return "Sample word 1, sample word 2" |
| |
| row = subset.sample(1).iloc[0] |
| keywords = row['keywords'] |
| if isinstance(keywords, list): |
| return ", ".join(keywords) |
| return str(keywords) |
|
|
    # Lexicon helper buttons: trivial auto-fill, creative (LLM) auto-fill,
    # and clear.
    quick_triv.click(fn=lambda t: TRIVIAL_LEXICONS.get(t, "Trivial words..."), inputs=[target_demo], outputs=[lexicon_input])
    quick_creat.click(fn=fill_creative_lexicon, inputs=[target_demo, contrast_demo, index_dropdown], outputs=[lexicon_input])
    clear_btn.click(fn=lambda: "", outputs=[lexicon_input])

    # Main action: evaluate the lexicon against the selected index.
    test_btn.click(
        fn=run_evaluation,
        inputs=[index_dropdown, target_demo, contrast_demo, lexicon_input],
        outputs=[verdict_out, lift_out, triv_out, plot_out, posts_out]
    )
|
|
| if __name__ == "__main__": |
| port = int(os.environ.get("GRADIO_SERVER_PORT", 7860)) |
| demo.launch(server_name="0.0.0.0", server_port=port, ssr_mode=False) |