| import gradio as gr |
| import pandas as pd |
| import os |
| import json |
| import base64 |
| import plotly.express as px |
| import plotly.graph_objects as go |
| from core_logic import ( |
| query_bm25_index, |
| lift_at_k, |
| lift_ci, |
| compute_keyword_similarity, |
| warmup_bm25 |
| ) |
|
|
def resolve_existing_path(candidates):
    """Return the first candidate path that exists on disk.

    If none of the candidates exist, fall back to the first truthy
    candidate (so error messages still show a meaningful path); return
    "" when every candidate is falsy (e.g. unset env vars).
    """
    existing = [p for p in candidates if p and os.path.exists(p)]
    if existing:
        return existing[0]
    for fallback in candidates:
        if fallback:
            return fallback
    return ""
|
|
|
|
# Directory holding the pre-built BM25 indexes, one subdirectory per
# topic/demographic pair. The env var takes priority; the rest are known
# deployment locations (first existing one wins, see resolve_existing_path).
INDEX_DIR = resolve_existing_path([
    os.getenv('BM25_INDEX_DIR'),
    '/data/bm25_indexes',
    '/data/bm25_indexes/gt_reranking_sets',
    '/data/ecaplan/splits-storage/bm25_indexes',
    '/homes/ecaplan/gscratch1/bm25_indexes/gt_reranking_sets'
])

# Pre-computed PSLP dataset (JSON Lines) used for the background
# "hypothesis space" scatter plot; same env-var-first resolution scheme.
TH3_DATA_PATH = resolve_existing_path([
    os.getenv('PSLP_DATA_PATH'),
    '/data/keyword_similarities_with_lifts_4.1.jsonl',
    '/data/keyword_similarities_with_lifts_4.1_sample.jsonl',
    '/homes/ecaplan/gscratch2/demo_dataset/keyword_similarities_with_lifts_4.1.jsonl'
])

# Log resolved locations at startup so misconfiguration is visible
# immediately in the server output.
print(f"Resolved INDEX_DIR: {INDEX_DIR}")
print(f"Resolved TH3_DATA_PATH: {TH3_DATA_PATH}")
|
|
# Load the pre-computed PSLP dataset. Any failure degrades to an empty
# DataFrame so the app can still start without the data file present.
try:
    if os.path.exists(TH3_DATA_PATH):
        PSLP_DF = pd.read_json(TH3_DATA_PATH, lines=True)
    else:
        PSLP_DF = pd.DataFrame()
except Exception as e:
    print(f"Failed to load PSLP data from {TH3_DATA_PATH}: {e}")
    PSLP_DF = pd.DataFrame()

# Lexicons used by the "Auto-fill Creative" button. Falls back to the
# PSLP dataset when the dedicated file is unavailable.
CREATIVE_LEXICON_PATH = "/homes/ecaplan/subspace/keyword_similarities.jsonl"
try:
    CREATIVE_LEXICON_DF = pd.read_json(CREATIVE_LEXICON_PATH, lines=True)
except Exception as e:
    print(f"Failed to load creative lexicon data from {CREATIVE_LEXICON_PATH}: {e}")
    CREATIVE_LEXICON_DF = PSLP_DF
|
|
|
|
# Canned "obvious/definitional" lexicons per demographic, used by the
# "Auto-fill Trivial" button so users can see what a high-triviality
# hypothesis looks like. Keys match the demographic dropdown values.
TRIVIAL_LEXICONS = {
    "black": "I am black, as a black person, being black, black history, African American, my identity as black, black community, black people, Afro-American, black heritage, growing up black, black culture, black pride, our black community, identify as black, black experiences, black lives, black voices, black families, black neighborhoods",
    "jewish": "I am Jewish, as a Jew, being Jewish, Judaism, Jewish people, Jewish history, Jewish community, my Jewish identity, growing up Jewish, Jewish heritage, synagogue, Torah, Rabbi, Yom Kippur, Hanukkah, Passover, Kosher, Jewish traditions, Shabbat, Jewish culture",
    "catholic": "I am Catholic, as a Catholic, being Catholic, Catholicism, Catholic church, Catholic faith, Catholic community, my Catholic identity, growing up Catholic, Catholic heritage, going to Mass, Eucharist, Vatican, Pope, Catholic priest, Rosary, Catholic traditions, Catholic school, catechism, Catholic teachings",
    "hindu_jain_sikh": "I am Hindu, I am Jain, I am Sikh, as a Hindu, as a Sikh, as a Jain, Hinduism, Jainism, Sikhism, my Hindu faith, my Sikh identity, my Jain beliefs, Hindu community, Sikh community, Jain community, growing up Hindu, growing up Sikh, growing up Jain, puja, Gurdwara",
    "construction": "I work in construction, as a construction worker, in the construction industry, being in construction, construction site, construction jobs, construction projects, building trades, general contractor, construction crew, working in construction, construction management, heavy equipment, hard hat, construction materials, building site, construction field, construction experience, construction company, building contractor",
    "teacher": "I am a teacher, as an educator, being a teacher, teaching profession, working in education, in my classroom, my students, teaching experience, school teacher, education field, my teaching career, lesson plans, classroom management, as a schoolteacher, public school teacher, teaching jobs, educator experience, teaching staff, special education, elementary teacher"
}

# Triviality scores at or above this value are treated as "definitional"
# (too obvious) by generate_verdict_banner and the plots below.
TRIVIALITY_THRESHOLD = 0.65
|
|
# Optional cat icon shown next to the page title: env var first, then a
# copy sitting next to this file, then a known deployment path.
CAT_ICON_PATH = resolve_existing_path([
    os.getenv('CAT_SPLITS_ICON_PATH'),
    os.path.join(os.path.dirname(__file__), 'cat_splits.png'),
    '/homes/ecaplan/gscratch2/splits_demo_app/cat_splits.png'
])
|
|
def load_icon_data_uri(path):
    """Return *path*'s contents as a ``data:image/png`` base64 URI.

    Returns "" when the path is falsy, missing, or unreadable — the
    title then simply renders without an icon.
    """
    if not path or not os.path.exists(path):
        return ""
    try:
        with open(path, "rb") as handle:
            payload = handle.read()
    except Exception as e:
        print(f"Failed to load title icon from {path}: {e}")
        return ""
    encoded = base64.b64encode(payload).decode("ascii")
    return "data:image/png;base64," + encoded
|
|
# Embed the icon directly in the page (data URI) so no static file route
# is needed; TITLE_ICON_HTML is empty when no icon could be loaded.
CAT_ICON_SRC = load_icon_data_uri(CAT_ICON_PATH)
TITLE_ICON_HTML = (
    f"<img src='{CAT_ICON_SRC}' alt='Splits cat icon' style='height:1em; width:auto; display:inline-block; flex:0 0 auto;'/>"
    if CAT_ICON_SRC else ""
)

# Custom CSS: stretch the three lexicon helper buttons to equal height
# within their row (Gradio buttons don't fill vertically by default).
BUTTON_FILL_CSS = """
.lexicon-action-row {
    align-items: stretch !important;
}

#quick-triv-btn,
#quick-creat-btn,
#clear-btn {
    display: flex;
    align-self: stretch !important;
}

#quick-triv-btn button,
#quick-creat-btn button,
#clear-btn button {
    width: 100%;
    min-height: 44px;
    height: 100% !important;
}
"""
|
|
def get_topics_for_demos(target, contrast, index_dir=None):
    """List BM25 index choices for a (target, contrast) demographic pair.

    Scans *index_dir* (defaults to the module-level ``INDEX_DIR``) for
    subdirectories named ``<topic>_<demoA>-<demoB>`` (separator may be
    ``-`` or ``_``) where the two demographics are exactly *target* and
    *contrast* in either order.

    Returns a list of ``(display_name, dir_name)`` tuples sorted by
    display name, or a single placeholder tuple when nothing matches.
    """
    import re

    base_dir = INDEX_DIR if index_dir is None else index_dir
    if not os.path.exists(base_dir):
        return [("No indices found", "")]

    dirs = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]

    # re.escape guards against regex metacharacters in demographic names.
    demo_alt = f"{re.escape(target)}|{re.escape(contrast)}"
    pair_regex = re.compile(fr"(_({demo_alt})[-_]({demo_alt}))")

    choices = []
    for d in dirs:
        # Skip sharded/chunked index folders like "..._0-1".
        if re.search(r"_[0-9]+-[0-9]+$", d):
            continue

        match = pair_regex.search(d)
        if not match:
            continue

        left_demo = match.group(2)
        right_demo = match.group(3)
        # Require two *distinct* demographics matching the requested pair.
        if left_demo == right_demo:
            continue
        if {left_demo, right_demo} != {target, contrast}:
            continue

        # Everything before the demographic pair is the topic slug.
        topic_part = d[:match.start()]
        display_name = topic_part.replace('_', ' ').replace('-', '/')
        choices.append((display_name, d))

    choices.sort(key=lambda x: x[0])
    return choices if choices else [("No indices found for this pair", "")]
|
|
def load_seed_words():
    """Load per-demographic seed-word lists from ``demo_seed_words.json``.

    The file is resolved against the current working directory; any
    failure (missing file, bad JSON) yields an empty dict.
    """
    try:
        with open("demo_seed_words.json", "r") as handle:
            return json.load(handle)
    except Exception:
        return {}
|
|
def compute_triviality(lexicon: list, target_demo: str):
    """Score how "definitional" *lexicon* is for *target_demo*.

    Computes semantic similarity between the demographic's seed words
    and the candidate lexicon; returns the recall over the seed set
    (how much obvious vocabulary the lexicon covers). 0.0 when either
    side is empty.
    """
    seeds = load_seed_words().get(target_demo, [])
    if not (seeds and lexicon):
        return 0.0
    metrics = compute_keyword_similarity(seeds, lexicon, device='cpu')
    return metrics['Recall']
|
|
def generate_verdict_banner(lift, pval, triviality):
    """Render the HTML verdict banner for a tested lexicon.

    Four quadrants based on (trivial?, significant?): trivial means
    ``triviality >= TRIVIALITY_THRESHOLD``; significant means
    ``lift > 1.0`` with ``pval < 0.05``.
    """
    trivial = triviality >= TRIVIALITY_THRESHOLD
    significant = lift > 1.0 and pval < 0.05

    base_style = "padding:20px; border-radius:12px; text-align:center; box-shadow: 0 4px 6px rgba(0,0,0,0.1); margin-bottom: 15px;"

    if trivial:
        if significant:
            # Yellow: data supports it, but it's definitional.
            return f"<div style='{base_style} background-color: rgba(255, 193, 7, 0.2); border: 1px solid #ffe69c;'><h2 style='margin:0; color: #d39e00;'>🟡 Supported, but Trivial</h2><p style='margin-top:10px; font-size:16px;'>This lexicon successfully isolates the target demographic, but it is likely <strong>too obvious/definitional</strong> to be of interest.</p></div>"
        # Red: definitional AND no lift.
        return f"<div style='{base_style} background-color: rgba(220, 53, 69, 0.2); border: 1px solid #f5c6cb;'><h2 style='margin:0; color: #c82333;'>🔴 Trivial & Unsupported</h2><p style='margin-top:10px; font-size:16px;'>This lexicon is <strong>definitional</strong> to the target demographic, and also failed to provide significant lift for the target demographic.</p></div>"
    if significant:
        # Green: the interesting quadrant — supported and unexpected.
        return f"<div style='{base_style} background-color: rgba(40, 167, 69, 0.2); border: 1px solid #c3e6cb;'><h2 style='margin:0; color: #218838;'>🟢 Promising PSLP!</h2><p style='margin-top:10px; font-size:16px;'>This hypothesis is <strong>supported by the data</strong> (high lift) AND is <strong>unexpected</strong> (low triviality). Worthy of further study.</p></div>"
    # Gray: unexpected, but the data doesn't back it.
    return f"<div style='{base_style} background-color: rgba(108, 117, 125, 0.2); border: 1px solid #d6d8db;'><h2 style='margin:0; color: var(--body-text-color);'>⚪ Unsupported Hypothesis</h2><p style='margin-top:10px; font-size:16px;'>This lexicon is non-trivial, but the data <strong>does not support</strong> the hypothesis (it distinguishes the demographics no better than random).</p></div>"
|
|
def format_demo(name):
    """Pretty-print a demographic slug, e.g. "hindu_jain_sikh" -> "Hindu/Jain/Sikh".

    Falsy inputs (empty string, None) are returned unchanged.
    """
    if not name:
        return name
    return "/".join(map(str.capitalize, name.split('_')))
|
|
def run_evaluation(index_name, target_demo, contrast_demo, generated_words_str):
    """Evaluate a candidate lexicon against the selected BM25 index.

    Returns a 5-tuple matching the Gradio outputs: (verdict HTML,
    lift-card HTML, triviality-card HTML, plotly figure, top-hits
    DataFrame). On missing input or any error, returns an error banner
    plus empty/placeholder values so the UI never crashes.
    """
    if not index_name or not generated_words_str:
        return "<div style='color:red;'>Please select a topic and enter a lexicon.</div>", "", "", px.scatter(title="Waiting for input..."), pd.DataFrame()

    try:
        # Parse the comma-separated lexicon, dropping empty entries.
        generated_words = [w.strip() for w in generated_words_str.split(",") if w.strip()]
        index_path = os.path.join(INDEX_DIR, index_name)

        target_fmt = format_demo(target_demo)
        contrast_fmt = format_demo(contrast_demo)

        # Rank all posts in the index by BM25 score against the lexicon.
        df_results = query_bm25_index(index_path, generated_words)

        # Lift@0.5%: over-representation of the target demographic in the
        # top 0.5% of ranked posts; lift_ci supplies the p-value.
        lift_0_5_percent = lift_at_k(df_results, target_demo, k=0.005)
        pval_0_5, _, _ = lift_ci(df_results, target_demo, k=0.005)

        card_style = "background-color: var(--block-background-fill); padding:20px; border-radius:12px; border: 1px solid var(--border-color-primary); box-shadow: 0 2px 4px rgba(0,0,0,0.05); height: 100%;"

        lift_text = f"<div style='{card_style}'><h3 style='margin:0; font-size:24px; color: var(--body-text-color);'>Lift@0.5%: <span style='color:#007bff;'>{lift_0_5_percent:.2f}x</span></h3><p style='margin:8px 0; color: var(--body-text-color-subdued);'><strong style='color: inherit;'>p-value: {pval_0_5:.4f}</strong></p><p style='margin:0; font-size:14px; color: var(--body-text-color-subdued);'>This lexicon pulled <b style='color: inherit;'>{target_fmt}</b> posts to the top {lift_0_5_percent:.2f}x more than random compared to <b style='color: inherit;'>{contrast_fmt}</b>.</p></div>"

        # Triviality: semantic similarity to the target's seed words.
        triviality = compute_triviality(generated_words, target_demo)
        triv_text = f"<div style='{card_style}'><h3 style='margin:0; font-size:24px; color: var(--body-text-color);'>Triviality Score: <span style='color:#6f42c1;'>{triviality:.3f}</span></h3><p style='margin:8px 0; color: var(--body-text-color-subdued);'><strong style='color: inherit;'>Threshold: < {TRIVIALITY_THRESHOLD}</strong></p><p style='margin:0; font-size:14px; color: var(--body-text-color-subdued);'>Measures semantic similarity to the seed words of <b style='color: inherit;'>{target_fmt}</b>. Lower scores indicate a likely more unexpected PSLP.</p></div>"

        verdict = generate_verdict_banner(lift_0_5_percent, pval_0_5, triviality)

        # Overlay this result onto a copy of the cached hypothesis-space
        # plot (deepcopy so the shared figure stays clean between requests).
        import copy
        global tab3_plot
        if tab3_plot is not None:
            fig = copy.deepcopy(tab3_plot)
        else:
            fig = go.Figure()

        # Green rectangle marks the "promising" region: non-trivial (left
        # of the threshold) with lift above 1.0.
        y_axis_cap = max(3.0, float(lift_0_5_percent) * 1.2)
        fig.add_shape(type="rect", x0=0, y0=1.0, x1=TRIVIALITY_THRESHOLD, y1=y_axis_cap, fillcolor="LightGreen", opacity=0.3, layer="below", line_width=0)
        fig.add_hline(y=1.0, line_dash="dash", line_color="gray")
        fig.add_vline(x=TRIVIALITY_THRESHOLD, line_dash="dash", line_color="gray")

        # Red star = the lexicon just evaluated.
        fig.add_trace(go.Scattergl(x=[triviality], y=[lift_0_5_percent], mode='markers+text', text=['Current PSLP'], textposition="top right", showlegend=False, marker=dict(size=20, symbol='star', color='red', line=dict(width=2, color='DarkSlateGrey'))))

        fig.update_layout(title="PSLP Filtration Map", showlegend=False, xaxis_title="Triviality (Lower = More Unexpected)", yaxis_title="Lift@0.5% (Higher = Stronger Data Support)", xaxis=dict(range=[0.0, 1.0]), yaxis=dict(range=[0.0, y_axis_cap]), margin=dict(l=40, r=40, t=40, b=40))

        # Five highest-scoring posts for qualitative inspection.
        # NOTE(review): assumes query_bm25_index returns these columns —
        # confirm against core_logic.
        top_hits_df = df_results.head(5)[['id', 'score', 'demographic', 'content']]

        return verdict, lift_text, triv_text, fig, top_hits_df
    except Exception as e:
        return f"<div style='color:red; padding:10px; border:1px solid red; border-radius:5px;'>Error evaluating hypothesis: {str(e)}</div>", "", "", px.scatter(title="Error"), pd.DataFrame()
|
|
def load_tab3_data():
    """Build the background "hypothesis space" scatter of pre-computed PSLPs.

    Downsamples to roughly 3000 points (stratified by keyword_type when
    present) and draws the lift/triviality reference lines plus the green
    "promising" region. Returns a placeholder figure on any failure so
    the UI always has something to show.
    """
    try:
        if PSLP_DF.empty:
            return px.scatter(title=f"Could not find PSLP Space data.")
        df = PSLP_DF.copy()
        # Tolerate older data files that lack the expected columns.
        if "lift@0.5" not in df.columns:
            df["lift@0.5"] = df.get("lift@1.0", 0)
        if "demo1_recall" not in df.columns:
            df["demo1_recall"] = 0.5

        # Stratified downsample: roughly equal representation per
        # keyword_type, ~3000 points total, to keep the plot responsive.
        # NOTE(review): sample() has no random_state, so the background
        # plot differs between restarts — confirm this is intended.
        if 'keyword_type' in df.columns:
            types = df['keyword_type'].unique()
            if len(types) > 0:
                n_per_group = max(1, 3000 // len(types))
                sampled_dfs = []
                for t in types:
                    group = df[df['keyword_type'] == t]
                    sampled_dfs.append(group.sample(min(len(group), n_per_group)))
                df = pd.concat(sampled_dfs)

        fig = px.scatter(
            df,
            x="demo1_recall",
            y="lift@0.5",
            hover_data=["topic", "demo1", "demo2", "keyword_type", "keywords"],
            title="The Hypothesis Space (Pre-computed PSLPs)",
            labels={"demo1_recall": "Triviality", "lift@0.5": "Lift@0.5%"},
            opacity=0.4,
            color_discrete_sequence=['#4a90e2']
        )
        fig.update_traces(marker=dict(size=5, line=dict(width=0.5, color='white')))
        # Green rectangle = the "promising" quadrant (non-trivial, lift > 1).
        max_limit = df['lift@0.5'].max() if not df.empty else 2.0
        fig.add_shape(type="rect", x0=0, y0=1.0, x1=TRIVIALITY_THRESHOLD, y1=max_limit, fillcolor="LightGreen", opacity=0.3, layer="below", line_width=0)
        fig.add_hline(y=1.0, line_dash="dash", line_color="gray")
        fig.add_vline(x=TRIVIALITY_THRESHOLD, line_dash="dash", line_color="gray")
        fig.update_layout(margin=dict(l=40, r=40, t=40, b=40), showlegend=False)
        return fig
    except Exception as e:
        return px.scatter(title=f"Could not load data for the Hypothesis Space: {e}")
|
|
| print("Warming up BERT model...") |
| try: |
| warmup_choices = get_topics_for_demos("black", "teacher") |
| warmup_index_name = next((idx for _, idx in warmup_choices if idx), None) |
| if warmup_index_name: |
| warmup_index_path = os.path.join(INDEX_DIR, warmup_index_name) |
| print(f"Warming up BM25 index: {warmup_index_name}") |
| warmup_bm25(warmup_index_path, warmup_keyword="community", warmup_k=1) |
| else: |
| print("BM25 warmup skipped: no default index found") |
| except Exception as e: |
| print(f"BM25 warmup failed: {e}") |
|
|
| try: |
| compute_keyword_similarity(["test"], ["warmup"], device="cpu") |
| except Exception as e: |
| print(f"BERT warmup failed: {e}") |
|
|
| tab3_plot = load_tab3_data() |
|
|
| |
| with gr.Blocks(title="Splits! Sandbox", theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="blue"), css=BUTTON_FILL_CSS) as demo: |
| |
| gr.Markdown(f""" |
| <h1 style='display:flex; align-items:center; gap:0.35em; margin:0 0 0.2em 0;'> |
| {TITLE_ICON_HTML} |
| <span>Splits! Language & Culture Sandbox</span> |
| </h1> |
| |
| Welcome to the companion demo for **Splits!**, created by **[Eylon Caplan](https://eyloncaplan.github.io/)**. Explore how different sociocultural groups use entirely different vocabularies to discuss the *same topics*. |
| |
| <div style="margin-top: 16px; margin-bottom: 8px;"> |
| <a href="https://arxiv.org/abs/2504.04640" target="_blank" |
| style="display: inline-flex; align-items: center; gap: 8px; background-color: var(--button-secondary-background-fill); border: 1px solid var(--border-color-primary); border-radius: 999px; padding: 8px 18px; text-decoration: none; color: var(--body-text-color); font-weight: 500; font-size: 0.95em; box-shadow: 0 1px 3px rgba(0,0,0,0.05);"> |
| 📄 <b>Read the paper:</b> Splits! Flexible Sociocultural Linguistic Investigation at Scale |
| <span style="color: var(--body-text-color-subdued); margin-left: 4px;">↗</span> |
| </a> |
| </div> |
| """) |
|
|
| with gr.Accordion("📖 What is this & How to use it?", open=False): |
| gr.Markdown(""" |
| ### 🤔 What is this? |
| The way we speak is heavily influenced by our background and culture. This tool lets you explore how different groups of people (like teachers, construction workers, or people of different faiths) use entirely different vocabularies to discuss the *same topics*. |
| |
| ### 🛠️ What can it do? |
| Think of it as a search tool for testing cultural language trends. You can test your own guesses (hypotheses) about how people talk. For example, if both Jewish and Catholic people are talking about "Healthcare", do they focus on different things? |
| |
| You pick the groups, pick a topic, and type in some words. The tool will then crunch the data and tell you two things: |
| 1. 📊 **Is it true? (Lift):** Does your chosen group *actually* use these words more than the other group? |
| 2. 💡 **Is it interesting? (Triviality):** Are these words an unexpected, deep cultural insight? Or are they just boring, obvious terms (like a Catholic person using the word "church")? |
| |
| ### ⚙️ How to use |
| 1. Select a **Target** group, a **Contrast** group, and a **Discussion Topic**. |
| 2. Provide a **Candidate Lexicon** (a list of words you guess the target group uses *more* than the contrast group for this topic). |
| 3. Click **Test Hypothesis** to see if the data supports your idea! |
| |
| """) |
|
|
| |
| with gr.Row(): |
| with gr.Column(scale=1): |
| gr.Markdown("### ⚙️ Step 1: Set the Context") |
| with gr.Group(): |
| with gr.Row(): |
| target_demo = gr.Dropdown(choices=["black", "jewish", "catholic", "hindu_jain_sikh", "construction", "teacher"], label="🎯 Target Demographic", value="black", scale=2) |
| swap_btn = gr.Button("🔄 Swap", scale=1, min_width=60) |
| contrast_demo = gr.Dropdown(choices=["black", "jewish", "catholic", "hindu_jain_sikh", "construction", "teacher"], label="⚖️ Contrast Demographic", value="teacher", scale=2) |
|
|
| choices = get_topics_for_demos("black", "teacher") |
| default_idx = choices[0][1] if choices and choices[0][1] != "" else None |
| index_dropdown = gr.Dropdown(choices=choices, label="💬 Discussion Topic", value=default_idx) |
|
|
| with gr.Column(scale=1): |
| gr.Markdown("### 📝 Step 2: Define Lexicon") |
| with gr.Group(): |
| lexicon_input = gr.Textbox( |
| show_label=False, |
| placeholder="e.g. word1, phrase two, word3...", |
| info="Enter a comma-separated list of words/phrases you hypothesize the Target uses more than the Contrast in the selected Topic.", |
| lines=3 |
| ) |
| with gr.Row(elem_classes=["lexicon-action-row"]): |
| quick_triv = gr.Button("🟡 Auto-fill Trivial (Obvious)", elem_id="quick-triv-btn", scale=1) |
| quick_creat = gr.Button("✨ Auto-fill Creative (LLM-Generated)", elem_id="quick-creat-btn", scale=1) |
| clear_btn = gr.Button("🗑️ Clear", variant="secondary", elem_id="clear-btn", scale=1) |
|
|
| test_btn = gr.Button("🚀 Test Hypothesis!", variant="primary", size="lg") |
|
|
| gr.Markdown("---") |
|
|
| verdict_out = gr.HTML() |
|
|
| with gr.Row(): |
| lift_out = gr.HTML() |
| triv_out = gr.HTML() |
|
|
| plot_out = gr.Plot(value=tab3_plot) |
|
|
| with gr.Accordion("🔍 View Top Retrieved Posts (Contextualize the Lexicon)", open=False): |
| posts_out = gr.Dataframe(headers=["ID", "BM25 Score", "Demographic", "Content"], interactive=False) |
| |
| |
    def swap_demos(t, c):
        """Exchange the target and contrast dropdown selections."""
        return c, t

    swap_btn.click(fn=swap_demos, inputs=[target_demo, contrast_demo], outputs=[target_demo, contrast_demo])

    def update_idx_choices(t, c):
        """Re-scan available topic indexes whenever either demographic changes."""
        opts = get_topics_for_demos(t, c)
        # Placeholder entries carry an empty index name; select None then.
        default_val = opts[0][1] if opts and opts[0][1] != "" else None
        return gr.update(choices=opts, value=default_val)

    target_demo.change(fn=update_idx_choices, inputs=[target_demo, contrast_demo], outputs=[index_dropdown])
    contrast_demo.change(fn=update_idx_choices, inputs=[target_demo, contrast_demo], outputs=[index_dropdown])
| |
| def fill_creative_lexicon(target, contrast, index_path): |
| import re |
| if CREATIVE_LEXICON_DF.empty: |
| return "No data to sample from" |
| |
| folder_name = os.path.basename(str(index_path)) if index_path else "" |
| match = re.search(fr"(_({target}|{contrast})-({target}|{contrast}))", folder_name) |
| topic_display = "" |
| if match: |
| topic_part = folder_name[:match.start()] |
| topic_display = topic_part.replace('_', ' ').replace('-', '/') |
| |
| subset = CREATIVE_LEXICON_DF[ |
| (CREATIVE_LEXICON_DF['demo1'] == target) & (CREATIVE_LEXICON_DF['demo2'] == contrast) |
| ] |
| |
| import difflib |
| possible_topics = subset['topic'].unique() |
| matches = difflib.get_close_matches(topic_display, possible_topics, n=1, cutoff=0.4) |
| |
| if matches: |
| topic_subset = subset[subset['topic'] == matches[0]] |
| if not topic_subset.empty: |
| subset = topic_subset |
| |
| if subset.empty: |
| return "Sample word 1, sample word 2" |
| |
| row = subset.sample(1).iloc[0] |
| keywords = row['keywords'] |
| if isinstance(keywords, list): |
| return ", ".join(keywords) |
| return str(keywords) |
|
|
    # Lexicon helper buttons: trivial auto-fill, creative (LLM) auto-fill,
    # and clear.
    quick_triv.click(fn=lambda t: TRIVIAL_LEXICONS.get(t, "Trivial words..."), inputs=[target_demo], outputs=[lexicon_input])
    quick_creat.click(fn=fill_creative_lexicon, inputs=[target_demo, contrast_demo, index_dropdown], outputs=[lexicon_input])
    clear_btn.click(fn=lambda: "", outputs=[lexicon_input])

    # Main action: evaluate the lexicon against the selected index.
    test_btn.click(
        fn=run_evaluation,
        inputs=[index_dropdown, target_demo, contrast_demo, lexicon_input],
        outputs=[verdict_out, lift_out, triv_out, plot_out, posts_out]
    )
|
|
| if __name__ == "__main__": |
| port = int(os.environ.get("GRADIO_SERVER_PORT", 7860)) |
| demo.launch(server_name="0.0.0.0", server_port=port, ssr_mode=False) |