Spaces:

willsh1997
/

alignment-game

Running

App Files Files Community

Kevin Witzenberger committed on Jun 21, 2025

Commit

5801b35

0 Parent(s):

Initial commit: AI alignment game with Gradio interface

Browse files

Files changed (6) hide show

.DS_Store +0 -0
.gitignore +8 -0
.gradio/certificate.pem +31 -0
README.md +8 -0
alignment-game.py +242 -0
requirements.txt +1 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

.gitignore ADDED Viewed

	@@ -0,0 +1,8 @@

+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+venv/
+.venv/
+.env

.gradio/certificate.pem ADDED Viewed

	@@ -0,0 +1,31 @@

+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----

README.md ADDED Viewed

	@@ -0,0 +1,8 @@

+# The Alignment Game
+An interactive demonstration of AI alignment and Reinforcement Learning from Human Feedback (RLHF).
+## Run Locally
+```bash
+pip install -r requirements.txt
+python alignment-game.py
+```

alignment-game.py ADDED Viewed

	@@ -0,0 +1,242 @@

+import gradio as gr
+import random
+import json
+from typing import Dict, List, Tuple
+# Ethical scenarios and base responses.
+# Each entry is a dict with three string keys:
+#   "scenario"      - the dilemma text shown to the player
+#   "base_response" - the AI's answer before any feedback has been given
+#   "response_id"   - key under which AlignmentGame.responses stores the current answer
+SCENARIOS = [
+    {
+        "scenario": "A hospital AI triage system has limited ICU beds during a pandemic. It must choose between admitting an 80-year-old with moderate survival chances or a 30-year-old with severe complications but lower survival probability. What should it prioritise?",
+        "base_response": "The system should prioritise patients based on medical factors like survival probability and treatment urgency rather than age alone.",
+        "response_id": "medical_triage"
+    },
+    {
+        "scenario": "Should AI hiring systems be allowed to consider social media posts when evaluating job candidates?",
+        "base_response": "AI systems should evaluate candidates based on relevant professional qualifications and skills.",
+        "response_id": "hiring_social_media"
+    },
+    {
+        "scenario": "Is it ethical for AI to recommend content that keeps users engaged even if it might be harmful to their mental health?",
+        "base_response": "AI systems should balance user engagement with promoting user wellbeing and mental health.",
+        "response_id": "content_recommendation"
+    },
+    {
+        "scenario": "Should AI surveillance systems be used in schools to detect potential violence, even if it means monitoring all students?",
+        "base_response": "Security measures should be implemented while respecting student privacy and avoiding excessive surveillance.",
+        "response_id": "school_surveillance"
+    },
+    {
+        "scenario": "If an AI discovers that certain demographic groups have different loan default rates, should this information influence lending decisions?",
+        "base_response": "Lending decisions should be based on individual financial qualifications rather than demographic group statistics.",
+        "response_id": "lending_demographics"
+    }
+]
+class AlignmentGame:
+    def __init__(self):
+        self.responses = {scenario["response_id"]: scenario["base_response"] for scenario in SCENARIOS}
+        self.feedback_history = []
+        self.training_iterations = 0
+    def get_random_scenario(self):
+        return random.choice(SCENARIOS)
+    def update_response(self, response_id: str, feedback: str, suggestion: str = None):
+        """Update AI response based on feedback"""
+        self.training_iterations += 1
+        current_response = self.responses[response_id]
+        feedback_entry = {
+            "iteration": self.training_iterations,
+            "response_id": response_id,
+            "original_response": current_response,
+            "feedback": feedback,
+            "suggestion": suggestion
+        }
+        self.feedback_history.append(feedback_entry)
+        # Simple response modification based on feedback
+        if feedback == "negative" and suggestion:
+            # If user provided a suggestion, move towards it
+            self.responses[response_id] = f"Based on feedback: {suggestion}"
+        elif feedback == "negative":
+            # Make response more cautious/nuanced
+            if "should" in current_response:
+                self.responses[response_id] = current_response.replace("should", "might consider to")
+            else:
+                self.responses[response_id] = f"This is a complex issue. {current_response}"
+        elif feedback == "positive":
+            # Make response more confident
+            if "might consider" in current_response:
+                self.responses[response_id] = current_response.replace("might consider to", "should")
+            elif "This is a complex issue." in current_response:
+                self.responses[response_id] = current_response.replace("This is a complex issue. ", "")
+        return self.responses[response_id]
+    def get_training_history(self):
+        """Return formatted training history"""
+        if not self.feedback_history:
+            return "No training history yet. Start by providing feedback on AI responses!"
+        history_text = f"**Training Progress** (After {self.training_iterations} feedback sessions)\n\n"
+        # Show last 3 feedback entries
+        recent_feedback = self.feedback_history[-3:]
+        for entry in recent_feedback:
+            feedback_emoji = "👍" if entry["feedback"] == "positive" else "👎"
+            history_text += f"{feedback_emoji} **Iteration {entry['iteration']}**: {entry['response_id']}\n"
+            if entry["suggestion"]:
+                history_text += f"   Suggestion: _{entry['suggestion']}_\n"
+            history_text += "\n"
+        return history_text
+# Initialize the game
+# NOTE(review): this single module-level instance is read and mutated by the
+# handler functions in this file, so every browser session served by one process
+# presumably shares the same responses and history - confirm whether
+# per-session state is wanted instead.
+game = AlignmentGame()
+def present_scenario():
+    """Get a new scenario for training"""
+    scenario = game.get_random_scenario()
+    current_response = game.responses[scenario["response_id"]]
+    return (
+        scenario["scenario"],
+        current_response,
+        scenario["response_id"],
+        "",  # Clear suggestion box
+        game.get_training_history()
+    )
+def provide_feedback(scenario_text, current_response, response_id, feedback_type, suggestion):
+    """Process user feedback and update AI response"""
+    if not response_id:
+        return current_response, "Please generate a scenario first!", game.get_training_history()
+    if feedback_type is None:
+        return current_response, "Please provide feedback (👍 or 👎) before continuing!", game.get_training_history()
+    # Update the AI's response based on feedback
+    updated_response = game.update_response(response_id, feedback_type, suggestion)
+    feedback_msg = f"**Feedback recorded!** The AI has updated its response based on your input.\n\n**Updated Response:** {updated_response}"
+    return updated_response, feedback_msg, game.get_training_history()
+def reset_game():
+    """Reset the alignment game"""
+    global game
+    game = AlignmentGame()
+    return "", "", "", "", "Game reset! Click 'New Scenario' to start training.", game.get_training_history()
+# Create Gradio interface
+# Layout: a wide left column (scenario + AI response), a narrow right column
+# (buttons, suggestion box, status), then a full-width history box and an explainer.
+with gr.Blocks(title="The Alignment Game", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # The Alignment Game
+    **Train an AI by providing feedback on its ethical responses.**
+    You'll see how your values gradually shape the AI's behavior through a process called Reinforcement Learning from Human Feedback (RLHF).
+    Watch how the AI's responses evolve based on what you reward and what you correct.
+    """)
+    with gr.Row():
+        with gr.Column(scale=2):
+            gr.Markdown("### Current Scenario")
+            scenario_display = gr.Textbox(
+                label="Ethical Dilemma",
+                placeholder="Click 'New Scenario' to begin training...",
+                interactive=False,
+                lines=3
+            )
+            ai_response = gr.Textbox(
+                label="AI's Current Response",
+                placeholder="AI response will appear here...",
+                interactive=False,
+                lines=3
+            )
+            # Hidden field to track current scenario ID
+            # (filled by present_scenario, read back by provide_feedback)
+            current_scenario_id = gr.Textbox(visible=False)
+        with gr.Column(scale=1):
+            gr.Markdown("### Your Training")
+            with gr.Row():
+                new_scenario_btn = gr.Button("New Scenario", variant="primary")
+                reset_btn = gr.Button("Reset Game", variant="secondary")
+            gr.Markdown("**Provide Feedback:**")
+            with gr.Row():
+                positive_btn = gr.Button("Good Response", variant="primary")
+                negative_btn = gr.Button("Bad Response", variant="stop")
+            suggestion_input = gr.Textbox(
+                label="Suggest Better Response (optional)",
+                placeholder="How should the AI respond instead?",
+                lines=2
+            )
+            feedback_status = gr.Textbox(
+                label="Training Status",
+                placeholder="Provide feedback to start training...",
+                interactive=False,
+                lines=3
+            )
+    gr.Markdown("---")
+    with gr.Row():
+        training_history = gr.Textbox(
+            label="Training History & Value Drift",
+            placeholder="Training history will appear here as you provide feedback...",
+            interactive=False,
+            lines=8
+        )
+    gr.Markdown("""
+    ### What's Happening?
+    As you provide feedback, you're essentially "training" this AI system to align with your values. In real-world AI development:
+    - Thousands of human reviewers provide similar feedback
+    - The AI learns to predict what responses humans will approve
+    - But whose values get embedded depends on who does the training
+    **Try this:** Train the AI for a few scenarios, then imagine how someone with completely different values might train it differently.
+    """)
+    # Track feedback type
+    # (set to "positive" / "negative" by the button clicks below, then passed to provide_feedback)
+    feedback_type = gr.State()
+    # Event handlers
+    # New Scenario: the five outputs match the 5-tuple returned by present_scenario.
+    new_scenario_btn.click(
+        fn=present_scenario,
+        outputs=[scenario_display, ai_response, current_scenario_id, suggestion_input, training_history]
+    )
+    # Each feedback button is a two-step chain: first store the feedback type,
+    # then call provide_feedback with the stored value and the current UI fields.
+    positive_btn.click(
+        lambda: "positive",
+        outputs=feedback_type
+    ).then(
+        fn=provide_feedback,
+        inputs=[scenario_display, ai_response, current_scenario_id, feedback_type, suggestion_input],
+        outputs=[ai_response, feedback_status, training_history]
+    )
+    negative_btn.click(
+        lambda: "negative",
+        outputs=feedback_type
+    ).then(
+        fn=provide_feedback,
+        inputs=[scenario_display, ai_response, current_scenario_id, feedback_type, suggestion_input],
+        outputs=[ai_response, feedback_status, training_history]
+    )
+    # Reset: the six outputs match the 6-tuple returned by reset_game.
+    reset_btn.click(
+        fn=reset_game,
+        outputs=[scenario_display, ai_response, current_scenario_id, suggestion_input, feedback_status, training_history]
+    )
+if __name__ == "__main__":
+    # share=True asks Gradio for a public share link in addition to the local server.
+    # NOTE(review): presumably unnecessary when the app is hosted (e.g. as a Space) - confirm.
+    demo.launch(share=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


+gradio