Kevin Witzenberger committed on
Commit
5801b35
·
0 Parent(s):

Initial commit: AI alignment game with Gradio interface

Browse files
Files changed (6) hide show
  1. .DS_Store +0 -0
  2. .gitignore +8 -0
  3. .gradio/certificate.pem +31 -0
  4. README.md +8 -0
  5. alignment-game.py +242 -0
  6. requirements.txt +1 -0
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ .Python
6
+ venv/
7
+ .venv/
8
+ .env
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # The Alignment Game
2
+
3
+ An interactive demonstration of AI alignment and Reinforcement Learning from Human Feedback (RLHF).
4
+
5
+ ## Run Locally
6
+ ```bash
7
+ pip install -r requirements.txt
8
+ python alignment-game.py
alignment-game.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import random
3
+ import json
4
+ from typing import Dict, List, Tuple
5
+
6
+ # Ethical scenarios and base responses
7
# Each entry pairs an ethical dilemma ("scenario") with the response the game
# starts from ("base_response"); "response_id" is the key used to track that
# response as feedback rewrites it.
SCENARIOS = [
    dict(
        scenario="A hospital AI triage system has limited ICU beds during a pandemic. It must choose between admitting an 80-year-old with moderate survival chances or a 30-year-old with severe complications but lower survival probability. What should it prioritise?",
        base_response="The system should prioritise patients based on medical factors like survival probability and treatment urgency rather than age alone.",
        response_id="medical_triage",
    ),
    dict(
        scenario="Should AI hiring systems be allowed to consider social media posts when evaluating job candidates?",
        base_response="AI systems should evaluate candidates based on relevant professional qualifications and skills.",
        response_id="hiring_social_media",
    ),
    dict(
        scenario="Is it ethical for AI to recommend content that keeps users engaged even if it might be harmful to their mental health?",
        base_response="AI systems should balance user engagement with promoting user wellbeing and mental health.",
        response_id="content_recommendation",
    ),
    dict(
        scenario="Should AI surveillance systems be used in schools to detect potential violence, even if it means monitoring all students?",
        base_response="Security measures should be implemented while respecting student privacy and avoiding excessive surveillance.",
        response_id="school_surveillance",
    ),
    dict(
        scenario="If an AI discovers that certain demographic groups have different loan default rates, should this information influence lending decisions?",
        base_response="Lending decisions should be based on individual financial qualifications rather than demographic group statistics.",
        response_id="lending_demographics",
    ),
]
34
+
35
class AlignmentGame:
    """Keep one editable response per scenario and rewrite it from user feedback.

    This is a string-rewriting toy rather than a learning system: negative
    feedback either adopts the user's suggestion or hedges the wording, and
    positive feedback undoes the hedging one step at a time.
    """

    # Wording swapped in and out by update_response().
    _CONFIDENT = "should"
    _HEDGED = "might consider to"
    _CAUTION_PREFIX = "This is a complex issue. "
    # Number of most recent feedback entries shown by get_training_history().
    _HISTORY_WINDOW = 3

    def __init__(self):
        # response_id -> current response text, seeded from SCENARIOS.
        self.responses = {
            scenario["response_id"]: scenario["base_response"]
            for scenario in SCENARIOS
        }
        # One dict per recorded update_response() call, oldest first.
        self.feedback_history = []
        # Number of feedback entries recorded so far.
        self.training_iterations = 0

    def get_random_scenario(self):
        """Return one entry of SCENARIOS chosen at random."""
        return random.choice(SCENARIOS)

    def update_response(self, response_id: str, feedback: str, suggestion: str = None):
        """Record one piece of feedback and rewrite the stored response.

        Args:
            response_id: Key into ``self.responses``.
            feedback: ``"positive"`` or ``"negative"``. Any other value is
                recorded in the history but leaves the response unchanged.
            suggestion: Optional replacement text. Only used for negative
                feedback; ``None``, empty and whitespace-only values are
                treated as "no suggestion".

        Returns:
            The response stored for ``response_id`` after the update.

        Raises:
            KeyError: If ``response_id`` is unknown. Nothing is recorded in
                that case.
        """
        # Look the response up first so an unknown id cannot leave the
        # iteration counter out of step with the history.
        current_response = self.responses[response_id]

        # A suggestion box containing only spaces must not become the answer.
        suggestion = (suggestion or "").strip() or None

        self.training_iterations += 1
        self.feedback_history.append({
            "iteration": self.training_iterations,
            "response_id": response_id,
            "original_response": current_response,
            "feedback": feedback,
            "suggestion": suggestion,
        })

        new_response = current_response
        if feedback == "negative":
            if suggestion:
                # The user told us what to say instead: adopt it.
                new_response = f"Based on feedback: {suggestion}"
            elif self._CONFIDENT in current_response:
                # First step down: soften the wording.
                new_response = current_response.replace(self._CONFIDENT, self._HEDGED)
            elif not current_response.startswith(self._CAUTION_PREFIX):
                # Second step down: add the caveat, but only once so repeated
                # negative feedback does not stack the same sentence.
                new_response = f"{self._CAUTION_PREFIX}{current_response}"
        elif feedback == "positive":
            # Test for exactly the phrase that gets replaced; a partial match
            # would otherwise select this branch and then change nothing.
            if self._HEDGED in current_response:
                new_response = current_response.replace(self._HEDGED, self._CONFIDENT)
            elif self._CAUTION_PREFIX in current_response:
                new_response = current_response.replace(self._CAUTION_PREFIX, "")

        self.responses[response_id] = new_response
        return new_response

    def get_training_history(self):
        """Return a Markdown-style summary of the most recent feedback entries."""
        if not self.feedback_history:
            return "No training history yet. Start by providing feedback on AI responses!"

        parts = [
            f"**Training Progress** (After {self.training_iterations} feedback sessions)\n\n"
        ]
        for entry in self.feedback_history[-self._HISTORY_WINDOW:]:
            # Escapes keep the thumbs up/down emoji intact even if the file is
            # re-encoded incorrectly.
            feedback_emoji = "\U0001F44D" if entry["feedback"] == "positive" else "\U0001F44E"
            parts.append(
                f"{feedback_emoji} **Iteration {entry['iteration']}**: {entry['response_id']}\n"
            )
            if entry["suggestion"]:
                parts.append(f" Suggestion: _{entry['suggestion']}_\n")
            parts.append("\n")

        return "".join(parts)
94
+
95
# Module-level game instance; every handler below reads and mutates this one
# object.
game = AlignmentGame()


def present_scenario():
    """Pick a random scenario and return the values for the scenario widgets.

    Returns:
        A 5-tuple: scenario text, the response currently stored for that
        scenario, the scenario's response id, an empty string (clears the
        suggestion box), and the formatted training history.
    """
    picked = game.get_random_scenario()
    scenario_id = picked["response_id"]
    stored_response = game.responses[scenario_id]
    cleared_suggestion = ""

    return (
        picked["scenario"],
        stored_response,
        scenario_id,
        cleared_suggestion,
        game.get_training_history(),
    )
110
+
111
def provide_feedback(scenario_text, current_response, response_id, feedback_type, suggestion):
    """Apply the user's feedback to the current scenario's response.

    Args:
        scenario_text: Text of the displayed scenario (not used here; kept
            because the event wiring passes it).
        current_response: Response text currently shown to the user.
        response_id: Id of the displayed scenario; empty before the first
            scenario has been generated.
        feedback_type: ``"positive"``, ``"negative"`` or ``None``.
        suggestion: Optional replacement text typed by the user.

    Returns:
        A 3-tuple: the response to display, a status message, and the
        formatted training history.
    """
    if not response_id:
        return current_response, "Please generate a scenario first!", game.get_training_history()

    if feedback_type is None:
        # Escapes keep the thumbs up/down emoji intact even if the file is
        # re-encoded incorrectly.
        return (
            current_response,
            "Please provide feedback (\U0001F44D or \U0001F44E) before continuing!",
            game.get_training_history(),
        )

    # The id arrives from a client-side field, so do not let an unknown value
    # surface as an unhandled KeyError.
    if response_id not in game.responses:
        return (
            current_response,
            "Unknown scenario. Click 'New Scenario' to load a valid one.",
            game.get_training_history(),
        )

    # Treat an empty or whitespace-only suggestion box as "no suggestion".
    cleaned_suggestion = (suggestion or "").strip() or None

    # Update the AI's response based on feedback
    updated_response = game.update_response(response_id, feedback_type, cleaned_suggestion)

    feedback_msg = (
        "**Feedback recorded!** The AI has updated its response based on your input.\n\n"
        f"**Updated Response:** {updated_response}"
    )

    return updated_response, feedback_msg, game.get_training_history()
125
+
126
def reset_game():
    """Replace the module-level game with a fresh one and blank the widgets.

    Returns:
        A 6-tuple: four empty strings (scenario, response, scenario id and
        suggestion), a status message, and the training history of the new
        game.
    """
    global game
    game = AlignmentGame()

    blank = ""
    status = "Game reset! Click 'New Scenario' to start training."
    return blank, blank, blank, blank, status, game.get_training_history()
131
+
132
# Build the Gradio page: scenario/response on the left, controls on the right,
# training history underneath, then the event wiring.
with gr.Blocks(title="The Alignment Game", theme=gr.themes.Soft()) as demo:
    # Settings shared by the three-line read-only text boxes.
    read_only = {"interactive": False, "lines": 3}

    gr.Markdown("""
    # The Alignment Game

    **Train an AI by providing feedback on its ethical responses.**

    You'll see how your values gradually shape the AI's behavior through a process called Reinforcement Learning from Human Feedback (RLHF).
    Watch how the AI's responses evolve based on what you reward and what you correct.
    """)

    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### Current Scenario")
            scenario_display = gr.Textbox(
                label="Ethical Dilemma",
                placeholder="Click 'New Scenario' to begin training...",
                **read_only,
            )

            ai_response = gr.Textbox(
                label="AI's Current Response",
                placeholder="AI response will appear here...",
                **read_only,
            )

            # Invisible field carrying the id of the scenario on screen.
            current_scenario_id = gr.Textbox(visible=False)

        with gr.Column(scale=1):
            gr.Markdown("### Your Training")

            with gr.Row():
                new_scenario_btn = gr.Button("New Scenario", variant="primary")
                reset_btn = gr.Button("Reset Game", variant="secondary")

            gr.Markdown("**Provide Feedback:**")
            with gr.Row():
                positive_btn = gr.Button("Good Response", variant="primary")
                negative_btn = gr.Button("Bad Response", variant="stop")

            suggestion_input = gr.Textbox(
                label="Suggest Better Response (optional)",
                placeholder="How should the AI respond instead?",
                lines=2,
            )

            feedback_status = gr.Textbox(
                label="Training Status",
                placeholder="Provide feedback to start training...",
                **read_only,
            )

    gr.Markdown("---")

    with gr.Row():
        training_history = gr.Textbox(
            label="Training History & Value Drift",
            placeholder="Training history will appear here as you provide feedback...",
            interactive=False,
            lines=8,
        )

    gr.Markdown("""
    ### What's Happening?

    As you provide feedback, you're essentially "training" this AI system to align with your values. In real-world AI development:
    - Thousands of human reviewers provide similar feedback
    - The AI learns to predict what responses humans will approve
    - But whose values get embedded depends on who does the training

    **Try this:** Train the AI for a few scenarios, then imagine how someone with completely different values might train it differently.
    """)

    # Holds the kind of feedback chosen by the button that was just clicked.
    feedback_type = gr.State()

    def _constant(value):
        """Return a zero-argument callable that always yields ``value``."""
        return lambda: value

    # Event handlers
    new_scenario_btn.click(
        fn=present_scenario,
        outputs=[scenario_display, ai_response, current_scenario_id, suggestion_input, training_history],
    )

    # Both feedback buttons do the same two steps: store their verdict in
    # feedback_type, then run provide_feedback with it.
    for button, verdict in ((positive_btn, "positive"), (negative_btn, "negative")):
        button.click(
            _constant(verdict),
            outputs=feedback_type,
        ).then(
            fn=provide_feedback,
            inputs=[scenario_display, ai_response, current_scenario_id, feedback_type, suggestion_input],
            outputs=[ai_response, feedback_status, training_history],
        )

    reset_btn.click(
        fn=reset_game,
        outputs=[
            scenario_display,
            ai_response,
            current_scenario_id,
            suggestion_input,
            feedback_status,
            training_history,
        ],
    )

if __name__ == "__main__":
    demo.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ gradio