ChrisHarig committed on
Commit
4bbce0b
·
verified ·
1 Parent(s): 3e17528

Upload evaluators/runner.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. evaluators/runner.py +837 -0
evaluators/runner.py ADDED
@@ -0,0 +1,837 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generic evaluator runner for OSWorld Control tasks — runs inside the VM.
2
+
3
+ This module is uploaded to the VM at /opt/evaluators/ alongside the metric
4
+ and getter modules (chrome.py, pdf.py, getters_chrome.py, getters_file.py, etc.).
5
+ It handles:
6
+ - Postconfig steps (activate window, save file, etc.)
7
+ - Result/expected data retrieval via GETTER_REGISTRY
8
+ - Metric dispatch via METRIC_REGISTRY
9
+ - JSON score output
10
+
11
+ Usage from the scorer (host-side, via sandbox().exec):
12
+ result = await sandbox().exec([
13
+ "/opt/osworld-venv/bin/python", "/opt/evaluators/runner.py", json.dumps(evaluator_config)
14
+ ])
15
+ score = json.loads(result.stdout)["score"]
16
+
17
+ Or from a per-task script:
18
+ from runner import evaluate
19
+ result = evaluate(config)
20
+ """
21
+
22
+ import json
23
+ import logging
24
+ import os
25
+ import platform
26
+ import subprocess
27
+ import sys
28
+ import tempfile
29
+ import time
30
+ from collections.abc import Callable
31
+ from typing import Any
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # Env stub — satisfies upstream OSWorld's getter signatures when running
37
+ # inside the VM. Upstream passes an Env object that holds VM metadata and
38
+ # controllers for HTTP-driven remote execution. We are already inside the
39
+ # VM so "remote" operations become direct local calls.
40
+ # ---------------------------------------------------------------------------
41
+
42
+
43
+ def _python_result(returncode: int, stdout: str, stderr: str) -> dict:
44
+ return {
45
+ "status": "success" if returncode == 0 else "error",
46
+ "output": stdout,
47
+ "error": stderr,
48
+ "returncode": returncode,
49
+ }
50
+
51
+
52
+ class _Controller:
53
+ """Upstream env.controller stub.
54
+
55
+ Upstream's controller talks to a Flask API in the VM; we are the VM,
56
+ so we run subprocess directly.
57
+ """
58
+
59
+ def execute_python_command(self, command):
60
+ cmd = command if isinstance(command, list) else [sys.executable, "-c", command]
61
+ try:
62
+ r = subprocess.run(
63
+ cmd,
64
+ capture_output=True,
65
+ text=True,
66
+ timeout=60,
67
+ env={**os.environ, "DISPLAY": ":0"},
68
+ )
69
+ return _python_result(r.returncode, r.stdout, r.stderr)
70
+ except Exception as e:
71
+ return _python_result(1, "", str(e))
72
+
73
+ def get_file(self, path: str) -> bytes | None:
74
+ try:
75
+ with open(path, "rb") as f:
76
+ return f.read()
77
+ except OSError:
78
+ return None
79
+
80
+ def get_terminal_output(self) -> str:
81
+ return ""
82
+
83
+ def get_vm_desktop_path(self) -> str:
84
+ return os.path.expanduser("~/Desktop")
85
+
86
+ def get_vm_directory_tree(self, path: str) -> dict:
87
+ def walk(p):
88
+ try:
89
+ entries = []
90
+ for name in sorted(os.listdir(p)):
91
+ full = os.path.join(p, name)
92
+ if os.path.isdir(full):
93
+ entries.append(
94
+ {"name": name, "type": "directory", "children": walk(full)}
95
+ )
96
+ else:
97
+ entries.append({"name": name, "type": "file"})
98
+ return entries
99
+ except OSError:
100
+ return []
101
+
102
+ return {
103
+ "name": os.path.basename(path),
104
+ "type": "directory",
105
+ "children": walk(path),
106
+ }
107
+
108
+ def get_accessibility_tree(self):
109
+ try:
110
+ r = subprocess.run(
111
+ [
112
+ "bash",
113
+ "-c",
114
+ "xdotool getactivewindow getwindowname 2>/dev/null || true",
115
+ ],
116
+ capture_output=True,
117
+ text=True,
118
+ timeout=5,
119
+ )
120
+ return f"<tree><window>{r.stdout.strip()}</window></tree>"
121
+ except Exception:
122
+ return None
123
+
124
+ def get_vm_screen_size(self) -> dict:
125
+ return {"width": 1920, "height": 1080}
126
+
127
+ def get_vm_wallpaper(self) -> bytes | None:
128
+ paths = [
129
+ os.path.expanduser("~/.config/gnome-background.jpg"),
130
+ "/usr/share/backgrounds/warty-final-ubuntu.png",
131
+ ]
132
+ for p in paths:
133
+ if os.path.exists(p):
134
+ try:
135
+ with open(p, "rb") as f:
136
+ return f.read()
137
+ except OSError:
138
+ pass
139
+ return None
140
+
141
+ def get_vm_window_size(self) -> dict:
142
+ return self.get_vm_screen_size()
143
+
144
+
145
class _SetupController:
    """Stub for upstream's setup controller; only window activation is used."""

    def _activate_window_setup(self, window_name: str) -> None:
        """Best-effort: focus the first window whose title matches.

        The trailing `|| true` swallows xdotool failures (no match, no X
        server).  NOTE(review): window_name is interpolated into a shell
        string with only single quotes — assumed to come from trusted task
        configs; confirm before accepting external input here.
        """
        shell_cmd = (
            f"xdotool search --name '{window_name}' windowactivate 2>/dev/null || true"
        )
        subprocess.run(["bash", "-c", shell_cmd], capture_output=True, timeout=10)
156
+
157
+
158
class Env:
    """In-VM stub for upstream OSWorld's env object.

    Provides every attribute referenced by the vendored getter functions.
    """

    # Host identity — evaluated once, at class-definition time.
    vm_machine = platform.machine()
    vm_platform = platform.system()
    # "Remote" VM is ourselves.
    vm_ip = "localhost"

    # Service ports the getters may probe (Chrome DevTools, Flask server, VLC).
    chromium_port = 1337
    server_port = 5000
    vlc_port = 8080

    cache_dir = "/tmp/osworld_cache"
    current_use_proxy = False

    def __init__(self):
        # Direct-execution stand-ins for upstream's HTTP-backed controllers.
        self.controller = _Controller()
        self.setup_controller = _SetupController()
        # Presumably scratch mappings some getters expect — TODO confirm
        # against the vendored getter modules.
        self.getter: dict[str, Any] = {}
        self.getters: dict[str, Any] = {}
        os.makedirs(self.cache_dir, exist_ok=True)
179
+
180
+
181
# Module-level singleton passed as the `env` argument to every getter call.
_ENV = Env()
182
+
183
+
184
+ # ---------------------------------------------------------------------------
185
+ # METRIC_REGISTRY — mirrors upstream desktop_env/evaluators/metrics/__init__.py
186
+ # ---------------------------------------------------------------------------
187
+
188
# Maps a metric function name (the evaluator config's "func" value) to its
# callable; populated at import time by _register_metrics() below.
METRIC_REGISTRY: dict[str, Callable[..., float]] = {}
189
+
190
+
191
def _register_metrics() -> None:
    """Import every metric module and populate METRIC_REGISTRY.

    Uses the same function list as upstream's metrics/__init__.py. If any
    module fails to import (missing dep), the error propagates — the bake
    script is supposed to guarantee every dep is present.

    Registration works by snapshotting ``locals()``: every imported metric
    function (plus the local ``infeasible`` placeholder) is a non-underscore
    callable in this function's local scope.
    """
    from basic_os import (
        check_gnome_favorite_apps,
        check_moved_jpgs,
        check_text_enlarged,
        is_in_vm_clickboard,
        is_utc_0,
    )
    from chrome import (
        check_enabled_experiments,
        check_font_size,
        check_history_deleted,
        compare_archive,
        compare_htmls,
        compare_pdf_images,
        compare_pdfs,
        is_added_to_steam_cart,
        is_cookie_deleted,
        is_expected_active_tab,
        is_expected_active_tab_approximate,
        is_expected_bookmarks,
        is_expected_installed_extensions,
        is_expected_search_query,
        is_expected_tabs,
        is_expected_url_pattern_match,
        is_shortcut_on_desktop,
    )
    from docs import (
        check_file_exists,
        check_highlighted_words,
        check_italic_font_size_14,
        check_no_duplicates,
        check_tabstops,
        compare_contains_image,
        compare_docx_files,
        compare_docx_files_and_ignore_new_lines,
        compare_docx_images,
        compare_docx_lines,
        compare_docx_tables,
        compare_font_names,
        compare_highlighted_text,
        compare_image_text,
        compare_init_lines,
        compare_insert_equation,
        compare_line_spacing,
        compare_references,
        compare_subscript_contains,
        compare_unique_train_records,
        contains_page_break,
        evaluate_alignment,
        evaluate_colored_words_in_tables,
        evaluate_conversion,
        evaluate_spacing,
        evaluate_strike_through_last_paragraph,
        find_default_font,
        get_unique_train_ids,
        has_page_numbers_in_footers,
        is_first_line_centered,
    )
    from general import (
        check_accessibility_tree,
        check_csv,
        check_direct_json_object,
        check_include_exclude,
        check_json,
        check_line_number,
        check_list,
        compare_python_pure_text,
        compare_terminal_and_txt,
        compare_time_in_speedtest_results,
        diff_text_file,
        exact_match,
        file_contains,
        fuzzy_match,
        fuzzy_place_math,
        is_gold_text_included_in_pdf,
        is_in_list,
        is_included_all_json_objects,
        literal_match,
        match_in_list,
        run_sqlite3,
    )
    from gimp import (
        check_brightness_decrease_and_structure_sim,
        check_config_status,
        check_contrast_increase_and_structure_sim,
        check_file_exists_and_structure_sim,
        check_green_background,
        check_image_file_size,
        check_image_mirror,
        check_image_size,
        check_palette_and_structure_sim,
        check_saturation_increase_and_structure_sim,
        check_sharper,
        check_structure_sim,
        check_structure_sim_resized,
        check_structure_sim_with_threshold,
        check_textbox_on_leftside,
        check_triangle_position,
        compare_image_list,
        compare_triangle_positions,
        decrease_brightness,
        increase_saturation,
    )
    from libreoffice import check_libre_locale
    from others import check_mp3_meta, compare_epub
    from pdf import check_pdf_pages
    from slides import (
        check_auto_saving_time,
        check_image_stretch_and_center,
        check_left_panel,
        check_page_number_colors,
        check_presenter_console_disable,
        check_slide_numbers_color,
        check_slide_orientation_Portrait,
        check_strikethrough,
        check_transition,
        compare_pptx_files,
        evaluate_presentation_fill_to_rgb_distance,
    )
    from table import (
        compare_conference_city_in_order,
        compare_csv,
        compare_table,
    )
    from thunderbird import (
        check_thunderbird_filter,
        check_thunderbird_folder,
        check_thunderbird_prefs,
    )
    from vlc import (
        check_global_key_play_pause,
        check_one_instance_when_started_from_file,
        check_play_and_exit,
        check_qt_bgcone,
        check_qt_max_volume,
        check_qt_minimal_view,
        check_qt_slider_colours,
        compare_audios,
        compare_images,
        compare_videos,
        is_vlc_fullscreen,
        is_vlc_playing,
        is_vlc_recordings_folder,
    )
    from vscode import (
        check_html_background_image,
        check_json_keybindings,
        check_json_settings,
        check_python_file_by_gold_file,
        check_python_file_by_test_suite,
        compare_answer,
        compare_config,
        compare_result_files,
        compare_text_file,
        compare_zip_files,
        is_extension_installed,
    )

    # Placeholder metric so "infeasible" resolves in the registry; returns
    # None (falsy).  NOTE(review): _evaluate_inner short-circuits the exact
    # string "infeasible" before any registry lookup, so this body is likely
    # never executed — confirm.
    def infeasible():
        pass

    # Snapshot taken before the loop variables exist, so only the imported
    # metrics and `infeasible` are registered.
    for name, obj in list(locals().items()):
        if callable(obj) and not name.startswith("_"):
            METRIC_REGISTRY[name] = obj
362
+
363
+
364
+ _register_metrics()
365
+
366
+
367
+ # ---------------------------------------------------------------------------
368
+ # GETTER_REGISTRY — mirrors upstream desktop_env/evaluators/getters/__init__.py
369
+ # The type string from evaluator config (e.g. "open_tabs_info") maps to the
370
+ # upstream function name (e.g. "get_open_tabs_info"). We build the mapping
371
+ # explicitly so the dataset's type strings match.
372
+ # ---------------------------------------------------------------------------
373
+
374
# Maps the evaluator config "type" string to the vendored getter callable;
# populated at import time by _register_getters() below.
GETTER_REGISTRY: dict[str, Callable[..., Any]] = {}
375
+
376
+
377
def _register_getters() -> None:
    """Import every getter module and populate GETTER_REGISTRY.

    The registry keys are the dataset's "type" strings; each maps to the
    upstream function named "get_<type>" (with a handful of aliases, e.g.
    both "rule" and "rules" resolve to get_rule). Import errors propagate,
    mirroring _register_metrics.
    """
    from getters_calc import get_conference_city_in_order
    from getters_chrome import (
        get_active_tab_html_parse,
        get_active_tab_info,
        get_active_tab_url_parse,
        get_active_url_from_accessTree,
        get_bookmarks,
        get_chrome_appearance_mode_ui,
        get_chrome_color_scheme,
        get_chrome_font_size,
        get_chrome_language,
        get_cookie_data,
        get_data_delete_automacally,
        get_default_search_engine,
        get_enable_do_not_track,
        get_enable_enhanced_safety_browsing,
        get_enable_safe_browsing,
        get_enabled_experiments,
        get_find_installed_extension_name,
        get_find_unpacked_extension_path,
        get_googledrive_file,
        get_gotoRecreationPage_and_get_html_content,
        get_history,
        get_info_from_website,
        get_macys_product_url_parse,
        get_new_startup_page,
        get_number_of_search_results,
        get_open_tabs_info,
        get_page_info,
        get_pdf_from_url,
        get_profile_name,
        get_shortcuts_on_desktop,
        get_url_dashPart,
        get_url_path_parse,
    )
    from getters_file import (
        get_cache_file,
        get_cloud_file,
        get_content_from_vm_file,
        get_vm_file,
    )
    from getters_general import (
        get_vm_command_error,
        get_vm_command_line,
        get_vm_terminal_output,
    )
    from getters_gimp import get_gimp_config_file
    from getters_impress import get_audio_in_slide, get_background_image_in_slide
    from getters_info import (
        get_list_directory,
        get_vm_screen_size,
        get_vm_wallpaper,
        get_vm_window_size,
    )
    from getters_misc import (
        get_accessibility_tree,
        get_rule,
        get_rule_relativeTime,
        get_time_diff_range,
    )
    from getters_replay import get_replay
    from getters_vlc import (
        get_default_video_player,
        get_vlc_config,
        get_vlc_playing_info,
    )
    from getters_vscode import get_vscode_config

    GETTER_REGISTRY.update(
        {
            # calc
            "conference_city_in_order": get_conference_city_in_order,
            # chrome
            "active_tab_html_parse": get_active_tab_html_parse,
            "active_tab_info": get_active_tab_info,
            "active_tab_url_parse": get_active_tab_url_parse,
            "active_url_from_accessTree": get_active_url_from_accessTree,
            "bookmarks": get_bookmarks,
            "chrome_appearance_mode_ui": get_chrome_appearance_mode_ui,
            "chrome_color_scheme": get_chrome_color_scheme,
            "chrome_font_size": get_chrome_font_size,
            "chrome_language": get_chrome_language,
            "cookie_data": get_cookie_data,
            # NOTE: "automacally" misspelling matches the upstream dataset key.
            "data_delete_automacally": get_data_delete_automacally,
            "default_search_engine": get_default_search_engine,
            "enable_do_not_track": get_enable_do_not_track,
            "enable_enhanced_safety_browsing": get_enable_enhanced_safety_browsing,
            "enable_safe_browsing": get_enable_safe_browsing,
            "enabled_experiments": get_enabled_experiments,
            "find_installed_extension_name": get_find_installed_extension_name,
            "find_unpacked_extension_path": get_find_unpacked_extension_path,
            "googledrive_file": get_googledrive_file,
            "gotoRecreationPage_and_get_html_content": get_gotoRecreationPage_and_get_html_content,
            "history": get_history,
            "info_from_website": get_info_from_website,
            "macys_product_url_parse": get_macys_product_url_parse,
            "new_startup_page": get_new_startup_page,
            "number_of_search_results": get_number_of_search_results,
            "open_tabs_info": get_open_tabs_info,
            "page_info": get_page_info,
            "pdf_from_url": get_pdf_from_url,
            "profile_name": get_profile_name,
            "shortcuts_on_desktop": get_shortcuts_on_desktop,
            "url_dashPart": get_url_dashPart,
            "url_path_parse": get_url_path_parse,
            # file
            "cache_file": get_cache_file,
            "cloud_file": get_cloud_file,
            "content_from_vm_file": get_content_from_vm_file,
            "vm_file": get_vm_file,
            # general
            "vm_command_error": get_vm_command_error,
            "vm_command_line": get_vm_command_line,
            "vm_terminal_output": get_vm_terminal_output,
            # gimp
            "gimp_config_file": get_gimp_config_file,
            # impress
            "audio_in_slide": get_audio_in_slide,
            "background_image_in_slide": get_background_image_in_slide,
            # info
            "list_directory": get_list_directory,
            "vm_screen_size": get_vm_screen_size,
            "vm_wallpaper": get_vm_wallpaper,
            "vm_window_size": get_vm_window_size,
            # misc — both singular and plural type strings map to get_rule.
            "accessibility_tree": get_accessibility_tree,
            "rule": get_rule,
            "rules": get_rule,
            "rule_relativeTime": get_rule_relativeTime,
            "time_diff_range": get_time_diff_range,
            # replay
            "replay": get_replay,
            # vlc
            "default_video_player": get_default_video_player,
            "vlc_config": get_vlc_config,
            "vlc_playing_info": get_vlc_playing_info,
            # vscode
            "vscode_config": get_vscode_config,
        }
    )
518
+
519
+
520
+ _register_getters()
521
+
522
+
523
+ # ---------------------------------------------------------------------------
524
+ # Contract enforcement — fail loudly on VM startup if the registries drift
525
+ # from the shared manifest (evaluators/manifest.py). Catches a new metric
526
+ # added to the manifest but not wired into the runner, or an existing
527
+ # function being accidentally renamed.
528
+ # ---------------------------------------------------------------------------
529
+
530
+ from manifest import GETTER_TYPES, METRIC_NAMES # noqa: E402
531
+
532
+
533
def _verify_contract() -> None:
    """Raise RuntimeError if the registries drift from the shared manifest.

    Compares METRIC_REGISTRY / GETTER_REGISTRY against the manifest's
    METRIC_NAMES / GETTER_TYPES in both directions (missing entries and
    un-manifested extras) and reports every discrepancy at once.
    """
    registered_metrics = set(METRIC_REGISTRY)
    registered_getters = set(GETTER_REGISTRY)

    problems: list[str] = []

    missing = METRIC_NAMES - registered_metrics
    if missing:
        problems.append(f"METRIC_REGISTRY missing {sorted(missing)}")

    extra = registered_metrics - METRIC_NAMES
    if extra:
        problems.append(
            f"METRIC_REGISTRY has un-manifested entries {sorted(extra)}"
        )

    missing = GETTER_TYPES - registered_getters
    if missing:
        problems.append(f"GETTER_REGISTRY missing {sorted(missing)}")

    extra = registered_getters - GETTER_TYPES
    if extra:
        problems.append(
            f"GETTER_REGISTRY has un-manifested entries {sorted(extra)}"
        )

    if problems:
        raise RuntimeError(
            "runner.py contract violation — manifest and registry disagree: "
            + " | ".join(problems)
        )
558
+
559
+
560
+ _verify_contract()
561
+
562
+
563
+ # ---------------------------------------------------------------------------
564
+ # Data dispatch
565
+ # ---------------------------------------------------------------------------
566
+
567
+
568
def get_data(config: dict[str, Any]) -> Any:
    """Look up and invoke the getter named by ``config["type"]``.

    Returns the getter's result, or None when the config is not a dict,
    names no type, names an unregistered type, or the getter raises. All
    failures are logged as warnings rather than propagated.
    """
    if not isinstance(config, dict):
        return None

    data_type = config.get("type", "")
    if data_type is None or data_type == "":
        return None

    try:
        getter = GETTER_REGISTRY[data_type]
    except KeyError:
        logger.warning("Unsupported getter type: %s", data_type)
        return None

    try:
        return getter(_ENV, config)
    except Exception as exc:
        logger.warning("Getter %s failed: %s", data_type, exc)
        return None
584
+
585
+
586
+ # ---------------------------------------------------------------------------
587
+ # Postconfig — steps that run before evaluation (save file, activate window)
588
+ # ---------------------------------------------------------------------------
589
+
590
+
591
def run_postconfig(postconfig: list[dict]) -> None:
    """Execute pre-evaluation setup steps inside the VM.

    Each *step* is a dict with a "type" and optional "parameters". Supported
    types: activate_window, sleep, execute/command, launch, close_window,
    download, open. Unknown step types are silently skipped. Subprocess
    calls capture output and do not check exit codes, so failed commands are
    ignored; NOTE(review): subprocess.TimeoutExpired would still propagate
    to the caller — confirm that is intended.

    NOTE(review): window names, commands, and paths are interpolated into
    shell strings with at most single quotes — assumed to come from trusted
    task configs.
    """
    # X11 display so GUI tools (xdotool, xdg-open) can reach the desktop.
    env = {**os.environ, "DISPLAY": ":0"}
    for step in postconfig:
        step_type = step.get("type", "")
        params = step.get("parameters", {})

        if step_type == "activate_window":
            # Focus the first window whose title matches; `|| true` makes
            # "no match" a no-op.
            window_name = params.get("window_name", "")
            subprocess.run(
                [
                    "bash",
                    "-c",
                    f"xdotool search --name '{window_name}' windowactivate 2>/dev/null || true",
                ],
                capture_output=True,
                env=env,
                timeout=10,
            )

        elif step_type == "sleep":
            time.sleep(params.get("seconds", 1))

        elif step_type in ("execute", "command"):
            # String commands run through bash; list commands run directly.
            cmd = params.get("command", [])
            if isinstance(cmd, str):
                subprocess.run(
                    ["bash", "-c", cmd], capture_output=True, env=env, timeout=30
                )
            elif cmd:
                subprocess.run(cmd, capture_output=True, env=env, timeout=30)

        elif step_type == "launch":
            # Detach via setsid and background the process, then give the
            # application time to start.
            cmd = params.get("command", [])
            if cmd:
                cmd_str = " ".join(cmd) if isinstance(cmd, list) else cmd
                subprocess.run(
                    ["bash", "-c", f"setsid {cmd_str} > /dev/null 2>&1 &"],
                    capture_output=True,
                    env=env,
                    timeout=10,
                )
                time.sleep(5)

        elif step_type == "close_window":
            window_name = params.get("window_name", "")
            subprocess.run(
                [
                    "bash",
                    "-c",
                    f"xdotool search --name '{window_name}' windowclose 2>/dev/null || true",
                ],
                capture_output=True,
                env=env,
                timeout=10,
            )

        elif step_type == "download":
            # Fetch each {"url", "path"} entry, creating parent directories
            # as needed; entries missing either field are skipped.
            for f in params.get("files", []):
                url = f.get("url", "")
                path = f.get("path", "")
                if url and path:
                    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
                    subprocess.run(
                        ["wget", "-q", "-O", path, url],
                        capture_output=True,
                        timeout=120,
                    )

        elif step_type == "open":
            # Open with the default application, backgrounded, then wait
            # for it to appear.
            path = params.get("path", "")
            if path:
                subprocess.run(
                    ["bash", "-c", f"xdg-open '{path}' > /dev/null 2>&1 &"],
                    capture_output=True,
                    env=env,
                    timeout=10,
                )
                time.sleep(5)
669
+
670
+
671
+ # ---------------------------------------------------------------------------
672
+ # Metric dispatch
673
+ # ---------------------------------------------------------------------------
674
+
675
+
676
+ def call_metric(
677
+ metric_fn: Callable,
678
+ func_name: str,
679
+ result_data: Any,
680
+ expected_data: Any,
681
+ expected_cfg: dict | None,
682
+ options: dict,
683
+ ) -> float:
684
+ """Call a metric function with the argument shape upstream expects."""
685
+ if expected_cfg and expected_cfg.get("type") in ("rule", "rules"):
686
+ rules = expected_cfg.get("rules", expected_cfg)
687
+ return (
688
+ metric_fn(result_data, rules, **options)
689
+ if options
690
+ else metric_fn(result_data, rules)
691
+ )
692
+
693
+ if expected_data is not None:
694
+ return (
695
+ metric_fn(result_data, expected_data, **options)
696
+ if options
697
+ else metric_fn(result_data, expected_data)
698
+ )
699
+
700
+ if options:
701
+ return metric_fn(result_data, options)
702
+
703
+ return metric_fn(result_data)
704
+
705
+
706
+ # ---------------------------------------------------------------------------
707
+ # Main evaluation entry point
708
+ # ---------------------------------------------------------------------------
709
+
710
+
711
def evaluate(evaluator_config: dict) -> dict:
    """Run evaluation and return {"score": float, "details": str}.

    Stdout is redirected to stderr during metric and getter execution so
    that upstream debug ``print()`` calls (42 across the ported evaluator
    modules) do not corrupt the JSON output the scorer parses.
    """
    saved_stdout = sys.stdout
    try:
        sys.stdout = sys.stderr
        return _evaluate_inner(evaluator_config)
    finally:
        # Always restore, even if evaluation raised.
        sys.stdout = saved_stdout
724
+
725
+
726
def _evaluate_inner(evaluator_config: dict) -> dict:
    """Actual evaluation logic, called with stdout redirected.

    Runs postconfig steps, then evaluates each metric in "func" against its
    result/expected configs. "conj" controls aggregation: "or" takes the
    max score, anything else averages the scores collected so far; with
    conj == "and" the loop stops at the first zero/failed metric, so the
    average covers only the metrics evaluated up to that point.

    Getter results that are paths under the temp dir are deleted after
    evaluation (they are treated as getter-created scratch files).
    """
    run_postconfig(evaluator_config.get("postconfig", []))

    func_names = evaluator_config.get("func", "")
    # Only the exact string "infeasible" is special-cased; NOTE(review): a
    # list containing "infeasible" among other funcs falls through to the
    # registry lookup — confirm the dataset never encodes it that way.
    if func_names == "infeasible":
        return {"score": 0.0, "details": "Task marked as infeasible"}

    if isinstance(func_names, str):
        func_names = [func_names]

    result_configs = evaluator_config.get("result", {})
    expected_configs = evaluator_config.get("expected", {})

    # A single (non-list) config is shared across all funcs.
    if not isinstance(result_configs, list):
        result_configs = [result_configs] * len(func_names)
    if not isinstance(expected_configs, list):
        expected_configs = [expected_configs] * len(func_names)

    conj = evaluator_config.get("conj", "and")
    scores: list[float] = []
    details: list[str] = []
    temp_files: list[str] = []  # getter outputs to delete afterwards

    try:
        for i, func_name in enumerate(func_names):
            metric_fn = METRIC_REGISTRY.get(func_name)
            if metric_fn is None:
                details.append(f"{func_name}: not implemented")
                scores.append(0.0)
                if conj == "and":
                    break
                continue

            # Fall back to the first config if the lists are shorter than
            # func_names.
            result_cfg = (
                result_configs[i] if i < len(result_configs) else result_configs[0]
            )
            result_data = get_data(result_cfg)

            # Getter returned a path into the temp dir: mark for cleanup.
            if isinstance(result_data, str) and os.path.exists(result_data):
                if result_data.startswith(tempfile.gettempdir()):
                    temp_files.append(result_data)

            if result_data is None:
                details.append(f"{func_name}: result unavailable")
                scores.append(0.0)
                if conj == "and":
                    break
                continue

            expected_cfg = (
                expected_configs[i]
                if i < len(expected_configs)
                else expected_configs[0]
            )
            expected_data = None
            # Rule-style configs are passed through to call_metric as-is
            # rather than resolved via a getter.
            if expected_cfg and expected_cfg.get("type") not in ("rule", "rules", None):
                expected_data = get_data(expected_cfg)
                if isinstance(expected_data, str) and os.path.exists(expected_data):
                    if expected_data.startswith(tempfile.gettempdir()):
                        temp_files.append(expected_data)

            # "options" may be a per-func list or a single shared dict.
            options = evaluator_config.get("options", {})
            if isinstance(options, list):
                opts = options[i] if i < len(options) else {}
            else:
                opts = options

            try:
                score = call_metric(
                    metric_fn, func_name, result_data, expected_data, expected_cfg, opts
                )
            except Exception as e:
                # A raising metric scores 0; under "and" this ends the run.
                score = 0.0
                details.append(f"{func_name}: error — {e}")
                scores.append(float(score))
                if conj == "and":
                    break
                continue

            scores.append(float(score))
            details.append(f"{func_name}: {score:.2f}")

            # Short-circuit: under "and", one zero makes the average moot.
            if conj == "and" and score == 0.0:
                break

    finally:
        # Best-effort cleanup of getter scratch files, even on unexpected
        # exceptions.
        for f in temp_files:
            try:
                os.unlink(f)
            except OSError:
                pass

    if not scores:
        return {"score": 0.0, "details": "No metrics evaluated"}

    final_score = max(scores) if conj == "or" else sum(scores) / len(scores)
    return {"score": final_score, "details": " | ".join(details)}
824
+
825
+
826
+ # ---------------------------------------------------------------------------
827
+ # CLI entry point
828
+ # ---------------------------------------------------------------------------
829
+
830
if __name__ == "__main__":
    # CLI contract: argv[1] is the JSON-encoded evaluator config; the
    # result dict is printed as JSON on stdout for the host-side scorer.
    if len(sys.argv) < 2:
        print(json.dumps({"score": 0.0, "details": "No evaluator config provided"}))
        sys.exit(1)

    evaluator_config = json.loads(sys.argv[1])
    print(json.dumps(evaluate(evaluator_config)))