adithya9903 committed on
Commit
3b949a1
·
verified ·
1 Parent(s): afa083e

Upload PolyGuard training artifacts: outputs/reports

Browse files
outputs/reports/anti_hacking_overfit_report.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "passed": false,
3
+ "training_mode": "full",
4
+ "warnings": [
5
+ "Qwen2.5-3B:high_exploit_rate"
6
+ ],
7
+ "completed_models": [
8
+ "Qwen/Qwen2.5-3B-Instruct"
9
+ ],
10
+ "failed_or_skipped_models": [],
11
+ "checks": {
12
+ "reward_bounds": [
13
+ 0.001,
14
+ 0.999
15
+ ],
16
+ "reward_precision": 3,
17
+ "fallback_backends_rejected": true,
18
+ "exploit_rate_threshold": 0.35,
19
+ "train_holdout_gap_threshold": 0.25,
20
+ "min_validity_rate": 0.8
21
+ }
22
+ }
outputs/reports/baselines.json ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "no_change": {
3
+ "mode": "REGIMEN_OPT",
4
+ "action_type": "KEEP_REGIMEN",
5
+ "target_drug": null,
6
+ "replacement_drug": null,
7
+ "dose_bucket": "NA",
8
+ "taper_days": null,
9
+ "monitoring_plan": null,
10
+ "evidence_query": null,
11
+ "new_drug_name": null,
12
+ "candidate_components": [],
13
+ "candidate_id": "cand_01",
14
+ "confidence": 0.8,
15
+ "rationale_brief": "Baseline no-change policy."
16
+ },
17
+ "rules_only": {
18
+ "mode": "REGIMEN_OPT",
19
+ "action_type": "SUBSTITUTE_WITHIN_CLASS",
20
+ "target_drug": "opioid_like",
21
+ "replacement_drug": "non_opioid_analgesic",
22
+ "dose_bucket": "NA",
23
+ "taper_days": null,
24
+ "monitoring_plan": null,
25
+ "evidence_query": null,
26
+ "new_drug_name": null,
27
+ "candidate_components": [],
28
+ "candidate_id": "cand_04",
29
+ "confidence": 0.75,
30
+ "rationale_brief": "Rules-only selected top legal candidate."
31
+ },
32
+ "greedy": {
33
+ "mode": "REGIMEN_OPT",
34
+ "action_type": "SUBSTITUTE_WITHIN_CLASS",
35
+ "target_drug": "opioid_like",
36
+ "replacement_drug": "non_opioid_analgesic",
37
+ "dose_bucket": "NA",
38
+ "taper_days": null,
39
+ "monitoring_plan": null,
40
+ "evidence_query": null,
41
+ "new_drug_name": null,
42
+ "candidate_components": [],
43
+ "candidate_id": "cand_04",
44
+ "confidence": 0.72,
45
+ "rationale_brief": "Greedy safety/burden improvement baseline."
46
+ },
47
+ "contextual_bandit": {
48
+ "mode": "REGIMEN_OPT",
49
+ "action_type": "SUBSTITUTE_WITHIN_CLASS",
50
+ "target_drug": "opioid_like",
51
+ "replacement_drug": "non_opioid_analgesic",
52
+ "dose_bucket": "NA",
53
+ "taper_days": null,
54
+ "monitoring_plan": null,
55
+ "evidence_query": null,
56
+ "new_drug_name": null,
57
+ "candidate_components": [],
58
+ "candidate_id": "cand_04",
59
+ "confidence": 0.68,
60
+ "rationale_brief": "Contextual bandit selected candidate."
61
+ },
62
+ "contextual_bandit_topk": [
63
+ {
64
+ "candidate_id": "cand_09",
65
+ "score": 1.1532307878304324,
66
+ "exploration_bonus": 1.1532307878304324,
67
+ "algorithm": "linucb"
68
+ },
69
+ {
70
+ "candidate_id": "cand_10",
71
+ "score": 1.1489735636645433,
72
+ "exploration_bonus": 1.1489735636645433,
73
+ "algorithm": "linucb"
74
+ },
75
+ {
76
+ "candidate_id": "cand_08",
77
+ "score": 1.1447401451857973,
78
+ "exploration_bonus": 1.1447401451857973,
79
+ "algorithm": "linucb"
80
+ }
81
+ ],
82
+ "beam_search": {
83
+ "mode": "REGIMEN_OPT",
84
+ "action_type": "SUBSTITUTE_WITHIN_CLASS",
85
+ "target_drug": "opioid_like",
86
+ "replacement_drug": "non_opioid_analgesic",
87
+ "dose_bucket": "NA",
88
+ "taper_days": null,
89
+ "monitoring_plan": null,
90
+ "evidence_query": null,
91
+ "new_drug_name": null,
92
+ "candidate_components": [],
93
+ "candidate_id": "cand_04",
94
+ "confidence": 0.74,
95
+ "rationale_brief": "Beam-search(3) top candidate."
96
+ },
97
+ "baseline_policy": "no_change_candidate",
98
+ "episodes": 8,
99
+ "avg_reward": 0.747,
100
+ "legality_rate": 1.0,
101
+ "success_rate": 0.0,
102
+ "policy_stack_ablations": {
103
+ "bandit-only": {
104
+ "avg_reward": 0.7616666666666667,
105
+ "legality_rate": 1.0,
106
+ "steps": 3.0
107
+ },
108
+ "llm-only": {
109
+ "avg_reward": 0.7753333333333333,
110
+ "legality_rate": 1.0,
111
+ "steps": 3.0
112
+ },
113
+ "llm+bandit": {
114
+ "avg_reward": 0.7753333333333333,
115
+ "legality_rate": 1.0,
116
+ "steps": 3.0
117
+ }
118
+ }
119
+ }
outputs/reports/benchmark_report.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "offline_policy_eval": {
3
+ "avg_reward": 0.772833,
4
+ "legal_rate": 1.0,
5
+ "success_rate": 0.0
6
+ },
7
+ "safety_eval": {
8
+ "severe_violation_rate": 0.0,
9
+ "illegal_step_rate": 0.0
10
+ },
11
+ "dosing_eval": {
12
+ "target_attainment": 0.75,
13
+ "toxicity_avoidance": 1.0
14
+ },
15
+ "robustness_eval": {
16
+ "missing_labs_safety_rate": 0.666667,
17
+ "noisy_dose_info_safety_rate": 1.0,
18
+ "conflicting_meds_safety_rate": 1.0,
19
+ "alias_noise_safety_rate": 1.0,
20
+ "hidden_duplicate_detection_rate": 1.0,
21
+ "wrong_candidate_id_resilience": 1.0,
22
+ "stale_evidence_safety_rate": 1.0,
23
+ "delayed_ade_manifestation_safety_rate": 1.0
24
+ },
25
+ "calibration_eval": {
26
+ "ece_proxy": 0.08625
27
+ },
28
+ "abstention_eval": {
29
+ "appropriate_abstention_rate": 0.0
30
+ },
31
+ "process_eval": {
32
+ "process_fidelity": 0.92,
33
+ "avg_invalid_actions": 0.333333
34
+ },
35
+ "subgroup_eval": {
36
+ "renal_compromise": {
37
+ "avg_reward": 0.774,
38
+ "legal_rate": 1.0
39
+ },
40
+ "hepatic_compromise": {
41
+ "avg_reward": 0.779333,
42
+ "legal_rate": 1.0
43
+ },
44
+ "frail": {
45
+ "avg_reward": 0.781667,
46
+ "legal_rate": 1.0
47
+ }
48
+ },
49
+ "explainability_eval": {
50
+ "grounding_rate": 0.8
51
+ }
52
+ }
outputs/reports/benchmark_report.txt ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "offline_policy_eval": {
3
+ "avg_reward": 0.772833,
4
+ "legal_rate": 1.0,
5
+ "success_rate": 0.0
6
+ },
7
+ "safety_eval": {
8
+ "severe_violation_rate": 0.0,
9
+ "illegal_step_rate": 0.0
10
+ },
11
+ "dosing_eval": {
12
+ "target_attainment": 0.75,
13
+ "toxicity_avoidance": 1.0
14
+ },
15
+ "robustness_eval": {
16
+ "missing_labs_safety_rate": 0.666667,
17
+ "noisy_dose_info_safety_rate": 1.0,
18
+ "conflicting_meds_safety_rate": 1.0,
19
+ "alias_noise_safety_rate": 1.0,
20
+ "hidden_duplicate_detection_rate": 1.0,
21
+ "wrong_candidate_id_resilience": 1.0,
22
+ "stale_evidence_safety_rate": 1.0,
23
+ "delayed_ade_manifestation_safety_rate": 1.0
24
+ },
25
+ "calibration_eval": {
26
+ "ece_proxy": 0.08625
27
+ },
28
+ "abstention_eval": {
29
+ "appropriate_abstention_rate": 0.0
30
+ },
31
+ "process_eval": {
32
+ "process_fidelity": 0.92,
33
+ "avg_invalid_actions": 0.333333
34
+ },
35
+ "subgroup_eval": {
36
+ "renal_compromise": {
37
+ "avg_reward": 0.774,
38
+ "legal_rate": 1.0
39
+ },
40
+ "hepatic_compromise": {
41
+ "avg_reward": 0.779333,
42
+ "legal_rate": 1.0
43
+ },
44
+ "frail": {
45
+ "avg_reward": 0.781667,
46
+ "legal_rate": 1.0
47
+ }
48
+ },
49
+ "explainability_eval": {
50
+ "grounding_rate": 0.8
51
+ }
52
+ }
outputs/reports/hf_sweep_summary.json ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "status": "ok",
3
+ "training_mode": "full",
4
+ "completed_models": 1,
5
+ "failed_or_skipped_models": 0,
6
+ "models": [
7
+ {
8
+ "run_id": "qwen-qwen2-5-3b-instruct",
9
+ "training_mode": "full",
10
+ "model_id": "Qwen/Qwen2.5-3B-Instruct",
11
+ "label": "Qwen2.5-3B",
12
+ "status": "completed",
13
+ "error": "",
14
+ "sft_backend": "trl_transformers",
15
+ "sft_examples": 2000,
16
+ "sft_train_loss": 0.15688225453009363,
17
+ "sft_runtime": 715.2908,
18
+ "grpo_backend": "trl_transformers",
19
+ "grpo_records": 2000,
20
+ "grpo_avg_reward": 0.767,
21
+ "sft_inference_reward": 0.781,
22
+ "sft_valid_rate": 1.0,
23
+ "sft_latency_seconds": 2.863,
24
+ "grpo_inference_reward": 0.726,
25
+ "grpo_valid_rate": 1.0,
26
+ "grpo_latency_seconds": 3.681,
27
+ "train_holdout_gap": 0.041,
28
+ "fallback_detected": false,
29
+ "reward_range_ok": true,
30
+ "reward_range_failures": [],
31
+ "exploit_rate": 0.411,
32
+ "legal_rate": 0.93,
33
+ "candidate_diversity": 0.003,
34
+ "top_candidate_rate": 0.668,
35
+ "reward_components": {
36
+ "format_compliance_score": 0.999,
37
+ "candidate_alignment_score": 0.999,
38
+ "legality_score": 0.929,
39
+ "safety_delta_score": 0.497,
40
+ "burden_improvement_score": 0.469,
41
+ "disease_stability_score": 0.861,
42
+ "dosing_quality_score": 0.526,
43
+ "abstention_quality_score": 0.56,
44
+ "efficiency_score": 0.849,
45
+ "process_fidelity_score": 0.856,
46
+ "explanation_grounding_score": 0.795,
47
+ "anti_cheat_score": 0.589,
48
+ "uncertainty_calibration_score": 0.747
49
+ },
50
+ "primary_reward_channels": {
51
+ "safety_legality": 0.816,
52
+ "clinical_improvement": 0.609,
53
+ "dosing_quality": 0.543,
54
+ "process_integrity": 0.875
55
+ },
56
+ "artifact_paths": {
57
+ "sft": "/app/checkpoints/sweeps/qwen-qwen2-5-3b-instruct/sft_adapter",
58
+ "grpo": "/app/checkpoints/sweeps/qwen-qwen2-5-3b-instruct/grpo_adapter"
59
+ }
60
+ }
61
+ ],
62
+ "charts": {
63
+ "sft_vs_grpo_reward": "outputs/plots/sft_vs_grpo_reward.png",
64
+ "sft_loss_curves": "outputs/plots/sft_loss_curves.png",
65
+ "qwen_model_sft_reward": "outputs/plots/qwen_model_sft_reward.png",
66
+ "qwen_model_sft_loss": "outputs/plots/qwen_model_sft_loss.png",
67
+ "sft_validity_reward": "outputs/plots/sft_validity_reward.png",
68
+ "grpo_reward_curves": "outputs/plots/grpo_reward_curves.png",
69
+ "qwen_model_grpo_reward": "outputs/plots/qwen_model_grpo_reward.png",
70
+ "reward_component_bars": "outputs/plots/reward_component_bars.png",
71
+ "anti_cheat_failure_rates": "outputs/plots/anti_cheat_failure_rates.png",
72
+ "train_holdout_gap": "outputs/plots/train_holdout_gap.png",
73
+ "inference_validity_reward": "outputs/plots/inference_validity_reward.png",
74
+ "inference_latency_validity": "outputs/plots/inference_latency_validity.png"
75
+ }
76
+ }
outputs/reports/hf_training_status.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "status": "running",
3
  "started_at": 1777180786.0648105,
4
- "finished_at": null,
5
  "commands": [
6
  {
7
  "args": [
@@ -218,5 +218,10 @@
218
  "training_mode": "full",
219
  "model_sweep": [
220
  "Qwen/Qwen2.5-3B-Instruct"
 
 
 
 
 
221
  ]
222
  }
 
1
  {
2
+ "status": "ok",
3
  "started_at": 1777180786.0648105,
4
+ "finished_at": 1777188659.441074,
5
  "commands": [
6
  {
7
  "args": [
 
218
  "training_mode": "full",
219
  "model_sweep": [
220
  "Qwen/Qwen2.5-3B-Instruct"
221
+ ],
222
+ "improved": true,
223
+ "anti_hacking_passed": false,
224
+ "completed_run_ids": [
225
+ "qwen-qwen2-5-3b-instruct"
226
  ]
227
  }
outputs/reports/improvement_report.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "status": "ok",
3
+ "baseline": "outputs/reports/baselines.json",
4
+ "candidate": "outputs/reports/benchmark_report.json",
5
+ "deltas": {
6
+ "avg_reward": 0.025833,
7
+ "legality_rate": 0.0,
8
+ "success_rate": 0.0,
9
+ "avg_process_fidelity": 0.92,
10
+ "timeout_rate": 0.0,
11
+ "failure_visible_rate": 0.0
12
+ },
13
+ "gate": {
14
+ "avg_reward_up": true,
15
+ "legality_up": true,
16
+ "success_up": true
17
+ },
18
+ "improved": true
19
+ }
outputs/reports/inference_benchmark.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "status": "ok",
3
+ "runs": [
4
+ {
5
+ "run": 0,
6
+ "provider": "transformers_ranker_fallback",
7
+ "candidate_id": "cand_04",
8
+ "latency_ms": 1751.989,
9
+ "rationale": "Transformers fallback selected cand_04 via local ranker; active_model_enabled=False; active_model_available=False."
10
+ },
11
+ {
12
+ "run": 1,
13
+ "provider": "transformers_ranker_fallback",
14
+ "candidate_id": "cand_02",
15
+ "latency_ms": 0.166,
16
+ "rationale": "Transformers fallback selected cand_02 via local ranker; active_model_enabled=False; active_model_available=False."
17
+ },
18
+ {
19
+ "run": 2,
20
+ "provider": "transformers_ranker_fallback",
21
+ "candidate_id": "cand_04",
22
+ "latency_ms": 0.157,
23
+ "rationale": "Transformers fallback selected cand_04 via local ranker; active_model_enabled=False; active_model_available=False."
24
+ },
25
+ {
26
+ "run": 3,
27
+ "provider": "transformers_ranker_fallback",
28
+ "candidate_id": "cand_04",
29
+ "latency_ms": 0.164,
30
+ "rationale": "Transformers fallback selected cand_04 via local ranker; active_model_enabled=False; active_model_available=False."
31
+ },
32
+ {
33
+ "run": 4,
34
+ "provider": "transformers_ranker_fallback",
35
+ "candidate_id": "cand_04",
36
+ "latency_ms": 0.153,
37
+ "rationale": "Transformers fallback selected cand_04 via local ranker; active_model_enabled=False; active_model_available=False."
38
+ }
39
+ ],
40
+ "avg_latency_ms": 350.526,
41
+ "provider_requested": "transformers",
42
+ "model": "Qwen/Qwen2.5-0.5B-Instruct"
43
+ }
outputs/reports/plot_index.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "plots": [
3
+ "/app/outputs/plots/avg_reward.png",
4
+ "/app/outputs/plots/legality_rate.png",
5
+ "/app/outputs/plots/success_rate.png",
6
+ "/app/outputs/plots/avg_process_fidelity.png",
7
+ "/app/outputs/plots/policy_stack_avg_reward.png"
8
+ ]
9
+ }
outputs/reports/robustness.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "missing_labs_safety_rate": 0.666667,
3
+ "noisy_dose_info_safety_rate": 1.0,
4
+ "conflicting_meds_safety_rate": 1.0,
5
+ "alias_noise_safety_rate": 1.0,
6
+ "hidden_duplicate_detection_rate": 1.0,
7
+ "wrong_candidate_id_resilience": 1.0,
8
+ "stale_evidence_safety_rate": 1.0,
9
+ "delayed_ade_manifestation_safety_rate": 1.0
10
+ }