Upload PolyGuard training artifacts: outputs/reports
Browse files- outputs/reports/anti_hacking_overfit_report.json +22 -0
- outputs/reports/baselines.json +119 -0
- outputs/reports/benchmark_report.json +52 -0
- outputs/reports/benchmark_report.txt +52 -0
- outputs/reports/hf_sweep_summary.json +76 -0
- outputs/reports/hf_training_status.json +7 -2
- outputs/reports/improvement_report.json +19 -0
- outputs/reports/inference_benchmark.json +43 -0
- outputs/reports/plot_index.json +9 -0
- outputs/reports/robustness.json +10 -0
outputs/reports/anti_hacking_overfit_report.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"passed": false,
|
| 3 |
+
"training_mode": "full",
|
| 4 |
+
"warnings": [
|
| 5 |
+
"Qwen2.5-3B:high_exploit_rate"
|
| 6 |
+
],
|
| 7 |
+
"completed_models": [
|
| 8 |
+
"Qwen/Qwen2.5-3B-Instruct"
|
| 9 |
+
],
|
| 10 |
+
"failed_or_skipped_models": [],
|
| 11 |
+
"checks": {
|
| 12 |
+
"reward_bounds": [
|
| 13 |
+
0.001,
|
| 14 |
+
0.999
|
| 15 |
+
],
|
| 16 |
+
"reward_precision": 3,
|
| 17 |
+
"fallback_backends_rejected": true,
|
| 18 |
+
"exploit_rate_threshold": 0.35,
|
| 19 |
+
"train_holdout_gap_threshold": 0.25,
|
| 20 |
+
"min_validity_rate": 0.8
|
| 21 |
+
}
|
| 22 |
+
}
|
outputs/reports/baselines.json
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"no_change": {
|
| 3 |
+
"mode": "REGIMEN_OPT",
|
| 4 |
+
"action_type": "KEEP_REGIMEN",
|
| 5 |
+
"target_drug": null,
|
| 6 |
+
"replacement_drug": null,
|
| 7 |
+
"dose_bucket": "NA",
|
| 8 |
+
"taper_days": null,
|
| 9 |
+
"monitoring_plan": null,
|
| 10 |
+
"evidence_query": null,
|
| 11 |
+
"new_drug_name": null,
|
| 12 |
+
"candidate_components": [],
|
| 13 |
+
"candidate_id": "cand_01",
|
| 14 |
+
"confidence": 0.8,
|
| 15 |
+
"rationale_brief": "Baseline no-change policy."
|
| 16 |
+
},
|
| 17 |
+
"rules_only": {
|
| 18 |
+
"mode": "REGIMEN_OPT",
|
| 19 |
+
"action_type": "SUBSTITUTE_WITHIN_CLASS",
|
| 20 |
+
"target_drug": "opioid_like",
|
| 21 |
+
"replacement_drug": "non_opioid_analgesic",
|
| 22 |
+
"dose_bucket": "NA",
|
| 23 |
+
"taper_days": null,
|
| 24 |
+
"monitoring_plan": null,
|
| 25 |
+
"evidence_query": null,
|
| 26 |
+
"new_drug_name": null,
|
| 27 |
+
"candidate_components": [],
|
| 28 |
+
"candidate_id": "cand_04",
|
| 29 |
+
"confidence": 0.75,
|
| 30 |
+
"rationale_brief": "Rules-only selected top legal candidate."
|
| 31 |
+
},
|
| 32 |
+
"greedy": {
|
| 33 |
+
"mode": "REGIMEN_OPT",
|
| 34 |
+
"action_type": "SUBSTITUTE_WITHIN_CLASS",
|
| 35 |
+
"target_drug": "opioid_like",
|
| 36 |
+
"replacement_drug": "non_opioid_analgesic",
|
| 37 |
+
"dose_bucket": "NA",
|
| 38 |
+
"taper_days": null,
|
| 39 |
+
"monitoring_plan": null,
|
| 40 |
+
"evidence_query": null,
|
| 41 |
+
"new_drug_name": null,
|
| 42 |
+
"candidate_components": [],
|
| 43 |
+
"candidate_id": "cand_04",
|
| 44 |
+
"confidence": 0.72,
|
| 45 |
+
"rationale_brief": "Greedy safety/burden improvement baseline."
|
| 46 |
+
},
|
| 47 |
+
"contextual_bandit": {
|
| 48 |
+
"mode": "REGIMEN_OPT",
|
| 49 |
+
"action_type": "SUBSTITUTE_WITHIN_CLASS",
|
| 50 |
+
"target_drug": "opioid_like",
|
| 51 |
+
"replacement_drug": "non_opioid_analgesic",
|
| 52 |
+
"dose_bucket": "NA",
|
| 53 |
+
"taper_days": null,
|
| 54 |
+
"monitoring_plan": null,
|
| 55 |
+
"evidence_query": null,
|
| 56 |
+
"new_drug_name": null,
|
| 57 |
+
"candidate_components": [],
|
| 58 |
+
"candidate_id": "cand_04",
|
| 59 |
+
"confidence": 0.68,
|
| 60 |
+
"rationale_brief": "Contextual bandit selected candidate."
|
| 61 |
+
},
|
| 62 |
+
"contextual_bandit_topk": [
|
| 63 |
+
{
|
| 64 |
+
"candidate_id": "cand_09",
|
| 65 |
+
"score": 1.1532307878304324,
|
| 66 |
+
"exploration_bonus": 1.1532307878304324,
|
| 67 |
+
"algorithm": "linucb"
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"candidate_id": "cand_10",
|
| 71 |
+
"score": 1.1489735636645433,
|
| 72 |
+
"exploration_bonus": 1.1489735636645433,
|
| 73 |
+
"algorithm": "linucb"
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"candidate_id": "cand_08",
|
| 77 |
+
"score": 1.1447401451857973,
|
| 78 |
+
"exploration_bonus": 1.1447401451857973,
|
| 79 |
+
"algorithm": "linucb"
|
| 80 |
+
}
|
| 81 |
+
],
|
| 82 |
+
"beam_search": {
|
| 83 |
+
"mode": "REGIMEN_OPT",
|
| 84 |
+
"action_type": "SUBSTITUTE_WITHIN_CLASS",
|
| 85 |
+
"target_drug": "opioid_like",
|
| 86 |
+
"replacement_drug": "non_opioid_analgesic",
|
| 87 |
+
"dose_bucket": "NA",
|
| 88 |
+
"taper_days": null,
|
| 89 |
+
"monitoring_plan": null,
|
| 90 |
+
"evidence_query": null,
|
| 91 |
+
"new_drug_name": null,
|
| 92 |
+
"candidate_components": [],
|
| 93 |
+
"candidate_id": "cand_04",
|
| 94 |
+
"confidence": 0.74,
|
| 95 |
+
"rationale_brief": "Beam-search(3) top candidate."
|
| 96 |
+
},
|
| 97 |
+
"baseline_policy": "no_change_candidate",
|
| 98 |
+
"episodes": 8,
|
| 99 |
+
"avg_reward": 0.747,
|
| 100 |
+
"legality_rate": 1.0,
|
| 101 |
+
"success_rate": 0.0,
|
| 102 |
+
"policy_stack_ablations": {
|
| 103 |
+
"bandit-only": {
|
| 104 |
+
"avg_reward": 0.7616666666666667,
|
| 105 |
+
"legality_rate": 1.0,
|
| 106 |
+
"steps": 3.0
|
| 107 |
+
},
|
| 108 |
+
"llm-only": {
|
| 109 |
+
"avg_reward": 0.7753333333333333,
|
| 110 |
+
"legality_rate": 1.0,
|
| 111 |
+
"steps": 3.0
|
| 112 |
+
},
|
| 113 |
+
"llm+bandit": {
|
| 114 |
+
"avg_reward": 0.7753333333333333,
|
| 115 |
+
"legality_rate": 1.0,
|
| 116 |
+
"steps": 3.0
|
| 117 |
+
}
|
| 118 |
+
}
|
| 119 |
+
}
|
outputs/reports/benchmark_report.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"offline_policy_eval": {
|
| 3 |
+
"avg_reward": 0.772833,
|
| 4 |
+
"legal_rate": 1.0,
|
| 5 |
+
"success_rate": 0.0
|
| 6 |
+
},
|
| 7 |
+
"safety_eval": {
|
| 8 |
+
"severe_violation_rate": 0.0,
|
| 9 |
+
"illegal_step_rate": 0.0
|
| 10 |
+
},
|
| 11 |
+
"dosing_eval": {
|
| 12 |
+
"target_attainment": 0.75,
|
| 13 |
+
"toxicity_avoidance": 1.0
|
| 14 |
+
},
|
| 15 |
+
"robustness_eval": {
|
| 16 |
+
"missing_labs_safety_rate": 0.666667,
|
| 17 |
+
"noisy_dose_info_safety_rate": 1.0,
|
| 18 |
+
"conflicting_meds_safety_rate": 1.0,
|
| 19 |
+
"alias_noise_safety_rate": 1.0,
|
| 20 |
+
"hidden_duplicate_detection_rate": 1.0,
|
| 21 |
+
"wrong_candidate_id_resilience": 1.0,
|
| 22 |
+
"stale_evidence_safety_rate": 1.0,
|
| 23 |
+
"delayed_ade_manifestation_safety_rate": 1.0
|
| 24 |
+
},
|
| 25 |
+
"calibration_eval": {
|
| 26 |
+
"ece_proxy": 0.08625
|
| 27 |
+
},
|
| 28 |
+
"abstention_eval": {
|
| 29 |
+
"appropriate_abstention_rate": 0.0
|
| 30 |
+
},
|
| 31 |
+
"process_eval": {
|
| 32 |
+
"process_fidelity": 0.92,
|
| 33 |
+
"avg_invalid_actions": 0.333333
|
| 34 |
+
},
|
| 35 |
+
"subgroup_eval": {
|
| 36 |
+
"renal_compromise": {
|
| 37 |
+
"avg_reward": 0.774,
|
| 38 |
+
"legal_rate": 1.0
|
| 39 |
+
},
|
| 40 |
+
"hepatic_compromise": {
|
| 41 |
+
"avg_reward": 0.779333,
|
| 42 |
+
"legal_rate": 1.0
|
| 43 |
+
},
|
| 44 |
+
"frail": {
|
| 45 |
+
"avg_reward": 0.781667,
|
| 46 |
+
"legal_rate": 1.0
|
| 47 |
+
}
|
| 48 |
+
},
|
| 49 |
+
"explainability_eval": {
|
| 50 |
+
"grounding_rate": 0.8
|
| 51 |
+
}
|
| 52 |
+
}
|
outputs/reports/benchmark_report.txt
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"offline_policy_eval": {
|
| 3 |
+
"avg_reward": 0.772833,
|
| 4 |
+
"legal_rate": 1.0,
|
| 5 |
+
"success_rate": 0.0
|
| 6 |
+
},
|
| 7 |
+
"safety_eval": {
|
| 8 |
+
"severe_violation_rate": 0.0,
|
| 9 |
+
"illegal_step_rate": 0.0
|
| 10 |
+
},
|
| 11 |
+
"dosing_eval": {
|
| 12 |
+
"target_attainment": 0.75,
|
| 13 |
+
"toxicity_avoidance": 1.0
|
| 14 |
+
},
|
| 15 |
+
"robustness_eval": {
|
| 16 |
+
"missing_labs_safety_rate": 0.666667,
|
| 17 |
+
"noisy_dose_info_safety_rate": 1.0,
|
| 18 |
+
"conflicting_meds_safety_rate": 1.0,
|
| 19 |
+
"alias_noise_safety_rate": 1.0,
|
| 20 |
+
"hidden_duplicate_detection_rate": 1.0,
|
| 21 |
+
"wrong_candidate_id_resilience": 1.0,
|
| 22 |
+
"stale_evidence_safety_rate": 1.0,
|
| 23 |
+
"delayed_ade_manifestation_safety_rate": 1.0
|
| 24 |
+
},
|
| 25 |
+
"calibration_eval": {
|
| 26 |
+
"ece_proxy": 0.08625
|
| 27 |
+
},
|
| 28 |
+
"abstention_eval": {
|
| 29 |
+
"appropriate_abstention_rate": 0.0
|
| 30 |
+
},
|
| 31 |
+
"process_eval": {
|
| 32 |
+
"process_fidelity": 0.92,
|
| 33 |
+
"avg_invalid_actions": 0.333333
|
| 34 |
+
},
|
| 35 |
+
"subgroup_eval": {
|
| 36 |
+
"renal_compromise": {
|
| 37 |
+
"avg_reward": 0.774,
|
| 38 |
+
"legal_rate": 1.0
|
| 39 |
+
},
|
| 40 |
+
"hepatic_compromise": {
|
| 41 |
+
"avg_reward": 0.779333,
|
| 42 |
+
"legal_rate": 1.0
|
| 43 |
+
},
|
| 44 |
+
"frail": {
|
| 45 |
+
"avg_reward": 0.781667,
|
| 46 |
+
"legal_rate": 1.0
|
| 47 |
+
}
|
| 48 |
+
},
|
| 49 |
+
"explainability_eval": {
|
| 50 |
+
"grounding_rate": 0.8
|
| 51 |
+
}
|
| 52 |
+
}
|
outputs/reports/hf_sweep_summary.json
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"status": "ok",
|
| 3 |
+
"training_mode": "full",
|
| 4 |
+
"completed_models": 1,
|
| 5 |
+
"failed_or_skipped_models": 0,
|
| 6 |
+
"models": [
|
| 7 |
+
{
|
| 8 |
+
"run_id": "qwen-qwen2-5-3b-instruct",
|
| 9 |
+
"training_mode": "full",
|
| 10 |
+
"model_id": "Qwen/Qwen2.5-3B-Instruct",
|
| 11 |
+
"label": "Qwen2.5-3B",
|
| 12 |
+
"status": "completed",
|
| 13 |
+
"error": "",
|
| 14 |
+
"sft_backend": "trl_transformers",
|
| 15 |
+
"sft_examples": 2000,
|
| 16 |
+
"sft_train_loss": 0.15688225453009363,
|
| 17 |
+
"sft_runtime": 715.2908,
|
| 18 |
+
"grpo_backend": "trl_transformers",
|
| 19 |
+
"grpo_records": 2000,
|
| 20 |
+
"grpo_avg_reward": 0.767,
|
| 21 |
+
"sft_inference_reward": 0.781,
|
| 22 |
+
"sft_valid_rate": 1.0,
|
| 23 |
+
"sft_latency_seconds": 2.863,
|
| 24 |
+
"grpo_inference_reward": 0.726,
|
| 25 |
+
"grpo_valid_rate": 1.0,
|
| 26 |
+
"grpo_latency_seconds": 3.681,
|
| 27 |
+
"train_holdout_gap": 0.041,
|
| 28 |
+
"fallback_detected": false,
|
| 29 |
+
"reward_range_ok": true,
|
| 30 |
+
"reward_range_failures": [],
|
| 31 |
+
"exploit_rate": 0.411,
|
| 32 |
+
"legal_rate": 0.93,
|
| 33 |
+
"candidate_diversity": 0.003,
|
| 34 |
+
"top_candidate_rate": 0.668,
|
| 35 |
+
"reward_components": {
|
| 36 |
+
"format_compliance_score": 0.999,
|
| 37 |
+
"candidate_alignment_score": 0.999,
|
| 38 |
+
"legality_score": 0.929,
|
| 39 |
+
"safety_delta_score": 0.497,
|
| 40 |
+
"burden_improvement_score": 0.469,
|
| 41 |
+
"disease_stability_score": 0.861,
|
| 42 |
+
"dosing_quality_score": 0.526,
|
| 43 |
+
"abstention_quality_score": 0.56,
|
| 44 |
+
"efficiency_score": 0.849,
|
| 45 |
+
"process_fidelity_score": 0.856,
|
| 46 |
+
"explanation_grounding_score": 0.795,
|
| 47 |
+
"anti_cheat_score": 0.589,
|
| 48 |
+
"uncertainty_calibration_score": 0.747
|
| 49 |
+
},
|
| 50 |
+
"primary_reward_channels": {
|
| 51 |
+
"safety_legality": 0.816,
|
| 52 |
+
"clinical_improvement": 0.609,
|
| 53 |
+
"dosing_quality": 0.543,
|
| 54 |
+
"process_integrity": 0.875
|
| 55 |
+
},
|
| 56 |
+
"artifact_paths": {
|
| 57 |
+
"sft": "/app/checkpoints/sweeps/qwen-qwen2-5-3b-instruct/sft_adapter",
|
| 58 |
+
"grpo": "/app/checkpoints/sweeps/qwen-qwen2-5-3b-instruct/grpo_adapter"
|
| 59 |
+
}
|
| 60 |
+
}
|
| 61 |
+
],
|
| 62 |
+
"charts": {
|
| 63 |
+
"sft_vs_grpo_reward": "outputs/plots/sft_vs_grpo_reward.png",
|
| 64 |
+
"sft_loss_curves": "outputs/plots/sft_loss_curves.png",
|
| 65 |
+
"qwen_model_sft_reward": "outputs/plots/qwen_model_sft_reward.png",
|
| 66 |
+
"qwen_model_sft_loss": "outputs/plots/qwen_model_sft_loss.png",
|
| 67 |
+
"sft_validity_reward": "outputs/plots/sft_validity_reward.png",
|
| 68 |
+
"grpo_reward_curves": "outputs/plots/grpo_reward_curves.png",
|
| 69 |
+
"qwen_model_grpo_reward": "outputs/plots/qwen_model_grpo_reward.png",
|
| 70 |
+
"reward_component_bars": "outputs/plots/reward_component_bars.png",
|
| 71 |
+
"anti_cheat_failure_rates": "outputs/plots/anti_cheat_failure_rates.png",
|
| 72 |
+
"train_holdout_gap": "outputs/plots/train_holdout_gap.png",
|
| 73 |
+
"inference_validity_reward": "outputs/plots/inference_validity_reward.png",
|
| 74 |
+
"inference_latency_validity": "outputs/plots/inference_latency_validity.png"
|
| 75 |
+
}
|
| 76 |
+
}
|
outputs/reports/hf_training_status.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
-
"status": "
|
| 3 |
"started_at": 1777180786.0648105,
|
| 4 |
-
"finished_at":
|
| 5 |
"commands": [
|
| 6 |
{
|
| 7 |
"args": [
|
|
@@ -218,5 +218,10 @@
|
|
| 218 |
"training_mode": "full",
|
| 219 |
"model_sweep": [
|
| 220 |
"Qwen/Qwen2.5-3B-Instruct"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
]
|
| 222 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"status": "ok",
|
| 3 |
"started_at": 1777180786.0648105,
|
| 4 |
+
"finished_at": 1777188659.441074,
|
| 5 |
"commands": [
|
| 6 |
{
|
| 7 |
"args": [
|
|
|
|
| 218 |
"training_mode": "full",
|
| 219 |
"model_sweep": [
|
| 220 |
"Qwen/Qwen2.5-3B-Instruct"
|
| 221 |
+
],
|
| 222 |
+
"improved": true,
|
| 223 |
+
"anti_hacking_passed": false,
|
| 224 |
+
"completed_run_ids": [
|
| 225 |
+
"qwen-qwen2-5-3b-instruct"
|
| 226 |
]
|
| 227 |
}
|
outputs/reports/improvement_report.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"status": "ok",
|
| 3 |
+
"baseline": "outputs/reports/baselines.json",
|
| 4 |
+
"candidate": "outputs/reports/benchmark_report.json",
|
| 5 |
+
"deltas": {
|
| 6 |
+
"avg_reward": 0.025833,
|
| 7 |
+
"legality_rate": 0.0,
|
| 8 |
+
"success_rate": 0.0,
|
| 9 |
+
"avg_process_fidelity": 0.92,
|
| 10 |
+
"timeout_rate": 0.0,
|
| 11 |
+
"failure_visible_rate": 0.0
|
| 12 |
+
},
|
| 13 |
+
"gate": {
|
| 14 |
+
"avg_reward_up": true,
|
| 15 |
+
"legality_up": true,
|
| 16 |
+
"success_up": true
|
| 17 |
+
},
|
| 18 |
+
"improved": true
|
| 19 |
+
}
|
outputs/reports/inference_benchmark.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"status": "ok",
|
| 3 |
+
"runs": [
|
| 4 |
+
{
|
| 5 |
+
"run": 0,
|
| 6 |
+
"provider": "transformers_ranker_fallback",
|
| 7 |
+
"candidate_id": "cand_04",
|
| 8 |
+
"latency_ms": 1751.989,
|
| 9 |
+
"rationale": "Transformers fallback selected cand_04 via local ranker; active_model_enabled=False; active_model_available=False."
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"run": 1,
|
| 13 |
+
"provider": "transformers_ranker_fallback",
|
| 14 |
+
"candidate_id": "cand_02",
|
| 15 |
+
"latency_ms": 0.166,
|
| 16 |
+
"rationale": "Transformers fallback selected cand_02 via local ranker; active_model_enabled=False; active_model_available=False."
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"run": 2,
|
| 20 |
+
"provider": "transformers_ranker_fallback",
|
| 21 |
+
"candidate_id": "cand_04",
|
| 22 |
+
"latency_ms": 0.157,
|
| 23 |
+
"rationale": "Transformers fallback selected cand_04 via local ranker; active_model_enabled=False; active_model_available=False."
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"run": 3,
|
| 27 |
+
"provider": "transformers_ranker_fallback",
|
| 28 |
+
"candidate_id": "cand_04",
|
| 29 |
+
"latency_ms": 0.164,
|
| 30 |
+
"rationale": "Transformers fallback selected cand_04 via local ranker; active_model_enabled=False; active_model_available=False."
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"run": 4,
|
| 34 |
+
"provider": "transformers_ranker_fallback",
|
| 35 |
+
"candidate_id": "cand_04",
|
| 36 |
+
"latency_ms": 0.153,
|
| 37 |
+
"rationale": "Transformers fallback selected cand_04 via local ranker; active_model_enabled=False; active_model_available=False."
|
| 38 |
+
}
|
| 39 |
+
],
|
| 40 |
+
"avg_latency_ms": 350.526,
|
| 41 |
+
"provider_requested": "transformers",
|
| 42 |
+
"model": "Qwen/Qwen2.5-0.5B-Instruct"
|
| 43 |
+
}
|
outputs/reports/plot_index.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"plots": [
|
| 3 |
+
"/app/outputs/plots/avg_reward.png",
|
| 4 |
+
"/app/outputs/plots/legality_rate.png",
|
| 5 |
+
"/app/outputs/plots/success_rate.png",
|
| 6 |
+
"/app/outputs/plots/avg_process_fidelity.png",
|
| 7 |
+
"/app/outputs/plots/policy_stack_avg_reward.png"
|
| 8 |
+
]
|
| 9 |
+
}
|
outputs/reports/robustness.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"missing_labs_safety_rate": 0.666667,
|
| 3 |
+
"noisy_dose_info_safety_rate": 1.0,
|
| 4 |
+
"conflicting_meds_safety_rate": 1.0,
|
| 5 |
+
"alias_noise_safety_rate": 1.0,
|
| 6 |
+
"hidden_duplicate_detection_rate": 1.0,
|
| 7 |
+
"wrong_candidate_id_resilience": 1.0,
|
| 8 |
+
"stale_evidence_safety_rate": 1.0,
|
| 9 |
+
"delayed_ade_manifestation_safety_rate": 1.0
|
| 10 |
+
}
|