Upload PolyGuard training artifacts: outputs/reports

Browse files

Files changed (10) hide show

outputs/reports/anti_hacking_overfit_report.json +22 -0
outputs/reports/baselines.json +119 -0
outputs/reports/benchmark_report.json +52 -0
outputs/reports/benchmark_report.txt +52 -0
outputs/reports/hf_sweep_summary.json +76 -0
outputs/reports/hf_training_status.json +7 -2
outputs/reports/improvement_report.json +19 -0
outputs/reports/inference_benchmark.json +43 -0
outputs/reports/plot_index.json +9 -0
outputs/reports/robustness.json +10 -0

outputs/reports/anti_hacking_overfit_report.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "passed": false,
+  "training_mode": "full",
+  "warnings": [
+    "Qwen2.5-3B:high_exploit_rate"
+  ],
+  "completed_models": [
+    "Qwen/Qwen2.5-3B-Instruct"
+  ],
+  "failed_or_skipped_models": [],
+  "checks": {
+    "reward_bounds": [
+      0.001,
+      0.999
+    ],
+    "reward_precision": 3,
+    "fallback_backends_rejected": true,
+    "exploit_rate_threshold": 0.35,
+    "train_holdout_gap_threshold": 0.25,
+    "min_validity_rate": 0.8
+  }
+}

outputs/reports/baselines.json ADDED Viewed

	@@ -0,0 +1,119 @@

+{
+  "no_change": {
+    "mode": "REGIMEN_OPT",
+    "action_type": "KEEP_REGIMEN",
+    "target_drug": null,
+    "replacement_drug": null,
+    "dose_bucket": "NA",
+    "taper_days": null,
+    "monitoring_plan": null,
+    "evidence_query": null,
+    "new_drug_name": null,
+    "candidate_components": [],
+    "candidate_id": "cand_01",
+    "confidence": 0.8,
+    "rationale_brief": "Baseline no-change policy."
+  },
+  "rules_only": {
+    "mode": "REGIMEN_OPT",
+    "action_type": "SUBSTITUTE_WITHIN_CLASS",
+    "target_drug": "opioid_like",
+    "replacement_drug": "non_opioid_analgesic",
+    "dose_bucket": "NA",
+    "taper_days": null,
+    "monitoring_plan": null,
+    "evidence_query": null,
+    "new_drug_name": null,
+    "candidate_components": [],
+    "candidate_id": "cand_04",
+    "confidence": 0.75,
+    "rationale_brief": "Rules-only selected top legal candidate."
+  },
+  "greedy": {
+    "mode": "REGIMEN_OPT",
+    "action_type": "SUBSTITUTE_WITHIN_CLASS",
+    "target_drug": "opioid_like",
+    "replacement_drug": "non_opioid_analgesic",
+    "dose_bucket": "NA",
+    "taper_days": null,
+    "monitoring_plan": null,
+    "evidence_query": null,
+    "new_drug_name": null,
+    "candidate_components": [],
+    "candidate_id": "cand_04",
+    "confidence": 0.72,
+    "rationale_brief": "Greedy safety/burden improvement baseline."
+  },
+  "contextual_bandit": {
+    "mode": "REGIMEN_OPT",
+    "action_type": "SUBSTITUTE_WITHIN_CLASS",
+    "target_drug": "opioid_like",
+    "replacement_drug": "non_opioid_analgesic",
+    "dose_bucket": "NA",
+    "taper_days": null,
+    "monitoring_plan": null,
+    "evidence_query": null,
+    "new_drug_name": null,
+    "candidate_components": [],
+    "candidate_id": "cand_04",
+    "confidence": 0.68,
+    "rationale_brief": "Contextual bandit selected candidate."
+  },
+  "contextual_bandit_topk": [
+    {
+      "candidate_id": "cand_09",
+      "score": 1.1532307878304324,
+      "exploration_bonus": 1.1532307878304324,
+      "algorithm": "linucb"
+    },
+    {
+      "candidate_id": "cand_10",
+      "score": 1.1489735636645433,
+      "exploration_bonus": 1.1489735636645433,
+      "algorithm": "linucb"
+    },
+    {
+      "candidate_id": "cand_08",
+      "score": 1.1447401451857973,
+      "exploration_bonus": 1.1447401451857973,
+      "algorithm": "linucb"
+    }
+  ],
+  "beam_search": {
+    "mode": "REGIMEN_OPT",
+    "action_type": "SUBSTITUTE_WITHIN_CLASS",
+    "target_drug": "opioid_like",
+    "replacement_drug": "non_opioid_analgesic",
+    "dose_bucket": "NA",
+    "taper_days": null,
+    "monitoring_plan": null,
+    "evidence_query": null,
+    "new_drug_name": null,
+    "candidate_components": [],
+    "candidate_id": "cand_04",
+    "confidence": 0.74,
+    "rationale_brief": "Beam-search(3) top candidate."
+  },
+  "baseline_policy": "no_change_candidate",
+  "episodes": 8,
+  "avg_reward": 0.747,
+  "legality_rate": 1.0,
+  "success_rate": 0.0,
+  "policy_stack_ablations": {
+    "bandit-only": {
+      "avg_reward": 0.7616666666666667,
+      "legality_rate": 1.0,
+      "steps": 3.0
+    },
+    "llm-only": {
+      "avg_reward": 0.7753333333333333,
+      "legality_rate": 1.0,
+      "steps": 3.0
+    },
+    "llm+bandit": {
+      "avg_reward": 0.7753333333333333,
+      "legality_rate": 1.0,
+      "steps": 3.0
+    }
+  }
+}

outputs/reports/benchmark_report.json ADDED Viewed

	@@ -0,0 +1,52 @@

+{
+  "offline_policy_eval": {
+    "avg_reward": 0.772833,
+    "legal_rate": 1.0,
+    "success_rate": 0.0
+  },
+  "safety_eval": {
+    "severe_violation_rate": 0.0,
+    "illegal_step_rate": 0.0
+  },
+  "dosing_eval": {
+    "target_attainment": 0.75,
+    "toxicity_avoidance": 1.0
+  },
+  "robustness_eval": {
+    "missing_labs_safety_rate": 0.666667,
+    "noisy_dose_info_safety_rate": 1.0,
+    "conflicting_meds_safety_rate": 1.0,
+    "alias_noise_safety_rate": 1.0,
+    "hidden_duplicate_detection_rate": 1.0,
+    "wrong_candidate_id_resilience": 1.0,
+    "stale_evidence_safety_rate": 1.0,
+    "delayed_ade_manifestation_safety_rate": 1.0
+  },
+  "calibration_eval": {
+    "ece_proxy": 0.08625
+  },
+  "abstention_eval": {
+    "appropriate_abstention_rate": 0.0
+  },
+  "process_eval": {
+    "process_fidelity": 0.92,
+    "avg_invalid_actions": 0.333333
+  },
+  "subgroup_eval": {
+    "renal_compromise": {
+      "avg_reward": 0.774,
+      "legal_rate": 1.0
+    },
+    "hepatic_compromise": {
+      "avg_reward": 0.779333,
+      "legal_rate": 1.0
+    },
+    "frail": {
+      "avg_reward": 0.781667,
+      "legal_rate": 1.0
+    }
+  },
+  "explainability_eval": {
+    "grounding_rate": 0.8
+  }
+}

outputs/reports/benchmark_report.txt ADDED Viewed

	@@ -0,0 +1,52 @@

+{
+  "offline_policy_eval": {
+    "avg_reward": 0.772833,
+    "legal_rate": 1.0,
+    "success_rate": 0.0
+  },
+  "safety_eval": {
+    "severe_violation_rate": 0.0,
+    "illegal_step_rate": 0.0
+  },
+  "dosing_eval": {
+    "target_attainment": 0.75,
+    "toxicity_avoidance": 1.0
+  },
+  "robustness_eval": {
+    "missing_labs_safety_rate": 0.666667,
+    "noisy_dose_info_safety_rate": 1.0,
+    "conflicting_meds_safety_rate": 1.0,
+    "alias_noise_safety_rate": 1.0,
+    "hidden_duplicate_detection_rate": 1.0,
+    "wrong_candidate_id_resilience": 1.0,
+    "stale_evidence_safety_rate": 1.0,
+    "delayed_ade_manifestation_safety_rate": 1.0
+  },
+  "calibration_eval": {
+    "ece_proxy": 0.08625
+  },
+  "abstention_eval": {
+    "appropriate_abstention_rate": 0.0
+  },
+  "process_eval": {
+    "process_fidelity": 0.92,
+    "avg_invalid_actions": 0.333333
+  },
+  "subgroup_eval": {
+    "renal_compromise": {
+      "avg_reward": 0.774,
+      "legal_rate": 1.0
+    },
+    "hepatic_compromise": {
+      "avg_reward": 0.779333,
+      "legal_rate": 1.0
+    },
+    "frail": {
+      "avg_reward": 0.781667,
+      "legal_rate": 1.0
+    }
+  },
+  "explainability_eval": {
+    "grounding_rate": 0.8
+  }
+}

outputs/reports/hf_sweep_summary.json ADDED Viewed

	@@ -0,0 +1,76 @@

+{
+  "status": "ok",
+  "training_mode": "full",
+  "completed_models": 1,
+  "failed_or_skipped_models": 0,
+  "models": [
+    {
+      "run_id": "qwen-qwen2-5-3b-instruct",
+      "training_mode": "full",
+      "model_id": "Qwen/Qwen2.5-3B-Instruct",
+      "label": "Qwen2.5-3B",
+      "status": "completed",
+      "error": "",
+      "sft_backend": "trl_transformers",
+      "sft_examples": 2000,
+      "sft_train_loss": 0.15688225453009363,
+      "sft_runtime": 715.2908,
+      "grpo_backend": "trl_transformers",
+      "grpo_records": 2000,
+      "grpo_avg_reward": 0.767,
+      "sft_inference_reward": 0.781,
+      "sft_valid_rate": 1.0,
+      "sft_latency_seconds": 2.863,
+      "grpo_inference_reward": 0.726,
+      "grpo_valid_rate": 1.0,
+      "grpo_latency_seconds": 3.681,
+      "train_holdout_gap": 0.041,
+      "fallback_detected": false,
+      "reward_range_ok": true,
+      "reward_range_failures": [],
+      "exploit_rate": 0.411,
+      "legal_rate": 0.93,
+      "candidate_diversity": 0.003,
+      "top_candidate_rate": 0.668,
+      "reward_components": {
+        "format_compliance_score": 0.999,
+        "candidate_alignment_score": 0.999,
+        "legality_score": 0.929,
+        "safety_delta_score": 0.497,
+        "burden_improvement_score": 0.469,
+        "disease_stability_score": 0.861,
+        "dosing_quality_score": 0.526,
+        "abstention_quality_score": 0.56,
+        "efficiency_score": 0.849,
+        "process_fidelity_score": 0.856,
+        "explanation_grounding_score": 0.795,
+        "anti_cheat_score": 0.589,
+        "uncertainty_calibration_score": 0.747
+      },
+      "primary_reward_channels": {
+        "safety_legality": 0.816,
+        "clinical_improvement": 0.609,
+        "dosing_quality": 0.543,
+        "process_integrity": 0.875
+      },
+      "artifact_paths": {
+        "sft": "/app/checkpoints/sweeps/qwen-qwen2-5-3b-instruct/sft_adapter",
+        "grpo": "/app/checkpoints/sweeps/qwen-qwen2-5-3b-instruct/grpo_adapter"
+      }
+    }
+  ],
+  "charts": {
+    "sft_vs_grpo_reward": "outputs/plots/sft_vs_grpo_reward.png",
+    "sft_loss_curves": "outputs/plots/sft_loss_curves.png",
+    "qwen_model_sft_reward": "outputs/plots/qwen_model_sft_reward.png",
+    "qwen_model_sft_loss": "outputs/plots/qwen_model_sft_loss.png",
+    "sft_validity_reward": "outputs/plots/sft_validity_reward.png",
+    "grpo_reward_curves": "outputs/plots/grpo_reward_curves.png",
+    "qwen_model_grpo_reward": "outputs/plots/qwen_model_grpo_reward.png",
+    "reward_component_bars": "outputs/plots/reward_component_bars.png",
+    "anti_cheat_failure_rates": "outputs/plots/anti_cheat_failure_rates.png",
+    "train_holdout_gap": "outputs/plots/train_holdout_gap.png",
+    "inference_validity_reward": "outputs/plots/inference_validity_reward.png",
+    "inference_latency_validity": "outputs/plots/inference_latency_validity.png"
+  }
+}

outputs/reports/hf_training_status.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
-  "status": "running",
   "started_at": 1777180786.0648105,
-  "finished_at": null,
   "commands": [
     {
       "args": [
@@ -218,5 +218,10 @@
   "training_mode": "full",
   "model_sweep": [
     "Qwen/Qwen2.5-3B-Instruct"
   ]
 }

 {
+  "status": "ok",
   "started_at": 1777180786.0648105,
+  "finished_at": 1777188659.441074,
   "commands": [
     {
       "args": [
   "training_mode": "full",
   "model_sweep": [
     "Qwen/Qwen2.5-3B-Instruct"
+  ],
+  "improved": true,
+  "anti_hacking_passed": false,
+  "completed_run_ids": [
+    "qwen-qwen2-5-3b-instruct"
   ]
 }

outputs/reports/improvement_report.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "status": "ok",
+  "baseline": "outputs/reports/baselines.json",
+  "candidate": "outputs/reports/benchmark_report.json",
+  "deltas": {
+    "avg_reward": 0.025833,
+    "legality_rate": 0.0,
+    "success_rate": 0.0,
+    "avg_process_fidelity": 0.92,
+    "timeout_rate": 0.0,
+    "failure_visible_rate": 0.0
+  },
+  "gate": {
+    "avg_reward_up": true,
+    "legality_up": true,
+    "success_up": true
+  },
+  "improved": true
+}

outputs/reports/inference_benchmark.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "status": "ok",
+  "runs": [
+    {
+      "run": 0,
+      "provider": "transformers_ranker_fallback",
+      "candidate_id": "cand_04",
+      "latency_ms": 1751.989,
+      "rationale": "Transformers fallback selected cand_04 via local ranker; active_model_enabled=False; active_model_available=False."
+    },
+    {
+      "run": 1,
+      "provider": "transformers_ranker_fallback",
+      "candidate_id": "cand_02",
+      "latency_ms": 0.166,
+      "rationale": "Transformers fallback selected cand_02 via local ranker; active_model_enabled=False; active_model_available=False."
+    },
+    {
+      "run": 2,
+      "provider": "transformers_ranker_fallback",
+      "candidate_id": "cand_04",
+      "latency_ms": 0.157,
+      "rationale": "Transformers fallback selected cand_04 via local ranker; active_model_enabled=False; active_model_available=False."
+    },
+    {
+      "run": 3,
+      "provider": "transformers_ranker_fallback",
+      "candidate_id": "cand_04",
+      "latency_ms": 0.164,
+      "rationale": "Transformers fallback selected cand_04 via local ranker; active_model_enabled=False; active_model_available=False."
+    },
+    {
+      "run": 4,
+      "provider": "transformers_ranker_fallback",
+      "candidate_id": "cand_04",
+      "latency_ms": 0.153,
+      "rationale": "Transformers fallback selected cand_04 via local ranker; active_model_enabled=False; active_model_available=False."
+    }
+  ],
+  "avg_latency_ms": 350.526,
+  "provider_requested": "transformers",
+  "model": "Qwen/Qwen2.5-0.5B-Instruct"
+}

outputs/reports/plot_index.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "plots": [
+    "/app/outputs/plots/avg_reward.png",
+    "/app/outputs/plots/legality_rate.png",
+    "/app/outputs/plots/success_rate.png",
+    "/app/outputs/plots/avg_process_fidelity.png",
+    "/app/outputs/plots/policy_stack_avg_reward.png"
+  ]
+}

outputs/reports/robustness.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "missing_labs_safety_rate": 0.666667,
+  "noisy_dose_info_safety_rate": 1.0,
+  "conflicting_meds_safety_rate": 1.0,
+  "alias_noise_safety_rate": 1.0,
+  "hidden_duplicate_detection_rate": 1.0,
+  "wrong_candidate_id_resilience": 1.0,
+  "stale_evidence_safety_rate": 1.0,
+  "delayed_ade_manifestation_safety_rate": 1.0
+}