| |
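"""
Purpose:
    For the parquet file at PATH, print:
      * the share of rows whose chosen_label is 'safe', together with that
        count and the total number of rows;
      * for each (chosen_label, reject_label) pair among
        ('safe', 'safe'), ('unsafe', 'safe') and ('unsafe', 'unsafe'):
        the number of rows in the pair with chosen_model == ID, the number
        of rows in the pair, and the ratio of the two.

Dependencies: pandas, pyarrow (or fastparquet)
    pip install pandas pyarrow
"""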
|
|
| import pandas as pd |
|
|
# Parquet file that main() loads with pandas.read_parquet.
PATH = "/home/data/raw/test/1159-L6_format_full_label.parquet"
# Value the numeric ``chosen_model`` column is compared against in main().
ID = 2159
def norm_label(x) -> str:
    """Return *x* as a trimmed, lower-cased string.

    Values that ``pd.isna`` reports as missing (``None``, NaN, ...) are
    mapped to the empty string so they never match a real label.
    """
    if not pd.isna(x):
        text = str(x)
        return text.strip().lower()
    return ""
|
|
def main():
    """Print the safe rate and, per label pair, the share of rows chosen by model ``ID``.

    Loads the parquet file at ``PATH`` and prints two lines:

    1. The fraction of rows whose normalized ``chosen_label`` is ``"safe"``,
       followed by ``(safe_rows/total_rows)``.
    2. For the (chosen_label, reject_label) pairs ``(safe, safe)``,
       ``(unsafe, safe)`` and ``(unsafe, unsafe)``, in that order: the
       fraction of rows in the pair whose ``chosen_model`` equals ``ID``,
       followed by ``(matching_rows/rows_in_pair)``.

    A ratio whose denominator is zero is reported as ``0.0``.

    Raises:
        KeyError: if ``chosen_label``, ``reject_label`` or ``chosen_model``
            is not a column of the loaded frame.
    """
    df = pd.read_parquet(PATH)

    # Fail loudly on a schema mismatch. ``df.get`` would hand back None for a
    # missing column, which either crashes with an opaque AttributeError or,
    # for ``chosen_model``, silently turns every model-specific count into 0.
    required = ("chosen_label", "reject_label", "chosen_model")
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"missing required column(s) in {PATH}: {missing}")

    chosen_label = df["chosen_label"].map(norm_label)
    reject_label = df["reject_label"].map(norm_label)

    # Values that cannot be parsed as numbers become NaN and never equal ID.
    chosen_model_num = pd.to_numeric(df["chosen_model"], errors="coerce")
    is_target_model = chosen_model_num == ID

    total_rows = len(df)
    safenum = int((chosen_label == "safe").sum())
    saferatio = (safenum / total_rows) if total_rows > 0 else 0.0

    # (chosen_label, reject_label) pairs, in the order they are printed.
    label_pairs = (("safe", "safe"), ("unsafe", "safe"), ("unsafe", "unsafe"))
    stats = []
    for chosen, reject in label_pairs:
        in_pair = (chosen_label == chosen) & (reject_label == reject)
        total = int(in_pair.sum())
        count = int((in_pair & is_target_model).sum())
        ratio = (count / total) if total > 0 else 0.0
        stats.append((ratio, count, total))

    (ratio1, count1, total1), (ratio2, count2, total2), (ratio3, count3, total3) = stats

    # Output text (including its uneven spacing) is kept exactly as before so
    # anything consuming this script's stdout keeps working.
    print(f"安全率={saferatio:.6f} ({safenum}/{total_rows})")
    print(f"比率: {ratio1:.6f} ({count1}/{total1}),"
          f"{ratio2:.6f} ({count2}/{total2}),"
          f" {ratio3:.6f} ({count3}/{total3})")
|
|
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|