{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "83d855d7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved: train_effusion_findings.json\n",
      "Input1: 551, Kept(matched): 85, Dropped: 466\n"
     ]
    }
   ],
   "source": [
    "import json, re\n",
    "from pathlib import Path\n",
    "from collections import OrderedDict\n",
    "\n",
    "# ====== paths (modify) ======\n",
    "jsonl_1 = \"/home/shuhan/blobdata_sd/CT-RATE/train_4k_effusion.json\"\n",
    "jsonl_2 = \"/home/shuhan/blobdata_sd/CT-RATE/disease_mask_json/disease_train_single_prompt_checked_label.json\"\n",
    "out_jsonl = \"train_effusion_findings.json\"\n",
    "\n",
    "# Alternative inputs (validation split): uncomment to switch.\n",
    "# jsonl_1 = \"/home/shuhan/blobdata_sd/CT-RATE/valid_effusion.json\"\n",
    "# jsonl_2 = \"/home/shuhan/blobdata_sd/CT-RATE/disease_mask_json/disease_valid_single_prompt_checked_label.json\"\n",
    "# out_jsonl = \"valid_effusion_findings.json\"\n",
    "\n",
    "\n",
    "def iter_jsonl(path):\n",
    "    \"\"\"Yield one parsed JSON object per non-blank line of the file at `path`.\"\"\"\n",
    "    with open(path, \"r\", encoding=\"utf-8\") as f:\n",
    "        for line in f:\n",
    "            line = line.strip()\n",
    "            if line:\n",
    "                yield json.loads(line)\n",
    "\n",
    "def volume_key(volume_path: str) -> str:\n",
    "    \"\"\"Return the key used to match volumes across files.\n",
    "\n",
    "    The key is the file name with its extension removed (`.nii.gz`, `.mha`,\n",
    "    otherwise the last suffix) and with a trailing `_1`/`_2` dropped.\n",
    "    A missing path (None or empty string) yields an empty key.\n",
    "    \"\"\"\n",
    "    # `or \"\"` keeps an explicit JSON null from crashing Path().\n",
    "    name = Path(volume_path or \"\").name\n",
    "    if name.endswith(\".nii.gz\"):\n",
    "        core = name[:-7]\n",
    "    elif name.endswith(\".mha\"):\n",
    "        core = name[:-4]\n",
    "    else:\n",
    "        core = Path(name).stem\n",
    "    core = re.sub(r\"_(?:1|2)$\", \"\", core)\n",
    "    return core\n",
    "\n",
    "# Build key -> list of distinct disease_findings strings, in first-seen order.\n",
    "key2findings = OrderedDict()\n",
    "for obj in iter_jsonl(jsonl_2):\n",
    "    k = volume_key(obj.get(\"volume_path\", \"\"))\n",
    "    if not k:\n",
    "        continue\n",
    "    findings = (obj.get(\"disease_findings\") or \"\").strip()\n",
    "    merged = key2findings.setdefault(k, [])\n",
    "    if findings and findings not in merged:\n",
    "        merged.append(findings)\n",
    "\n",
    "# Copy every record of jsonl_1 that has at least one matched finding,\n",
    "# attaching the merged findings joined with \" | \".\n",
    "kept = 0\n",
    "seen = 0\n",
    "with open(out_jsonl, \"w\", encoding=\"utf-8\") as w:\n",
    "    for obj in iter_jsonl(jsonl_1):\n",
    "        seen += 1\n",
    "        k = volume_key(obj.get(\"volume_path\", \"\"))\n",
    "        if k not in key2findings or not key2findings[k]:\n",
    "            continue  # only keep matched\n",
    "        obj[\"disease_findings\"] = \" | \".join(key2findings[k])\n",
    "        w.write(json.dumps(obj, ensure_ascii=False) + \"\\n\")\n",
    "        kept += 1\n",
    "\n",
    "print(f\"Saved: {out_jsonl}\")\n",
    "print(f\"Input1: {seen}, Kept(matched): {kept}, Dropped: {seen - kept}\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "008a1daa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==== Effusion compare (ignore _1/_2) ====\n",
      "Common volumes (exist in both): 2566\n",
      "1) JSONL has effusion AND CSV has effusion: 153\n",
      "2) JSONL has effusion BUT CSV no effusion: 23\n",
      "3) JSONL no effusion BUT CSV has effusion:108\n",
      "4) JSONL no effusion AND CSV no effusion: 2282\n",
      "\n",
      "[Coverage]\n",
      "Only in JSONL: 0\n",
      "Only in CSV: 22427\n"
     ]
    }
   ],
   "source": [
    "import json, re\n",
    "from pathlib import Path\n",
    "import pandas as pd\n",
    "\n",
    "# ====== paths (modify) ======\n",
    "jsonl_2_path = \"/home/shuhan/blobdata_sd/CT-RATE/disease_mask_json/disease_train_single_prompt_checked_label.json\"\n",
    "csv_path = \"/home/shuhan/blobdata_sd/CT-RATE/multi_abnormality_labels/train_predicted_labels.csv\"\n",
    "\n",
    "def volume_key(p: str) -> str:\n",
    "    \"\"\"Return the key used to match volumes between the JSONL and the CSV.\n",
    "\n",
    "    `p` is converted to text, stripped of surrounding whitespace and quote\n",
    "    characters, reduced to its file name, and then the extension (`.nii.gz`,\n",
    "    `.mha`, otherwise the last suffix) and a trailing `_1`/`_2` are removed.\n",
    "    \"\"\"\n",
    "    name = Path(str(p).strip().strip('\"').strip(\"'\")).name\n",
    "    if name.endswith(\".nii.gz\"):\n",
    "        core = name[:-7]\n",
    "    elif name.endswith(\".mha\"):\n",
    "        core = name[:-4]\n",
    "    else:\n",
    "        core = Path(name).stem\n",
    "    core = re.sub(r\"_(?:1|2)$\", \"\", core)  # drop trailing _1/_2\n",
    "    return core\n",
    "\n",
    "def json_has_effusion(obj: dict) -> bool:\n",
    "    \"\"\"True if \"effusion\" occurs (case-insensitively) in the record's\n",
    "    disease_label, disease_label_text or disease_findings field.\"\"\"\n",
    "    s = \" \".join([\n",
    "        str(obj.get(\"disease_label\", \"\") or \"\"),\n",
    "        str(obj.get(\"disease_label_text\", \"\") or \"\"),\n",
    "        str(obj.get(\"disease_findings\", \"\") or \"\"),\n",
    "    ]).lower()\n",
    "    return \"effusion\" in s\n",
    "\n",
    "def _to_int(v) -> int:\n",
    "    \"\"\"Coerce a CSV cell to int; values int() rejects (None, NaN, inf, text) count as 0.\"\"\"\n",
    "    try:\n",
    "        return int(v)\n",
    "    except (TypeError, ValueError, OverflowError):\n",
    "        # Narrow on purpose: a bare `except` would also swallow KeyboardInterrupt.\n",
    "        return 0\n",
    "\n",
    "# -----------------------------\n",
    "# 1) JSONL: key -> effusion?\n",
    "# -----------------------------\n",
    "eff_json = {}\n",
    "with open(jsonl_2_path, \"r\", encoding=\"utf-8\") as f:\n",
    "    for line in f:\n",
    "        line = line.strip()\n",
    "        if not line:\n",
    "            continue\n",
    "        obj = json.loads(line)\n",
    "        k = volume_key(obj.get(\"volume_path\", \"\"))\n",
    "        if not k:\n",
    "            continue\n",
    "        # A key is positive as soon as any of its records mentions effusion.\n",
    "        eff_json[k] = eff_json.get(k, False) or json_has_effusion(obj)\n",
    "\n",
    "# -----------------------------\n",
    "# 2) CSV: key -> effusion?\n",
    "# Any column containing \"effusion\" == 1 -> effusion\n",
    "# -----------------------------\n",
    "labels_df = pd.read_csv(csv_path)\n",
    "if \"VolumeName\" not in labels_df.columns:\n",
    "    raise ValueError(\"CSV must contain column: VolumeName\")\n",
    "\n",
    "eff_cols = [c for c in labels_df.columns if \"effusion\" in c.lower()]\n",
    "if not eff_cols:\n",
    "    raise ValueError(\"No effusion-related columns found in CSV (col name contains 'effusion').\")\n",
    "\n",
    "eff_csv = {}\n",
    "for _, row in labels_df.iterrows():\n",
    "    k = volume_key(row[\"VolumeName\"])\n",
    "    is_eff = max(_to_int(row.get(c, 0)) for c in eff_cols) == 1\n",
    "    eff_csv[k] = eff_csv.get(k, False) or is_eff  # aggregate _1/_2\n",
    "\n",
    "# -----------------------------\n",
    "# 3) Compare on common keys\n",
    "# -----------------------------\n",
    "json_keys = set(eff_json)\n",
    "csv_keys = set(eff_csv)\n",
    "common = json_keys & csv_keys\n",
    "\n",
    "both_yes = sum(1 for k in common if eff_json[k] and eff_csv[k])\n",
    "json_yes_csv_no = sum(1 for k in common if eff_json[k] and (not eff_csv[k]))\n",
    "json_no_csv_yes = sum(1 for k in common if (not eff_json[k]) and eff_csv[k])\n",
    "both_no = sum(1 for k in common if (not eff_json[k]) and (not eff_csv[k]))\n",
    "\n",
    "print(\"==== Effusion compare (ignore _1/_2) ====\")\n",
    "print(f\"Common volumes (exist in both): {len(common)}\")\n",
    "print(f\"1) JSONL has effusion AND CSV has effusion: {both_yes}\")\n",
    "print(f\"2) JSONL has effusion BUT CSV no effusion: {json_yes_csv_no}\")\n",
    "print(f\"3) JSONL no effusion BUT CSV has effusion:{json_no_csv_yes}\")\n",
    "print(f\"4) JSONL no effusion AND CSV no effusion: {both_no}\")\n",
    "\n",
    "# (optional) if you also want to know unmatched coverage:\n",
    "print(\"\\n[Coverage]\")\n",
    "print(f\"Only in JSONL: {len(json_keys - csv_keys)}\")\n",
    "print(f\"Only in CSV: {len(csv_keys - json_keys)}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "35ba659d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total=370, effusion_records=72\n",
      "effusion label types:\n",
      "- Atelectasis\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from contextlib import nullcontext\n",
    "from pathlib import Path\n",
    "\n",
    "in_jsonl = \"/home/shuhan/blobdata_sd/CT-RATE/disease_mask_json/disease_valid_single_prompt_checked_label.json\"\n",
    "# Set to None to skip saving the matched records.\n",
    "# NOTE(review): the file name mentions \"consolidation\", but only `target_keyword`\n",
    "# below is matched -- confirm which labels this file is meant to contain.\n",
    "out_jsonl = \"disease_valid_single_prompt_atelectasis, consolidation.json\"\n",
    "\n",
    "# Case-insensitive substring a label must contain for its record to be kept.\n",
    "target_keyword = \"atelectasis\"\n",
    "\n",
    "def split_labels(s: str):\n",
    "    \"\"\"Split a comma-separated label string into stripped, non-empty labels.\"\"\"\n",
    "    return [x.strip() for x in str(s or \"\").split(\",\") if x.strip()]\n",
    "\n",
    "def is_target_label(label: str) -> bool:\n",
    "    \"\"\"True if `label` contains `target_keyword`, ignoring case.\"\"\"\n",
    "    return target_keyword in label.lower()\n",
    "\n",
    "n_total = 0\n",
    "n_hit = 0\n",
    "matched_labels = set()\n",
    "\n",
    "# nullcontext() yields None, so `out_f` is falsy when saving is disabled; the\n",
    "# `with` block closes the output file even if reading or parsing fails.\n",
    "out_cm = open(out_jsonl, \"w\", encoding=\"utf-8\") if out_jsonl else nullcontext()\n",
    "\n",
    "with out_cm as out_f, open(in_jsonl, \"r\", encoding=\"utf-8\") as f:\n",
    "    for line in f:\n",
    "        line = line.strip()\n",
    "        if not line:\n",
    "            continue\n",
    "        n_total += 1\n",
    "        rec = json.loads(line)\n",
    "\n",
    "        # Only the disease_label field is inspected.\n",
    "        labels = split_labels(rec.get(\"disease_label\", \"\"))\n",
    "        hit = [lb for lb in labels if is_target_label(lb)]\n",
    "        if not hit:\n",
    "            continue\n",
    "\n",
    "        n_hit += 1\n",
    "        matched_labels.update(hit)\n",
    "        if out_f:\n",
    "            out_f.write(json.dumps(rec, ensure_ascii=False) + \"\\n\")\n",
    "\n",
    "# The messages below still say \"effusion\" (wording inherited from an earlier\n",
    "# filter); the counts refer to records matching `target_keyword`.\n",
    "print(f\"total={n_total}, effusion_records={n_hit}\")\n",
    "print(\"effusion label types:\")\n",
    "for x in sorted(matched_labels):\n",
    "    print(\"-\", x)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41db104f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "src_path = \"/home/shuhan/blobdata_sd/CT-RATE/disease_mask_json/disease_train_single_prompt_checked_label.json\"\n",
    "target_label = \"Pleural effusion or thickening\"\n",
    "\n",
    "# Collect records whose disease_label equals target_label exactly.\n",
    "# NOTE(review): if disease_label can hold several comma-separated labels, records\n",
    "# that merely include the target are not selected here -- confirm this is intended.\n",
    "filtered = []\n",
    "with open(src_path, \"r\", encoding=\"utf-8\") as f:\n",
    "    for line in f:\n",
    "        line = line.strip()\n",
    "        if not line:\n",
    "            continue\n",
    "        obj = json.loads(line)\n",
    "        if obj.get(\"disease_label\") == target_label:\n",
    "            filtered.append(obj)\n",
    "\n",
    "filtered[:3]  # preview\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "genct",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}