{ "cells": [ { "cell_type": "markdown", "id": "overview", "metadata": {}, "source": [ "# scPerturb 数据下载\n", "\n", "这个 notebook 用来下载 `scPerturb` 中的单细胞扰动数据。这里额外把“基因扰动、适合作为单点入口”的推荐数据集单独整理出来，方便直接选择下载。" ] }, { "cell_type": "markdown", "id": "sources", "metadata": {}, "source": [ "## 数据来源与本 notebook 的选择标准\n", "\n", "- 官方资源页：\n", "- 论文：*scPerturb: information resource for harmonized single-cell perturbation data*，Nature Methods，2024，DOI: \n", "- 截至 2026-04-06，这里使用的官方下载入口为：\n", " - RNA / protein：Zenodo record `13350497`\n", " - ATAC：Zenodo record `7058382`\n", "\n", "这里把“单点”近似理解为：**优先选择单基因 CRISPR 扰动、适合作为单 perturbation 学习入口的数据集**。因此默认推荐：\n", "\n", "- Adamson / Weissman 2016\n", "- Replogle / Weissman 2022 K562\n", "- Replogle / Weissman 2022 RPE1\n", "\n", "像 `DixitRegev2016_K562_TFs_High_MOI` 这种明显带高 MOI 标记的文件，不放进默认“单点推荐”里。需要注意的是，**是否严格每个细胞只有一个 perturbation**，最终仍建议在下游读取 `.h5ad` 后，检查 `obs` 中的 guide / condition / target gene 等字段再次确认。" ] }, { "cell_type": "code", "execution_count": 4, "id": "setup", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Project root: /Users/yurujiang/Desktop/final_thesis\n", "Download root: /Users/yurujiang/Desktop/final_thesis/data/scPerturb\n", "- rna_protein: scRNA-seq / CITE-seq -> Zenodo 13350497\n", "- atac: scATAC-seq -> Zenodo 7058382\n", "Recommended gene-perturbation presets:\n", "- adamson: Adamson / Weissman 2016\n", "- replogle_k562: Replogle / Weissman 2022 K562\n", "- replogle_rpe1: Replogle / Weissman 2022 RPE1\n" ] } ], "source": [ "from __future__ import annotations\n", "\n", "from pathlib import Path\n", "import hashlib\n", "import os\n", "import requests\n", "\n", "try:\n", " from tqdm.auto import tqdm\n", "except ImportError:\n", " tqdm = None\n", "\n", "\n", "def find_project_root(start: Path) -> Path:\n", " for candidate in [start, *start.parents]:\n", " if (candidate / \"src\").exists() and (candidate / \"data\").exists():\n", " return candidate\n", " return start\n", "\n", "\n", 
"PROJECT_ROOT = find_project_root(Path.cwd().resolve())\n", "DATA_ROOT = PROJECT_ROOT / \"data\" / \"scPerturb\"\n", "DATA_ROOT.mkdir(parents=True, exist_ok=True)\n", "\n", "SC_PERTURB_RECORDS = {\n", " \"rna_protein\": {\n", " \"record_id\": \"13350497\",\n", " \"label\": \"scRNA-seq / CITE-seq\",\n", " \"target_dir\": DATA_ROOT / \"rna_protein\",\n", " \"reference\": \"https://zenodo.org/records/13350497\",\n", " \"note\": \"截至 2026-04-06，官方页面列出的 RNA / protein 数据集合。\",\n", " },\n", " \"atac\": {\n", " \"record_id\": \"7058382\",\n", " \"label\": \"scATAC-seq\",\n", " \"target_dir\": DATA_ROOT / \"atac\",\n", " \"reference\": \"https://zenodo.org/records/7058382\",\n", " \"note\": \"截至 2026-04-06，官方页面列出的 ATAC 数据集合。\",\n", " },\n", "}\n", "\n", "GENE_PERTURBATION_PRESETS = {\n", " \"adamson\": {\n", " \"record_key\": \"rna_protein\",\n", " \"label\": \"Adamson / Weissman 2016\",\n", " \"filenames\": [\n", " \"AdamsonWeissman2016_GSM2406675_10X001.h5ad\",\n", " \"AdamsonWeissman2016_GSM2406677_10X005.h5ad\",\n", " \"AdamsonWeissman2016_GSM2406681_10X010.h5ad\",\n", " ],\n", " \"reason\": \"经典 Perturb-seq 基因扰动数据，适合作为单基因扰动入门。\",\n", " },\n", " \"replogle_k562\": {\n", " \"record_key\": \"rna_protein\",\n", " \"label\": \"Replogle / Weissman 2022 K562\",\n", " \"filenames\": [\n", " \"ReplogleWeissman2022_K562_essential.h5ad\",\n", " \"ReplogleWeissman2022_K562_gwps.h5ad\",\n", " ],\n", " \"reason\": \"覆盖面广，是当前最值得优先下载的 K562 单基因扰动资源之一。\",\n", " },\n", " \"replogle_rpe1\": {\n", " \"record_key\": \"rna_protein\",\n", " \"label\": \"Replogle / Weissman 2022 RPE1\",\n", " \"filenames\": [\n", " \"ReplogleWeissman2022_rpe1.h5ad\",\n", " ],\n", " \"reason\": \"和 K562 形成互补，适合比较不同细胞背景下的单基因扰动效应。\",\n", " },\n", "}\n", "\n", "RECOMMENDED_PRESET_ORDER = [\"adamson\", \"replogle_k562\", \"replogle_rpe1\"]\n", "\n", "print(f\"Project root: {PROJECT_ROOT}\")\n", "print(f\"Download root: {DATA_ROOT}\")\n", "for key, cfg in SC_PERTURB_RECORDS.items():\n", " print(f\"- {key}: 
{cfg['label']} -> Zenodo {cfg['record_id']}\")\n", "print(\"Recommended gene-perturbation presets:\")\n", "for preset_key in RECOMMENDED_PRESET_ORDER:\n", " preset = GENE_PERTURBATION_PRESETS[preset_key]\n", " print(f\"- {preset_key}: {preset['label']}\")" ] }, { "cell_type": "code", "execution_count": 5, "id": "helpers", "metadata": {}, "outputs": [], "source": [ "def zenodo_record(record_id: str) -> dict:\n", " response = requests.get(f\"https://zenodo.org/api/records/{record_id}\", timeout=60)\n", " response.raise_for_status()\n", " return response.json()\n", "\n", "\n", "def get_scperturb_files(record_key: str) -> list[dict]:\n", " cfg = SC_PERTURB_RECORDS[record_key]\n", " record = zenodo_record(cfg[\"record_id\"])\n", " files = []\n", " for file_info in record.get(\"files\", []):\n", " links = file_info.get(\"links\", {})\n", " files.append(\n", " {\n", " \"filename\": file_info.get(\"key\") or file_info.get(\"filename\"),\n", " \"size_bytes\": file_info.get(\"size\", 0),\n", " \"checksum\": file_info.get(\"checksum\"),\n", " \"download_url\": links.get(\"content\") or links.get(\"self\") or file_info.get(\"url\"),\n", " }\n", " )\n", " return files\n", "\n", "\n", "def format_size(size_bytes: int) -> str:\n", " units = [\"B\", \"KB\", \"MB\", \"GB\", \"TB\"]\n", " value = float(size_bytes)\n", " for unit in units:\n", " if value < 1024 or unit == units[-1]:\n", " return f\"{value:.2f} {unit}\"\n", " value /= 1024\n", " return f\"{size_bytes} B\"\n", "\n", "\n", "def show_scperturb_files(record_key: str) -> None:\n", " cfg = SC_PERTURB_RECORDS[record_key]\n", " files = get_scperturb_files(record_key)\n", " print(f\"[{record_key}] {cfg['label']}\")\n", " print(cfg[\"reference\"])\n", " print(cfg[\"note\"])\n", " print(f\"Total files: {len(files)}\")\n", " print(\"-\" * 100)\n", " for item in files:\n", " print(f\"{item['filename']:<60} {format_size(item['size_bytes']):>12}\")\n", "\n", "\n", "def show_gene_perturbation_presets() -> None:\n", " 
print(\"Recommended gene perturbation presets:\")\n", " print(\"-\" * 100)\n", " all_files = {item['filename']: item for item in get_scperturb_files('rna_protein')}\n", " for preset_key in RECOMMENDED_PRESET_ORDER:\n", " preset = GENE_PERTURBATION_PRESETS[preset_key]\n", " print(f\"[{preset_key}] {preset['label']}\")\n", " print(preset['reason'])\n", " for filename in preset['filenames']:\n", " size_text = \"not found in current record\"\n", " if filename in all_files:\n", " size_text = format_size(all_files[filename]['size_bytes'])\n", " print(f\" - {filename} ({size_text})\")\n", " print()\n", "\n", "\n", "def md5sum(path: Path, chunk_size: int = 1024 * 1024) -> str:\n", " digest = hashlib.md5()\n", " with path.open(\"rb\") as handle:\n", " for chunk in iter(lambda: handle.read(chunk_size), b\"\"):\n", " digest.update(chunk)\n", " return digest.hexdigest()\n", "\n", "\n", "def download_scperturb(record_key: str, filenames: list[str] | None = None, overwrite: bool = False) -> None:\n", " cfg = SC_PERTURB_RECORDS[record_key]\n", " target_dir = cfg[\"target_dir\"]\n", " target_dir.mkdir(parents=True, exist_ok=True)\n", "\n", " files = get_scperturb_files(record_key)\n", " if filenames is not None:\n", " requested = set(filenames)\n", " files = [item for item in files if item[\"filename\"] in requested]\n", " missing = sorted(requested - {item[\"filename\"] for item in files})\n", " if missing:\n", " raise FileNotFoundError(f\"These files are not present in Zenodo record {cfg['record_id']}: {missing}\")\n", "\n", " if not files:\n", " raise ValueError(\"No files selected for download.\")\n", "\n", " for item in files:\n", " output_path = target_dir / item[\"filename\"]\n", " expected_md5 = (item.get(\"checksum\") or \"\").replace(\"md5:\", \"\")\n", "\n", " if output_path.exists() and not overwrite:\n", " if expected_md5 and md5sum(output_path) == expected_md5:\n", " print(f\"Skip existing file: {output_path.name}\")\n", " continue\n", " print(f\"File exists but 
checksum mismatch, re-downloading: {output_path.name}\")\n", "\n", " print(f\"Downloading {item['filename']} -> {output_path}\")\n", " with requests.get(item[\"download_url\"], stream=True, timeout=60) as response:\n", " response.raise_for_status()\n", " total = int(response.headers.get(\"content-length\", item[\"size_bytes\"]))\n", " temp_path = output_path.with_suffix(output_path.suffix + \".part\")\n", "\n", " progress = None\n", " if tqdm is not None:\n", " progress = tqdm(total=total, unit=\"B\", unit_scale=True, desc=item[\"filename\"])\n", "\n", " with temp_path.open(\"wb\") as handle:\n", " for chunk in response.iter_content(chunk_size=1024 * 1024):\n", " if not chunk:\n", " continue\n", " handle.write(chunk)\n", " if progress is not None:\n", " progress.update(len(chunk))\n", "\n", " if progress is not None:\n", " progress.close()\n", "\n", " os.replace(temp_path, output_path)\n", "\n", " if expected_md5:\n", " actual_md5 = md5sum(output_path)\n", " if actual_md5 != expected_md5:\n", " raise ValueError(\n", " f\"Checksum mismatch for {output_path.name}: expected {expected_md5}, got {actual_md5}\"\n", " )\n", "\n", " print(f\"Finished: {output_path.name}\")\n", "\n", "\n", "def download_gene_perturbation_preset(preset_key: str, overwrite: bool = False) -> None:\n", " preset = GENE_PERTURBATION_PRESETS[preset_key]\n", " download_scperturb(\n", " record_key=preset['record_key'],\n", " filenames=preset['filenames'],\n", " overwrite=overwrite,\n", " )" ] }, { "cell_type": "markdown", "id": "recommendation-notes", "metadata": {}, "source": [ "## 基因扰动推荐选择\n", "\n", "如果你的目标是先做**基因扰动、并尽量从单基因 perturbation 开始**，优先推荐下面三组：\n", "\n", "- `adamson`：经典 Perturb-seq 数据，适合作为起点。\n", "- `replogle_k562`：覆盖广、规模大，是最值得优先下载的一组之一。\n", "- `replogle_rpe1`：和 K562 构成不同细胞背景的对照。\n", "\n", "这三组都已经在下面做成 preset，可以直接调用。" ] }, { "cell_type": "code", "execution_count": 6, "id": "preset-preview", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Recommended gene 
# Show the recommended gene-perturbation datasets and the file names in each preset.
show_gene_perturbation_presets()

# Run this cell to list every file in the whole RNA / protein record.
show_scperturb_files("rna_protein")
"stream", "text": [ "Downloading AdamsonWeissman2016_GSM2406681_10X010.h5ad -> /Users/yurujiang/Desktop/final_thesis/data/scPerturb/rna_protein/AdamsonWeissman2016_GSM2406681_10X010.h5ad\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "AdamsonWeissman2016_GSM2406681_10X010.h5ad: 100%|██████████| 471M/471M [00:35<00:00, 13.2MB/s] \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Finished: AdamsonWeissman2016_GSM2406681_10X010.h5ad\n", "Downloading AdamsonWeissman2016_GSM2406675_10X001.h5ad -> /Users/yurujiang/Desktop/final_thesis/data/scPerturb/rna_protein/AdamsonWeissman2016_GSM2406675_10X001.h5ad\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "AdamsonWeissman2016_GSM2406675_10X001.h5ad: 100%|██████████| 34.6M/34.6M [00:09<00:00, 3.66MB/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Finished: AdamsonWeissman2016_GSM2406675_10X001.h5ad\n", "Downloading AdamsonWeissman2016_GSM2406677_10X005.h5ad -> /Users/yurujiang/Desktop/final_thesis/data/scPerturb/rna_protein/AdamsonWeissman2016_GSM2406677_10X005.h5ad\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "AdamsonWeissman2016_GSM2406677_10X005.h5ad: 100%|██████████| 139M/139M [00:10<00:00, 13.6MB/s] \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Finished: AdamsonWeissman2016_GSM2406677_10X005.h5ad\n", "Downloading ReplogleWeissman2022_K562_essential.h5ad -> /Users/yurujiang/Desktop/final_thesis/data/scPerturb/rna_protein/ReplogleWeissman2022_K562_essential.h5ad\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "ReplogleWeissman2022_K562_essential.h5ad: 100%|██████████| 1.55G/1.55G [01:58<00:00, 13.1MB/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Finished: ReplogleWeissman2022_K562_essential.h5ad\n", "Downloading ReplogleWeissman2022_K562_gwps.h5ad -> /Users/yurujiang/Desktop/final_thesis/data/scPerturb/rna_protein/ReplogleWeissman2022_K562_gwps.h5ad\n" ] }, { "name": "stderr", 
"output_type": "stream", "text": [ "ReplogleWeissman2022_K562_gwps.h5ad: 2%|▏ | 164M/8.81G [00:12<10:58, 13.1MB/s] " ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 7\u001b[39m\n\u001b[32m 3\u001b[39m \u001b[38;5;66;03m# 只下载 Adamson\u001b[39;00m\n\u001b[32m 4\u001b[39m download_gene_perturbation_preset(\u001b[33m\"adamson\"\u001b[39m)\n\u001b[32m 5\u001b[39m \n\u001b[32m 6\u001b[39m \u001b[38;5;66;03m# 只下载 Replogle K562\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m7\u001b[39m download_gene_perturbation_preset(\u001b[33m\"replogle_k562\"\u001b[39m)\n\u001b[32m 8\u001b[39m \n\u001b[32m 9\u001b[39m \u001b[38;5;66;03m# 只下载 Replogle RPE1\u001b[39;00m\n\u001b[32m 10\u001b[39m download_gene_perturbation_preset(\u001b[33m\"replogle_rpe1\"\u001b[39m)\n", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 131\u001b[39m, in \u001b[36mdownload_gene_perturbation_preset\u001b[39m\u001b[34m(preset_key, overwrite)\u001b[39m\n\u001b[32m 129\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m download_gene_perturbation_preset(preset_key: str, overwrite: bool = \u001b[38;5;28;01mFalse\u001b[39;00m) -> \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 130\u001b[39m preset = GENE_PERTURBATION_PRESETS[preset_key]\n\u001b[32m--> \u001b[39m\u001b[32m131\u001b[39m download_scperturb(\n\u001b[32m 132\u001b[39m record_key=preset[\u001b[33m'record_key'\u001b[39m],\n\u001b[32m 133\u001b[39m filenames=preset[\u001b[33m'filenames'\u001b[39m],\n\u001b[32m 134\u001b[39m overwrite=overwrite,\n", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 107\u001b[39m, in \u001b[36mdownload_scperturb\u001b[39m\u001b[34m(record_key, 
filenames, overwrite)\u001b[39m\n\u001b[32m 103\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m tqdm \u001b[38;5;28;01mis\u001b[39;00m \u001b[38;5;28;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 104\u001b[39m progress = tqdm(total=total, unit=\u001b[33m\"B\"\u001b[39m, unit_scale=\u001b[38;5;28;01mTrue\u001b[39;00m, desc=item[\u001b[33m\"filename\"\u001b[39m])\n\u001b[32m 105\u001b[39m \n\u001b[32m 106\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m temp_path.open(\u001b[33m\"wb\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m handle:\n\u001b[32m--> \u001b[39m\u001b[32m107\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;28;01min\u001b[39;00m response.iter_content(chunk_size=\u001b[32m1024\u001b[39m * \u001b[32m1024\u001b[39m):\n\u001b[32m 108\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28;01mnot\u001b[39;00m chunk:\n\u001b[32m 109\u001b[39m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[32m 110\u001b[39m handle.write(chunk)\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/miniconda3/lib/python3.13/site-packages/requests/models.py:820\u001b[39m, in \u001b[36mResponse.iter_content..generate\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 818\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m.raw, \u001b[33m\"\u001b[39m\u001b[33mstream\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m 819\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m820\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m.raw.stream(chunk_size, decode_content=\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[32m 821\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m ProtocolError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 822\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m ChunkedEncodingError(e)\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/miniconda3/lib/python3.13/site-packages/urllib3/response.py:1091\u001b[39m, in 
\u001b[36mHTTPResponse.stream\u001b[39m\u001b[34m(self, amt, decode_content)\u001b[39m\n\u001b[32m 1089\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1090\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_fp_closed(\u001b[38;5;28mself\u001b[39m._fp) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m._decoded_buffer) > \u001b[32m0\u001b[39m:\n\u001b[32m-> \u001b[39m\u001b[32m1091\u001b[39m data = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m=\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1093\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m data:\n\u001b[32m 1094\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m data\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/miniconda3/lib/python3.13/site-packages/urllib3/response.py:980\u001b[39m, in \u001b[36mHTTPResponse.read\u001b[39m\u001b[34m(self, amt, decode_content, cache_content)\u001b[39m\n\u001b[32m 977\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m._decoded_buffer) >= amt:\n\u001b[32m 978\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._decoded_buffer.get(amt)\n\u001b[32m--> \u001b[39m\u001b[32m980\u001b[39m data = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_raw_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 982\u001b[39m flush_decoder = amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m (amt != \u001b[32m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m data)\n\u001b[32m 984\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
data \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m._decoded_buffer) == \u001b[32m0\u001b[39m:\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/miniconda3/lib/python3.13/site-packages/urllib3/response.py:904\u001b[39m, in \u001b[36mHTTPResponse._raw_read\u001b[39m\u001b[34m(self, amt, read1)\u001b[39m\n\u001b[32m 901\u001b[39m fp_closed = \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m._fp, \u001b[33m\"\u001b[39m\u001b[33mclosed\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[32m 903\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m._error_catcher():\n\u001b[32m--> \u001b[39m\u001b[32m904\u001b[39m data = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_fp_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mread1\u001b[49m\u001b[43m=\u001b[49m\u001b[43mread1\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m fp_closed \u001b[38;5;28;01melse\u001b[39;00m \u001b[33mb\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 905\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m amt != \u001b[32m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m data:\n\u001b[32m 906\u001b[39m \u001b[38;5;66;03m# Platform-specific: Buggy versions of Python.\u001b[39;00m\n\u001b[32m 907\u001b[39m \u001b[38;5;66;03m# Close the connection when no data is returned\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 912\u001b[39m \u001b[38;5;66;03m# not properly close the connection in all cases. 
There is\u001b[39;00m\n\u001b[32m 913\u001b[39m \u001b[38;5;66;03m# no harm in redundantly calling close.\u001b[39;00m\n\u001b[32m 914\u001b[39m \u001b[38;5;28mself\u001b[39m._fp.close()\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/miniconda3/lib/python3.13/site-packages/urllib3/response.py:887\u001b[39m, in \u001b[36mHTTPResponse._fp_read\u001b[39m\u001b[34m(self, amt, read1)\u001b[39m\n\u001b[32m 884\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._fp.read1(amt) \u001b[38;5;28;01mif\u001b[39;00m amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m._fp.read1()\n\u001b[32m 885\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 886\u001b[39m \u001b[38;5;66;03m# StringIO doesn't like amt=None\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m887\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_fp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m._fp.read()\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/miniconda3/lib/python3.13/http/client.py:479\u001b[39m, in \u001b[36mHTTPResponse.read\u001b[39m\u001b[34m(self, amt)\u001b[39m\n\u001b[32m 476\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.length \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m amt > \u001b[38;5;28mself\u001b[39m.length:\n\u001b[32m 477\u001b[39m \u001b[38;5;66;03m# clip the read to the \"end of response\"\u001b[39;00m\n\u001b[32m 478\u001b[39m amt = 
\u001b[38;5;28mself\u001b[39m.length\n\u001b[32m--> \u001b[39m\u001b[32m479\u001b[39m s = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 480\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m s \u001b[38;5;129;01mand\u001b[39;00m amt:\n\u001b[32m 481\u001b[39m \u001b[38;5;66;03m# Ideally, we would raise IncompleteRead if the content-length\u001b[39;00m\n\u001b[32m 482\u001b[39m \u001b[38;5;66;03m# wasn't satisfied, but it might break compatibility.\u001b[39;00m\n\u001b[32m 483\u001b[39m \u001b[38;5;28mself\u001b[39m._close_conn()\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/miniconda3/lib/python3.13/socket.py:719\u001b[39m, in \u001b[36mSocketIO.readinto\u001b[39m\u001b[34m(self, b)\u001b[39m\n\u001b[32m 717\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\u001b[33m\"\u001b[39m\u001b[33mcannot read from timed out object\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 718\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m719\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_sock\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrecv_into\u001b[49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 720\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m timeout:\n\u001b[32m 721\u001b[39m \u001b[38;5;28mself\u001b[39m._timeout_occurred = \u001b[38;5;28;01mTrue\u001b[39;00m\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/miniconda3/lib/python3.13/ssl.py:1304\u001b[39m, in \u001b[36mSSLSocket.recv_into\u001b[39m\u001b[34m(self, buffer, nbytes, flags)\u001b[39m\n\u001b[32m 1300\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m flags != \u001b[32m0\u001b[39m:\n\u001b[32m 1301\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m 
\u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 1302\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mnon-zero flags not allowed in calls to recv_into() on \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m\"\u001b[39m %\n\u001b[32m 1303\u001b[39m \u001b[38;5;28mself\u001b[39m.\u001b[34m__class__\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m1304\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnbytes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1305\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1306\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m().recv_into(buffer, nbytes, flags)\n", "\u001b[36mFile \u001b[39m\u001b[32m/opt/miniconda3/lib/python3.13/ssl.py:1138\u001b[39m, in \u001b[36mSSLSocket.read\u001b[39m\u001b[34m(self, len, buffer)\u001b[39m\n\u001b[32m 1136\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 1137\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m buffer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1138\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_sslobj\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1139\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1140\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._sslobj.read(\u001b[38;5;28mlen\u001b[39m)\n", "\u001b[31mKeyboardInterrupt\u001b[39m: " ] }, { "name": "stderr", "output_type": "stream", "text": [ "ReplogleWeissman2022_K562_gwps.h5ad: 2%|▏ | 164M/8.81G [00:25<10:58, 13.1MB/s]" ] } ], "source": [ "# 
# Downloads are opt-in: nothing is fetched until a preset key is enabled below,
# so "Restart & Run All" never starts a multi-gigabyte transfer by accident
# (replogle_k562 alone includes an 8.20 GB file).
PRESETS_TO_DOWNLOAD = [
    # "adamson",        # Adamson only
    # "replogle_k562",  # Replogle K562 only
    # "replogle_rpe1",  # Replogle RPE1 only
]

# To download all three groups instead:
# PRESETS_TO_DOWNLOAD = list(RECOMMENDED_PRESET_ORDER)

# Set to True to re-download files that already exist locally.
OVERWRITE_EXISTING = False

for preset_key in PRESETS_TO_DOWNLOAD:
    download_gene_perturbation_preset(preset_key, overwrite=OVERWRITE_EXISTING)