# FailSafe / download_osf.py
# rcrane4's picture
# Upload 10 files
# 7538d69 verified
"""
download_osf.py
---------------
Downloads the OSF Ti-64 SEM fractography dataset (osf.io/gdwyb).
The dataset has 3 sub-components:
- Lack of Fusion defects
- Keyhole defects
- All Defects (combined)
Each sub-dataset contains SEM images + ground truth segmentation masks.

Usage:
    python download_osf.py

Output structure:
    data/
        lack_of_fusion/
            images/
            masks/
        keyhole/
            images/
            masks/
        all_defects/
            images/
            masks/
"""
import os
import requests
from pathlib import Path
from tqdm import tqdm
# Base URL of the OSF REST API (v2) and the GUID of the single project node
# this script lists files from.
# Inspect the project at: https://osf.io/gdwyb/
OSF_API = "https://api.osf.io/v2"
OSF_PROJECT_ID = "gdwyb"  # top-level fractography project
# Local directory the __main__ entry point creates and downloads into.
DATA_DIR = Path("data")
def list_osf_files(node_id: str) -> list[dict]:
    """List every file in a node's OSF Storage, descending into folders.

    The pagination and folder recursion are identical for the storage root
    and for any sub-folder, so this simply builds the root listing URL and
    delegates to ``list_osf_folder`` instead of duplicating that loop.

    Only the listing at ``/nodes/<node_id>/files/osfstorage/`` is walked.
    NOTE(review): OSF sub-components are presumably separate nodes with
    their own IDs; confirm whether their files are reachable from here.

    Args:
        node_id: GUID of the OSF node whose storage should be listed.

    Returns:
        One dict per file with the keys ``name``, ``path``, ``download``
        and ``size``, in the order the API returned them.

    Raises:
        requests.HTTPError: If a listing request returns an error status.
        KeyError: If a response lacks one of the expected JSON fields.
    """
    root_url = f"{OSF_API}/nodes/{node_id}/files/osfstorage/"
    return list_osf_folder(root_url)
def list_osf_folder(url: str) -> list[dict]:
    """Collect the files reachable from an OSF folder listing URL.

    Follows the ``next`` link of each page until there is none, and calls
    itself for every entry whose kind is ``"folder"``.

    Args:
        url: Listing URL to start from. A falsy value yields an empty list
            without issuing any request.

    Returns:
        One dict per file with the keys ``name``, ``path``, ``download``
        and ``size``, in the order they were encountered.

    Raises:
        requests.HTTPError: If a listing request returns an error status.
        KeyError: If a response lacks one of the expected JSON fields.
    """
    collected = []
    next_page = url
    while next_page:
        response = requests.get(next_page, timeout=30)
        response.raise_for_status()
        payload = response.json()

        for entry in payload["data"]:
            attrs = entry["attributes"]
            kind = attrs["kind"]

            if kind == "folder":
                # Sub-folders expose their own listing URL; walk it in place
                # so the result keeps the API's ordering.
                child_url = entry["relationships"]["files"]["links"]["related"]["href"]
                collected += list_osf_folder(child_url)
                continue
            if kind != "file":
                continue

            record = {
                "name": attrs["name"],
                "path": attrs["materialized_path"],
                "download": entry["links"]["download"],
                "size": attrs["size"],
            }
            collected.append(record)

        # Absent or null "next" ends the pagination loop.
        next_page = payload["links"].get("next")
    return collected
def download_file(url: str, dest: Path) -> None:
    """Download ``url`` to ``dest`` with a progress bar.

    The body is streamed into a sibling ``<name>.part`` file and only renamed
    to ``dest`` once the transfer has finished. Because an existing ``dest``
    is treated as "already downloaded" and skipped, writing directly to
    ``dest`` would let an interrupted transfer leave a truncated file that
    every later run silently keeps.

    Args:
        url: Direct download URL of the file.
        dest: Target path; missing parent directories are created.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection or timeout failures.
        OSError: If the file cannot be written or renamed.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    if dest.exists():
        print(f" [skip] {dest.name} already exists")
        return

    partial = dest.with_name(dest.name + ".part")
    try:
        # The context manager releases the streamed connection even when the
        # transfer fails midway.
        with requests.get(url, stream=True, timeout=60) as resp:
            resp.raise_for_status()
            total = int(resp.headers.get("content-length", 0))
            with open(partial, "wb") as f, tqdm(
                desc=dest.name, total=total, unit="B", unit_scale=True, leave=False
            ) as bar:
                for chunk in resp.iter_content(chunk_size=8192):
                    f.write(chunk)
                    bar.update(len(chunk))
        # Publish the file only after every chunk has been written.
        partial.replace(dest)
    finally:
        # No-op after a successful rename; removes the leftover otherwise.
        partial.unlink(missing_ok=True)
def download_osf_project(node_id: str, local_root: Path) -> list[dict]:
    """Download all files from an OSF node into ``local_root``.

    The remote folder structure is preserved below ``local_root``. Listing
    and per-file download errors are reported on stdout rather than raised,
    so one bad file does not abort the remaining downloads.

    Args:
        node_id: GUID of the OSF node to download.
        local_root: Directory that receives the files.

    Returns:
        The file records that were listed (whether or not each download
        succeeded), or an empty list if the listing itself failed.
    """
    print(f"\n📂 Fetching file list from OSF node: {node_id}")
    try:
        files = list_osf_files(node_id)
    except Exception as e:  # best-effort boundary: report and fall back
        print(f" ⚠️ Could not list files: {e}")
        # Point at the node that was actually requested, not a fixed project.
        print(f" → You may need to download manually from https://osf.io/{node_id}/")
        return []
    print(f" Found {len(files)} files")

    root = local_root.resolve()
    failed = 0
    for f in files:
        # strip leading slash from materialized path
        rel_path = f["path"].lstrip("/")
        dest = local_root / rel_path

        # A record may carry no numeric size; formatting it unguarded would
        # raise outside the try below and abort the whole run.
        size = f["size"]
        if isinstance(size, (int, float)):
            size_label = f"{size / 1024:.1f} KB"
        else:
            size_label = "size unknown"
        print(f" ↓ {rel_path} ({size_label})")

        # Paths come from the remote API; refuse anything that would resolve
        # outside local_root (e.g. via ".." segments).
        if not dest.resolve().is_relative_to(root):
            print(f" ⚠️ Failed: path escapes {local_root}")
            failed += 1
            continue

        try:
            download_file(f["download"], dest)
        except Exception as e:  # keep going with the remaining files
            print(f" ⚠️ Failed: {e}")
            failed += 1

    if failed:
        print(f" ⚠️ {failed} of {len(files)} files were not downloaded")
    return files
if __name__ == "__main__":
    # Script entry point: make sure the output directory exists, print a
    # banner, then attempt the download of the configured project.
    DATA_DIR.mkdir(exist_ok=True)

    rule = "=" * 60
    print(rule)
    print("OSF Ti-64 Fractography Dataset Downloader")
    print("Project: https://osf.io/gdwyb/")
    print(rule)

    downloaded = download_osf_project(OSF_PROJECT_ID, DATA_DIR)
    if downloaded:
        print(f"\n✅ Download complete. Files saved to: {DATA_DIR.resolve()}")
    else:
        # An empty result means nothing was listed; show the manual route.
        manual_steps = (
            "\n⚠️ Automatic download failed.",
            "Manual download steps:",
            " 1. Go to https://osf.io/gdwyb/",
            " 2. Click each sub-component (Lack of Fusion, Key Hole, All Defects)",
            " 3. Download the zip and extract into data/<subfolder>/",
            " Expected structure:",
            " data/lack_of_fusion/images/*.png (or .tif)",
            " data/lack_of_fusion/masks/*.png",
        )
        for line in manual_steps:
            print(line)