"""
Reconstruction script for CSI-4CAST datasets.

This script helps users reconstruct the original folder structure after downloading
datasets from the CSI-4CAST Hugging Face organization.

Usage:
    python reconstruction.py [--input-dir INPUT_DIR] [--output-dir OUTPUT_DIR]

If no arguments are provided, it will look for downloaded datasets in the current directory
and reconstruct the structure in a 'data' folder.
"""
|
|
| import argparse |
| import shutil |
| from pathlib import Path |
|
|
|
|
def create_directory_structure(base_path: Path):
    """Create the empty target folder layout beneath ``base_path``.

    The layout is ``stats``, ``test/regular``, ``test/generalization`` and
    ``train/regular``. Missing parents are created along the way and folders
    that already exist are left as they are. One line is printed per folder.
    """
    layout = ("stats", "test/regular", "test/generalization", "train/regular")

    for relative in layout:
        folder = base_path / relative
        folder.mkdir(parents=True, exist_ok=True)
        print(f"Created directory: {folder}")
|
|
def find_downloaded_datasets(input_dir: Path):
    """Group the downloaded dataset folders found in ``input_dir`` by category.

    Only immediate sub-directories are considered; plain files and folders
    whose names match no category are ignored.

    Args:
        input_dir: Directory that holds the downloaded dataset folders.

    Returns:
        A dict with the keys ``'stats'``, ``'test_regular'``,
        ``'test_generalization'`` and ``'train_regular'``. Each value is a
        list of matching folder names (not paths), sorted by name.

    Raises:
        OSError: Propagated from ``Path.iterdir`` when ``input_dir`` cannot
            be listed (for example, when it does not exist).
    """
    # Category key -> folder-name prefix. The prefixes are mutually
    # exclusive, so the order of this table does not affect the result.
    prefixes = (
        ('test_regular', "test_regular_"),
        ('test_generalization', "test_generalization_"),
        ('train_regular', "train_regular_"),
    )

    datasets = {
        'stats': [],
        'test_regular': [],
        'test_generalization': [],
        'train_regular': [],
    }

    # iterdir() yields entries in arbitrary order; sorting keeps the result
    # (and the order in which datasets are later processed) deterministic.
    for item in sorted(input_dir.iterdir(), key=lambda entry: entry.name):
        if not item.is_dir():
            continue

        if item.name == "stats":
            datasets['stats'].append(item.name)
            continue

        for category, prefix in prefixes:
            if item.name.startswith(prefix):
                datasets[category].append(item.name)
                break

    return datasets
|
|
def reconstruct_dataset(dataset_name: str, source_path: Path, target_path: Path, prefix_to_remove: str) -> bool:
    """Copy one downloaded dataset folder to its place in the reconstructed tree.

    The source folder is copied, not moved, so the download stays intact.

    Args:
        dataset_name: Name of the downloaded folder.
        source_path: Folder to copy from.
        target_path: Parent folder under which the copy is created.
        prefix_to_remove: Leading text stripped from ``dataset_name`` to get
            the destination folder name. An empty string, or a prefix the
            name does not start with, leaves the name unchanged.

    Returns:
        True when the folder was copied. False when it was skipped (the
        destination already exists, or nothing is left of the name after
        stripping the prefix) or when the copy failed.
    """
    # Strip the prefix only when it is really there; blindly slicing
    # len(prefix) characters would mangle a name that does not carry it.
    if prefix_to_remove and dataset_name.startswith(prefix_to_remove):
        original_name = dataset_name[len(prefix_to_remove):]
    else:
        original_name = dataset_name

    # An empty name would make the destination the parent folder itself.
    if not original_name:
        print(f"Warning: nothing left of {dataset_name} after removing prefix, skipping...")
        return False

    target_folder = target_path / original_name

    if target_folder.exists():
        print(f"Warning: {target_folder} already exists, skipping...")
        return False

    try:
        shutil.copytree(str(source_path), str(target_folder))
    except OSError as e:  # shutil.Error is a subclass of OSError
        # copytree may fail after copying part of the tree. The destination
        # did not exist before this call, so remove it; otherwise every later
        # run would skip the incomplete copy as "already exists".
        shutil.rmtree(str(target_folder), ignore_errors=True)
        print(f"Error reconstructing {dataset_name}: {e}")
        return False

    print(f"Reconstructed: {dataset_name} -> {target_folder}")
    return True
|
|
def main():
    """Command-line entry point: rebuild the CSI-4CAST folder layout.

    Parses ``--input-dir`` / ``--output-dir``, discovers the downloaded
    dataset folders, creates the empty target layout, copies every dataset
    to its destination and prints a summary. Exits with a usage error when
    the input directory does not exist.
    """
    parser = argparse.ArgumentParser(description="Reconstruct CSI-4CAST dataset folder structure")
    parser.add_argument("--input-dir", "-i", default=".",
                        help="Directory containing downloaded datasets (default: current directory)")
    parser.add_argument("--output-dir", "-o", default="data",
                        help="Output directory for reconstructed structure (default: 'data')")

    args = parser.parse_args()

    input_dir = Path(args.input_dir).resolve()
    output_dir = Path(args.output_dir).resolve()

    # Report a bad input directory as a usage error instead of letting the
    # directory listing fail with a traceback.
    if not input_dir.is_dir():
        parser.error(f"input directory does not exist or is not a directory: {input_dir}")

    print(f"Looking for datasets in: {input_dir}")
    print(f"Reconstructing structure in: {output_dir}")
    print()

    # Discover the downloads before creating the output skeleton, so that
    # freshly created folders are never mistaken for downloaded datasets when
    # the output directory is the input directory itself.
    datasets = find_downloaded_datasets(input_dir)

    create_directory_structure(output_dir)

    # (category key, destination parent, prefix stripped from the folder name)
    # NOTE(review): a downloaded "stats" folder ends up at <output>/stats/stats
    # because reconstruct_dataset appends the folder name to the destination
    # parent - confirm that this nesting is intended.
    plan = (
        ('stats', output_dir / "stats", ""),
        ('test_regular', output_dir / "test" / "regular", "test_regular_"),
        ('test_generalization', output_dir / "test" / "generalization", "test_generalization_"),
        ('train_regular', output_dir / "train" / "regular", "train_regular_"),
    )

    total_reconstructed = 0
    for category, target_path, prefix in plan:
        for dataset in datasets[category]:
            source_path = input_dir / dataset
            if reconstruct_dataset(dataset, source_path, target_path, prefix):
                total_reconstructed += 1

    print()
    print("✅ Reconstruction complete!")
    print(f"Total datasets reconstructed: {total_reconstructed}")
    print(f"Reconstructed structure available at: {output_dir}")
    print()
    print("Final structure:")
    # Show the real output folder name rather than a hard-coded "data/".
    print(f"{output_dir.name}/")
    print("├── stats/")
    print("├── test/")
    print("│   ├── regular/")
    print("│   └── generalization/")
    print("└── train/")
    print("    └── regular/")
|
|
# Run the reconstruction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|