# Non-code text that precedes the script in this capture (not part of the code):
#   Spaces: Runtime error / Runtime error
import json
import logging
import os
from typing import Dict, List, Optional

import kagglehub
import pandas as pd

# Setup logging
# NOTE: this configures the root logger as an import-time side effect; every
# message emitted through ``logger`` below uses this timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Paths
# All dataset locations are anchored at the directory that contains this file,
# so the script behaves the same regardless of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATASETS_DIR = os.path.join(BASE_DIR, "datasets")
RAW_DIR = os.path.join(DATASETS_DIR, "raw")
PROCESSED_DIR = os.path.join(DATASETS_DIR, "processed")

# Create the output directories up front; exist_ok makes repeated runs safe.
os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(PROCESSED_DIR, exist_ok=True)


class MedicalDataFetcher:
    """Download medical datasets through kagglehub and check for local copies.

    Only Kaggle-hosted datasets (listed in ``dataset_map``) are fetched by the
    code in this class; no other source is contacted here.
    """

    def __init__(self) -> None:
        """Set up the mapping of dataset names to Kaggle dataset handles."""
        # Short dataset name -> handle passed to kagglehub.dataset_download().
        self.dataset_map: Dict[str, str] = {
            "medquad": "vishweshsalodkar/medquad-medical-q-a-dataset",
            "healthtap": "itachi9604/healthtap-dataset",  # Community curated
            "disease_symptoms": "kaushil268/disease-prediction-using-machine-learning",
        }

    def fetch_all(self) -> Dict[str, str]:
        """Fetch every dataset in ``dataset_map``.

        Each download is best-effort: a failure is logged and the remaining
        datasets are still attempted.

        Returns:
            Mapping of dataset name to the value returned by
            ``kagglehub.dataset_download`` for each dataset that succeeded.
            Datasets that failed are absent from the mapping.
        """
        results: Dict[str, str] = {}
        for name, kg_id in self.dataset_map.items():
            logger.info(f"Fetching {name} from Kaggle: {kg_id}...")
            try:
                # Keep the try body to the one call that is expected to fail
                # (network, credentials, missing dataset, ...).
                path = kagglehub.dataset_download(kg_id)
            except Exception as e:
                # Deliberately broad: one bad dataset must not abort the rest.
                logger.error(f"✗ Failed to fetch {name}: {e}")
                continue
            results[name] = path
            logger.info(f"✓ {name} available at {path}")
        return results

    def verify_local_data(
        self, paths: Optional[Dict[str, str]] = None
    ) -> Dict[str, bool]:
        """Check whether expected local data files exist and log the outcome.

        Args:
            paths: Optional mapping of a label to the filesystem path to check.
                When omitted, the standard MedQuAD and SymCat locations under
                ``DATASETS_DIR`` are checked.

        Returns:
            Mapping of each label to ``True`` if its path exists, else ``False``.
        """
        if paths is None:
            paths = {
                "medquad_local": os.path.join(DATASETS_DIR, "medquad", "sample_data.json"),
                "symcat_local": os.path.join(DATASETS_DIR, "symcat", "symptoms.json"),
            }

        status: Dict[str, bool] = {}
        for name, path in paths.items():
            exists = os.path.exists(path)
            status[name] = exists
            if exists:
                logger.info(f"✓ Found local data: {name} at {path}")
            else:
                logger.warning(f"! Missing local data: {name}")
        return status


def main() -> None:
    """Report which local data files are present and print a usage summary.

    This entry point only verifies local data; it does not call
    ``MedicalDataFetcher.fetch_all()``.
    """
    fetcher = MedicalDataFetcher()
    fetcher.verify_local_data()
    # In a real environment, we'd run fetch_all() but since we are in a sandbox
    # we will prioritize processing what's available and providing placeholders
    # for the user to populate if Kaggle is restricted.
    # NOTE(review): nothing in the visible code writes into 'datasets/raw';
    # confirm where downloaded files are expected to end up.
    print("\n--- Summary ---")
    print("Fetch script ready. Run this locally to populate 'datasets/raw' if data is missing.")


if __name__ == "__main__":
    main()