import os
import json
import logging
import pandas as pd
from typing import List, Dict

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Paths — all dataset storage is anchored next to this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATASETS_DIR = os.path.join(BASE_DIR, "datasets")
RAW_DIR = os.path.join(DATASETS_DIR, "raw")
PROCESSED_DIR = os.path.join(DATASETS_DIR, "processed")

os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(PROCESSED_DIR, exist_ok=True)


class MedicalDataFetcher:
    """Fetches professional medical datasets using kagglehub and public sources."""

    def __init__(self):
        # Mapping of short dataset name -> Kaggle dataset identifier.
        self.dataset_map = {
            "medquad": "vishweshsalodkar/medquad-medical-q-a-dataset",
            "healthtap": "itachi9604/healthtap-dataset",  # Community curated
            "disease_symptoms": "kaushil268/disease-prediction-using-machine-learning"
        }

    def fetch_all(self) -> Dict[str, str]:
        """Fetch all mapped datasets.

        Returns:
            Mapping of dataset name -> local download path for every dataset
            that was fetched successfully. Failures are logged and skipped,
            so the result may contain fewer entries than ``dataset_map``.
        """
        # Imported lazily so the rest of the script (e.g. verify_local_data)
        # works in environments where kagglehub is unavailable or restricted.
        import kagglehub

        results: Dict[str, str] = {}
        for name, kg_id in self.dataset_map.items():
            try:
                # Lazy %-style args: the message is only formatted if emitted.
                logger.info("Fetching %s from Kaggle: %s...", name, kg_id)
                path = kagglehub.dataset_download(kg_id)
                results[name] = path
                logger.info("✓ %s available at %s", name, path)
            except Exception as e:
                # Best-effort: one failed dataset must not abort the others.
                logger.error("✗ Failed to fetch %s: %s", name, e)
        return results

    def verify_local_data(self) -> None:
        """Check for existing data in standard paths, logging what is found/missing."""
        paths = {
            "medquad_local": os.path.join(DATASETS_DIR, "medquad", "sample_data.json"),
            "symcat_local": os.path.join(DATASETS_DIR, "symcat", "symptoms.json")
        }
        for name, path in paths.items():
            if os.path.exists(path):
                logger.info("✓ Found local data: %s at %s", name, path)
            else:
                logger.warning("! Missing local data: %s", name)


if __name__ == "__main__":
    fetcher = MedicalDataFetcher()
    fetcher.verify_local_data()
    # In a real environment, we'd run fetch_all() but since we are in a sandbox
    # we will prioritize processing what's available and providing placeholders
    # for the user to populate if Kaggle is restricted.
    print("\n--- Summary ---")
    print("Fetch script ready. Run this locally to populate 'datasets/raw' if data is missing.")