Spaces:

anasraza526
/

customeragent-api

Runtime error

App Files Files Community

customeragent-api / server /scripts /fetch_datasets.py

anasraza526

Clean deploy to Hugging Face

ac90985 20 days ago

raw

history blame contribute delete

2.42 kB

	import kagglehub
	import os
	import json
	import logging
	import pandas as pd
	from typing import List, Dict

	# Logging: timestamped, level-tagged messages at INFO and above
	logging.basicConfig(
	    format='%(asctime)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)
	logger = logging.getLogger(__name__)

	# Directory layout, anchored at the folder that contains this script
	BASE_DIR = os.path.dirname(os.path.abspath(__file__))
	DATASETS_DIR = os.path.join(BASE_DIR, "datasets")
	RAW_DIR, PROCESSED_DIR = (
	    os.path.join(DATASETS_DIR, leaf) for leaf in ("raw", "processed")
	)

	# Create both output folders up front; existing folders are left alone
	os.makedirs(RAW_DIR, exist_ok=True)
	os.makedirs(PROCESSED_DIR, exist_ok=True)

	class MedicalDataFetcher:
	    """Fetch medical datasets from Kaggle via kagglehub and check local copies.

	    Attributes:
	        dataset_map: Maps a short dataset name to the Kaggle dataset
	            identifier that is passed to ``kagglehub.dataset_download``.
	    """

	    # Datasets used when the caller does not supply its own mapping.
	    DEFAULT_DATASETS = {
	        "medquad": "vishweshsalodkar/medquad-medical-q-a-dataset",
	        "healthtap": "itachi9604/healthtap-dataset",  # Community curated
	        "disease_symptoms": "kaushil268/disease-prediction-using-machine-learning",
	    }

	    def __init__(self, dataset_map=None):
	        """Create a fetcher.

	        Args:
	            dataset_map: Optional mapping of short name -> Kaggle dataset
	                id. When omitted (``None``), ``DEFAULT_DATASETS`` is used.
	        """
	        source = self.DEFAULT_DATASETS if dataset_map is None else dataset_map
	        # Copy so edits to one instance never leak into the class default
	        # or into the caller's dictionary.
	        self.dataset_map = dict(source)

	    def fetch_all(self) -> Dict[str, str]:
	        """Download every dataset listed in ``dataset_map``.

	        A failed download is logged and skipped, so the remaining datasets
	        are still attempted.

	        Returns:
	            Mapping of dataset name to the path returned by
	            ``kagglehub.dataset_download``, for the downloads that succeeded.
	        """
	        results = {}
	        for name, kg_id in self.dataset_map.items():
	            logger.info("Fetching %s from Kaggle: %s...", name, kg_id)
	            try:
	                path = kagglehub.dataset_download(kg_id)
	            except Exception as e:
	                # Deliberate best-effort: one bad dataset must not abort
	                # the whole run.
	                logger.error("✗ Failed to fetch %s: %s", name, e)
	                continue
	            results[name] = path
	            logger.info("✓ %s available at %s", name, path)
	        return results

	    def verify_local_data(self) -> Dict[str, bool]:
	        """Check whether the expected local data files exist.

	        Each file is looked up under ``DATASETS_DIR`` and the outcome is
	        logged (info when found, warning when missing).

	        Returns:
	            Mapping of local data label to ``True`` if its file exists,
	            ``False`` otherwise.
	        """
	        paths = {
	            "medquad_local": os.path.join(DATASETS_DIR, "medquad", "sample_data.json"),
	            "symcat_local": os.path.join(DATASETS_DIR, "symcat", "symptoms.json"),
	        }
	        found = {}
	        for name, path in paths.items():
	            present = os.path.exists(path)
	            found[name] = present
	            if present:
	                logger.info("✓ Found local data: %s at %s", name, path)
	            else:
	                logger.warning("! Missing local data: %s", name)
	        return found

	if __name__ == "__main__":
	    # Only the local-data check runs here. fetch_all() is intentionally
	    # not invoked from this entry point, so any missing data has to be
	    # downloaded separately where Kaggle access is available.
	    MedicalDataFetcher().verify_local_data()
	    for line in (
	        "\n--- Summary ---",
	        "Fetch script ready. Run this locally to populate 'datasets/raw' if data is missing.",
	    ):
	        print(line)