Spaces:

anasraza526
/

customeragent-api

Runtime error

App Files Files Community

customeragent-api / server /scripts /prepare_healthcare_data.py

anasraza526

Clean deploy to Hugging Face

ac90985 23 days ago

raw

history blame contribute delete

6.96 kB

	import kagglehub
	import pandas as pd
	import json
	import os
	import re
	from typing import Dict, List

	# Directory layout, resolved relative to the parent of this script's folder.
	BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	RAW_DIR, PROCESSED_DIR = (
	    os.path.join(BASE_DIR, "datasets", leaf) for leaf in ("raw", "processed")
	)

	# Create both directories at import time; no error if they already exist.
	os.makedirs(PROCESSED_DIR, exist_ok=True)
	os.makedirs(RAW_DIR, exist_ok=True)

	def download_datasets():
	    """Download one dataset per category, trying candidate IDs in order.

	    For each category ("medquad", "symptoms") the candidate dataset IDs are
	    passed to ``kagglehub.dataset_download`` one after another; the first
	    call that does not raise wins and the remaining candidates are skipped.
	    Failures are printed and the next candidate is tried.

	    Returns:
	        A ``(medquad_path, symptoms_path)`` tuple holding the values returned
	        by ``kagglehub.dataset_download``. An entry is ``None`` when every
	        candidate for that category failed.
	    """
	    sources = {
	        "medquad": [
	            "vishweshsalodkar/medquad-medical-q-a-dataset",
	            "itachi9604/medquad-dataset",
	        ],
	        "symptoms": [
	            "kaushil268/disease-prediction-using-machine-learning",
	            "itachi9604/disease-symptom-description-dataset",
	        ],
	    }

	    # Start with every category unresolved (None).
	    located = dict.fromkeys(sources)

	    for name, candidates in sources.items():
	        for candidate in candidates:
	            try:
	                print(f"Attempting to download {name} from {candidate}...")
	                local_dir = kagglehub.dataset_download(candidate)
	                located[name] = local_dir
	                print(f"Successfully downloaded {name} to {local_dir}")
	            except Exception as err:
	                # Best-effort: report and move on to the next candidate.
	                print(f"Failed to download {candidate}: {err}")
	                continue
	            # First success for this category; skip remaining candidates.
	            break

	    return located["medquad"], located["symptoms"]

	def generate_professional_fallback():
	    """Write built-in fallback index files for any index that is missing.

	    Creates ``symcat_index.json`` (symptom -> list of conditions) and
	    ``medquad_index.json`` (list of question/answer/source records) inside
	    ``PROCESSED_DIR``, but only for files that do not already exist there.
	    Existing files are left untouched.
	    """
	    print("Generating professional fallback data...")

	    # Symptom index fallback: symptom name -> candidate conditions.
	    symptom_index = {
	        "chest pain": ["Myocardial Infarction", "Angina", "Pericarditis"],
	        "fever": ["Infection", "Influenza", "COVID-19"],
	        "headache": ["Migraine", "Tension Headache", "Cluster Headache"],
	        "shortness of breath": ["Dyspnea", "Asthma", "Pulmonary Embolism"],
	        "sore throat": ["Pharyngitis", "Tonsillitis"],
	        "fatigue": ["Anemia", "Chronic Fatigue Syndrome", "Diabetes"],
	        "nausea": ["Gastroenteritis", "Food Poisoning"],
	        "abdominal pain": ["Appendicitis", "Gastritis", "IBS"],
	    }

	    # MedQuAD fallback: a handful of hand-written Q&A records.
	    qa_records = [
	        {
	            "question": "What is Diabetes?",
	            "answer": "Diabetes is a chronic disease that occurs when the pancreas is no longer able to make insulin, or when the body cannot make good use of the insulin it produces.",
	            "source": "Fallback",
	        },
	        {
	            "question": "Symptoms of Influenza?",
	            "answer": "Common symptoms include fever, cough, sore throat, muscle aches, headache, fatigue, and runny or stuffy nose.",
	            "source": "Fallback",
	        },
	        {
	            "question": "How to treat a fever?",
	            "answer": "Rest, drink plenty of fluids, and use over-the-counter medications like acetaminophen or ibuprofen if needed. Consult a doctor for high or persistent fever.",
	            "source": "Fallback",
	        },
	    ]

	    # Same write-if-missing rule for both files, symptom index first.
	    for filename, payload in (
	        ("symcat_index.json", symptom_index),
	        ("medquad_index.json", qa_records),
	    ):
	        target = os.path.join(PROCESSED_DIR, filename)
	        if os.path.exists(target):
	            continue
	        with open(target, "w") as out:
	            json.dump(payload, out, indent=2)

	def process_symptoms(symptom_path: str):
	    """Build the symptom -> diseases index from a downloaded symptom dataset.

	    A CSV is selected from ``symptom_path`` (well-known names first, then the
	    alphabetically first ``*.csv``). Each row is read as: disease in the
	    first column, symptom names in the remaining cells. Underscores in
	    symptom names are replaced by spaces. The result is written to
	    ``symcat_index.json`` in ``PROCESSED_DIR``.

	    Nothing is written when no CSV is found or when no symptom/disease pair
	    could be extracted, so an existing index is not replaced by an empty one
	    and a missing index stays missing for a later fallback step to fill.

	    NOTE(review): the row layout above is assumed for every dataset; a CSV
	    with one indicator column per symptom and the disease in the last column
	    would not be interpreted correctly -- confirm against the datasets that
	    are actually downloaded.

	    Args:
	        symptom_path: Directory containing the dataset files. A falsy value
	            (``None`` or ``""``) makes the function return immediately.

	    Returns:
	        None.
	    """
	    if not symptom_path:
	        return
	    print("Processing symptom-disease mapping...")

	    # Prefer well-known file names for symptom CSVs.
	    target_files = ['dataset.csv', 'dataset_symptoms.csv', 'symptoms_df.csv']
	    found_file = None
	    for name in target_files:
	        candidate = os.path.join(symptom_path, name)
	        if os.path.exists(candidate):
	            found_file = candidate
	            break

	    if not found_file:
	        # Sorted so the pick does not depend on os.listdir()'s arbitrary order.
	        csv_files = sorted(
	            f for f in os.listdir(symptom_path) if f.endswith('.csv')
	        )
	        if csv_files:
	            found_file = os.path.join(symptom_path, csv_files[0])

	    if not found_file:
	        print("No suitable CSV found in symptom dataset")
	        return

	    df = pd.read_csv(found_file)
	    mapping = {}

	    # Plain tuples give purely positional access: [0] disease, [1:] symptoms.
	    for record in df.itertuples(index=False, name=None):
	        if not record or pd.isna(record[0]):
	            # A row without a disease would otherwise be indexed under "nan".
	            continue
	        disease = str(record[0]).strip()
	        if not disease:
	            continue

	        for val in record[1:]:
	            if pd.isna(val):
	                continue
	            symptom = str(val).strip().replace('_', ' ')
	            # Skip blank cells and literal "nan" text left over in the data.
	            if not symptom or symptom.lower() == 'nan':
	                continue
	            diseases = mapping.setdefault(symptom, [])
	            if disease not in diseases:
	                diseases.append(disease)

	    if not mapping:
	        # Writing an empty index would make the file exist and thereby
	        # suppress any "create if missing" fallback.
	        print("No symptom-disease pairs found; symptom index not written.")
	        return

	    with open(
	        os.path.join(PROCESSED_DIR, "symcat_index.json"), "w", encoding="utf-8"
	    ) as f:
	        json.dump(mapping, f, indent=2)
	    print(f"Symptom index saved with {len(mapping)} symptoms.")

	def process_medquad(medquad_path: str):
	    """Build the MedQuAD question/answer index from a downloaded dataset.

	    The alphabetically first ``*.csv`` in ``medquad_path`` is loaded. The
	    question and answer columns are the first columns whose names contain
	    "question" and "answer" (case-insensitive). Rows where either value is
	    missing are dropped. The remaining pairs are written to
	    ``medquad_index.json`` in ``PROCESSED_DIR`` with source "MedQuAD".

	    Nothing is written when no CSV is found, the columns cannot be
	    identified, or no complete pair exists, so an existing index is not
	    replaced by an empty one and a missing index stays missing for a later
	    fallback step to fill.

	    Args:
	        medquad_path: Directory containing the dataset files. A falsy value
	            (``None`` or ``""``) makes the function return immediately.

	    Returns:
	        None.
	    """
	    if not medquad_path:
	        return
	    print("Processing MedQuAD Q&A...")

	    # Sorted so the pick does not depend on os.listdir()'s arbitrary order.
	    csv_files = sorted(f for f in os.listdir(medquad_path) if f.endswith('.csv'))
	    if not csv_files:
	        print("No CSV found in MedQuAD dataset")
	        return

	    df = pd.read_csv(os.path.join(medquad_path, csv_files[0]))

	    # Identify Q and A columns by name.
	    q_col = next((c for c in df.columns if 'question' in c.lower()), None)
	    a_col = next((c for c in df.columns if 'answer' in c.lower()), None)
	    if q_col is None or a_col is None:
	        print("No question/answer columns found in MedQuAD CSV; index not written.")
	        return

	    # Keep only rows where both the question and the answer are present.
	    complete = df.dropna(subset=[q_col, a_col])
	    qa_data = [
	        {"question": str(question), "answer": str(answer), "source": "MedQuAD"}
	        for question, answer in zip(complete[q_col], complete[a_col])
	    ]

	    if not qa_data:
	        # Writing an empty index would make the file exist and thereby
	        # suppress any "create if missing" fallback.
	        print("No complete Q&A pairs found; MedQuAD index not written.")
	        return

	    with open(
	        os.path.join(PROCESSED_DIR, "medquad_index.json"), "w", encoding="utf-8"
	    ) as f:
	        json.dump(qa_data, f, indent=2)
	    print(f"MedQuAD index saved with {len(qa_data)} Q&A pairs.")

	def create_normalization_map():
	    """Write the symptom normalization map to ``normalization_map.json``.

	    The map goes from a canonical symptom name to a list of alternative
	    phrasings (including some non-English terms). The file is written into
	    ``PROCESSED_DIR`` and is overwritten on every call.
	    """
	    print("Creating normalization map...")

	    # Canonical symptom -> alternative phrasings for the same symptom.
	    synonyms_by_symptom = {
	        "headache": ["migraine", "head pain", "throbbing head", "sar dard"],
	        "fever": ["high temperature", "feverish", "bukhar", "tap"],
	        "cough": ["coughing", "dry cough", "khansi"],
	        "chest pain": ["pain in chest", "chest tightness", "seenay mein dard"],
	        "shortness of breath": [
	            "difficulty breathing",
	            "breathless",
	            "saans lene mein dushwari",
	            "saans phoolna",
	        ],
	        "sore throat": ["throat pain", "gala kharab", "gulay mein dard"],
	        "fatigue": ["tiredness", "exhaustion", "weakness", "thakawat", "kamzori"],
	        "nausea": ["feeling sick", "vomit sensation", "matli"],
	        "diarrhea": ["loose motions", "paichish"],
	        "abdominal pain": ["stomach ache", "belly pain", "pait mein dard"],
	    }

	    destination = os.path.join(PROCESSED_DIR, "normalization_map.json")
	    serialized = json.dumps(synonyms_by_symptom, indent=2)
	    with open(destination, "w") as out:
	        out.write(serialized)
	    print("Normalization map saved.")

	if __name__ == "__main__":
	    # Fetch the raw datasets; either path is None when its downloads all failed.
	    medquad_dir, symptoms_dir = download_datasets()

	    # Build the indexes from whatever was downloaded (symptoms first).
	    process_symptoms(symptoms_dir)
	    process_medquad(medquad_dir)

	    # Ensure something is there: fill in any index file that does not exist.
	    generate_professional_fallback()

	    create_normalization_map()
	    print("Preprocessing complete!")