# Source: customeragent-api/server/scripts/prepare_healthcare_data.py
# Last commit: "Clean deploy to Hugging Face" (ac90985) by anasraza526
import kagglehub
import pandas as pd
import json
import os
import re
from typing import Dict, List
# Filesystem layout: BASE_DIR is the directory one level above the folder that
# holds this script; raw downloads and processed indexes live under
# BASE_DIR/datasets.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
BASE_DIR = os.path.dirname(_SCRIPT_DIR)
RAW_DIR = os.path.join(BASE_DIR, "datasets", "raw")
PROCESSED_DIR = os.path.join(BASE_DIR, "datasets", "processed")

# Create both directories at import time so later writes never hit a missing folder.
for _data_dir in (RAW_DIR, PROCESSED_DIR):
    os.makedirs(_data_dir, exist_ok=True)
def download_datasets():
    """Download the MedQuAD and symptom datasets through ``kagglehub``.

    Each dataset has an ordered list of candidate Kaggle dataset IDs. The
    candidates are tried in order; the first one that downloads without
    raising is kept and the remaining candidates are skipped. A failure is
    printed and the next candidate is tried.

    Returns:
        A ``(medquad_path, symptoms_path)`` tuple. An element is ``None``
        when every candidate for that dataset failed.
    """
    candidates_by_name = {
        "medquad": [
            "vishweshsalodkar/medquad-medical-q-a-dataset",
            "itachi9604/medquad-dataset",
        ],
        "symptoms": [
            "kaushil268/disease-prediction-using-machine-learning",
            "itachi9604/disease-symptom-description-dataset",
        ],
    }
    resolved = {"medquad": None, "symptoms": None}

    for name in candidates_by_name:
        for candidate_id in candidates_by_name[name]:
            try:
                print(f"Attempting to download {name} from {candidate_id}...")
                location = kagglehub.dataset_download(candidate_id)
                resolved[name] = location
                print(f"Successfully downloaded {name} to {location}")
            except Exception as error:
                # Best effort: report the failure and fall through to the
                # next candidate ID for this dataset.
                print(f"Failed to download {candidate_id}: {error}")
            else:
                break  # first successful candidate wins

    return resolved["medquad"], resolved["symptoms"]
def generate_professional_fallback():
    """Write small built-in index files for any index that does not exist yet.

    Two files under ``PROCESSED_DIR`` are considered, in this order:
    ``symcat_index.json`` (symptom -> list of conditions) and
    ``medquad_index.json`` (list of question/answer records). A file that
    already exists is left untouched, so real processed data is never
    overwritten by the fallback.
    """
    print("Generating professional fallback data...")

    # Symptom -> candidate conditions, used when no symptom dataset was processed.
    symptom_defaults = {
        "chest pain": ["Myocardial Infarction", "Angina", "Pericarditis"],
        "fever": ["Infection", "Influenza", "COVID-19"],
        "headache": ["Migraine", "Tension Headache", "Cluster Headache"],
        "shortness of breath": ["Dyspnea", "Asthma", "Pulmonary Embolism"],
        "sore throat": ["Pharyngitis", "Tonsillitis"],
        "fatigue": ["Anemia", "Chronic Fatigue Syndrome", "Diabetes"],
        "nausea": ["Gastroenteritis", "Food Poisoning"],
        "abdominal pain": ["Appendicitis", "Gastritis", "IBS"],
    }

    # Minimal Q&A set, used when no MedQuAD dataset was processed.
    qa_defaults = [
        {
            "question": "What is Diabetes?",
            "answer": "Diabetes is a chronic disease that occurs when the pancreas is no longer able to make insulin, or when the body cannot make good use of the insulin it produces.",
            "source": "Fallback",
        },
        {
            "question": "Symptoms of Influenza?",
            "answer": "Common symptoms include fever, cough, sore throat, muscle aches, headache, fatigue, and runny or stuffy nose.",
            "source": "Fallback",
        },
        {
            "question": "How to treat a fever?",
            "answer": "Rest, drink plenty of fluids, and use over-the-counter medications like acetaminophen or ibuprofen if needed. Consult a doctor for high or persistent fever.",
            "source": "Fallback",
        },
    ]

    fallbacks = (
        ("symcat_index.json", symptom_defaults),
        ("medquad_index.json", qa_defaults),
    )
    for filename, payload in fallbacks:
        target = os.path.join(PROCESSED_DIR, filename)
        if os.path.exists(target):
            continue  # keep whatever is already there
        with open(target, "w") as handle:
            json.dump(payload, handle, indent=2)
def _find_symptom_csv(symptom_path):
    """Return the path of the CSV to process inside *symptom_path*, or None."""
    for name in ('dataset.csv', 'dataset_symptoms.csv', 'symptoms_df.csv'):
        candidate = os.path.join(symptom_path, name)
        if os.path.exists(candidate):
            return candidate
    # os.listdir order is arbitrary; sort so the same file is picked every run.
    csv_files = sorted(f for f in os.listdir(symptom_path) if f.endswith('.csv'))
    if csv_files:
        return os.path.join(symptom_path, csv_files[0])
    return None


def _symptom_key(raw):
    """Normalize a raw symptom token: underscores to spaces, whitespace runs collapsed."""
    return re.sub(r"\s+", " ", str(raw).replace('_', ' ')).strip()


def _add_disease(mapping, symptom, disease):
    """Record *disease* under *symptom* once, keeping first-seen order."""
    diseases = mapping.setdefault(symptom, [])
    if disease not in diseases:
        diseases.append(disease)


def _indicator_layout(df):
    """Return ``(label_column, flag_columns)`` if *df* is a 0/1 indicator matrix, else None.

    The layout is recognised when exactly one column is non-numeric (the
    disease label) and every other column is numeric and holds only 0, 1 or
    missing values.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    label_cols = [c for c in df.columns if not is_numeric(df[c])]
    flag_cols = [c for c in df.columns if is_numeric(df[c])]
    if len(label_cols) != 1 or not flag_cols:
        return None
    flags = df[flag_cols]
    if not ((flags == 0) | (flags == 1) | flags.isna()).all().all():
        return None
    return label_cols[0], flag_cols


def _build_symptom_index(df):
    """Build ``{symptom: [disease, ...]}`` from either supported CSV layout."""
    mapping = {}

    layout = _indicator_layout(df)
    if layout is not None:
        # One column per symptom, 1 meaning "present"; the column NAME is the symptom.
        label_col, flag_cols = layout
        labels = df[label_col]
        for col in flag_cols:
            symptom = _symptom_key(col)
            if not symptom:
                continue
            for raw_disease in labels[df[col] == 1].dropna().unique():
                disease = str(raw_disease).strip()
                if disease:
                    _add_disease(mapping, symptom, disease)
        return mapping

    # List layout: disease in the first column, symptom names in the remaining cells.
    for disease_cell, *symptom_cells in df.itertuples(index=False, name=None):
        if pd.isna(disease_cell):
            continue  # a row without a disease would otherwise be indexed as "nan"
        disease = str(disease_cell).strip()
        if not disease:
            continue
        for cell in symptom_cells:
            if pd.isna(cell):
                continue
            symptom = _symptom_key(cell)
            if not symptom or symptom.lower() == 'nan':
                continue
            _add_disease(mapping, symptom, disease)
    return mapping


def process_symptoms(symptom_path: str):
    """Build ``symcat_index.json`` (symptom -> diseases) from a downloaded dataset.

    The CSV is chosen by trying a few well-known file names and then falling
    back to the alphabetically first ``*.csv`` in *symptom_path*. Two layouts
    are understood:

    * list layout - disease in the first column, symptom names in the other
      cells of the row;
    * indicator layout - one numeric 0/1 column per symptom plus a single
      text column holding the disease.
      NOTE(review): presumably needed for the
      "kaushil268/disease-prediction-using-machine-learning" candidate listed
      in download_datasets, which appears to ship this layout - confirm
      against the actual download.

    Symptom names have underscores replaced by spaces and whitespace runs
    collapsed. Nothing is written when no symptom/disease pair is found, so
    an existing or fallback index is not replaced by an empty one.

    Args:
        symptom_path: Directory containing the dataset files. A falsy value
            makes the function return immediately.

    Returns:
        None. The result is written to ``PROCESSED_DIR/symcat_index.json``.
    """
    if not symptom_path:
        return
    print("Processing symptom-disease mapping...")

    found_file = _find_symptom_csv(symptom_path)
    if not found_file:
        print("No suitable CSV found in symptom dataset")
        return

    try:
        df = pd.read_csv(found_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # Best effort, like the download step: report and let the fallback data apply.
        print(f"Could not read symptom CSV {found_file}: {exc}")
        return

    mapping = _build_symptom_index(df)
    if not mapping:
        print("No symptom-disease pairs found; symptom index not written.")
        return

    with open(os.path.join(PROCESSED_DIR, "symcat_index.json"), "w", encoding="utf-8") as f:
        json.dump(mapping, f, indent=2)
    print(f"Symptom index saved with {len(mapping)} symptoms.")
def _pick_column(columns, keyword):
    """Return the column whose name best matches *keyword*, or None.

    An exact, case-insensitive name match wins; otherwise the first column
    whose name merely contains *keyword* is used. This keeps a column such
    as ``question_type`` from shadowing the real ``question`` column.
    """
    # str() guards against non-string column labels.
    lowered = [(col, str(col).strip().lower()) for col in columns]
    for col, name in lowered:
        if name == keyword:
            return col
    for col, name in lowered:
        if keyword in name:
            return col
    return None


def process_medquad(medquad_path: str):
    """Build ``medquad_index.json`` (list of Q&A records) from a downloaded dataset.

    The alphabetically first ``*.csv`` in *medquad_path* is read. The
    question and answer columns are located by name; rows where either value
    is missing are skipped. Each kept row becomes
    ``{"question": ..., "answer": ..., "source": "MedQuAD"}``.

    Nothing is written when no CSV, no matching columns, or no complete Q&A
    row is found, so an existing or fallback index is not replaced by an
    empty one.

    Args:
        medquad_path: Directory containing the dataset files. A falsy value
            makes the function return immediately.

    Returns:
        None. The result is written to ``PROCESSED_DIR/medquad_index.json``.
    """
    if not medquad_path:
        return
    print("Processing MedQuAD Q&A...")

    # os.listdir order is arbitrary; sort so the same file is picked every run.
    csv_files = sorted(f for f in os.listdir(medquad_path) if f.endswith('.csv'))
    if not csv_files:
        print("No CSV found in MedQuAD dataset")
        return

    csv_path = os.path.join(medquad_path, csv_files[0])
    try:
        df = pd.read_csv(csv_path)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # Best effort, like the download step: report and let the fallback data apply.
        print(f"Could not read MedQuAD CSV {csv_path}: {exc}")
        return

    q_col = _pick_column(df.columns, 'question')
    a_col = _pick_column(df.columns, 'answer')
    if q_col is None or a_col is None:
        print("No question/answer columns found in MedQuAD dataset")
        return

    # Keep only rows where both the question and the answer are present.
    complete = df[q_col].notna() & df[a_col].notna()
    qa_data = [
        {"question": str(question), "answer": str(answer), "source": "MedQuAD"}
        for question, answer in zip(df.loc[complete, q_col], df.loc[complete, a_col])
    ]
    if not qa_data:
        print("No Q&A pairs found; MedQuAD index not written.")
        return

    with open(os.path.join(PROCESSED_DIR, "medquad_index.json"), "w", encoding="utf-8") as f:
        json.dump(qa_data, f, indent=2)
    print(f"MedQuAD index saved with {len(qa_data)} Q&A pairs.")
def create_normalization_map():
    """Write the symptom synonym table to ``normalization_map.json``.

    The table maps a canonical symptom name to a list of alternate phrasings
    (some entries appear to be transliterated non-English terms). The file is
    written under ``PROCESSED_DIR`` and replaces any existing copy.
    """
    print("Creating normalization map...")

    # Canonical symptom -> alternate phrasings that should resolve to it.
    synonyms_by_symptom = {
        "headache": ["migraine", "head pain", "throbbing head", "sar dard"],
        "fever": ["high temperature", "feverish", "bukhar", "tap"],
        "cough": ["coughing", "dry cough", "khansi"],
        "chest pain": ["pain in chest", "chest tightness", "seenay mein dard"],
        "shortness of breath": [
            "difficulty breathing",
            "breathless",
            "saans lene mein dushwari",
            "saans phoolna",
        ],
        "sore throat": ["throat pain", "gala kharab", "gulay mein dard"],
        "fatigue": ["tiredness", "exhaustion", "weakness", "thakawat", "kamzori"],
        "nausea": ["feeling sick", "vomit sensation", "matli"],
        "diarrhea": ["loose motions", "paichish"],
        "abdominal pain": ["stomach ache", "belly pain", "pait mein dard"],
    }

    destination = os.path.join(PROCESSED_DIR, "normalization_map.json")
    serialized = json.dumps(synonyms_by_symptom, indent=2)
    with open(destination, "w") as out:
        out.write(serialized)

    print("Normalization map saved.")
if __name__ == "__main__":
    # Either path may be None when every download candidate failed; the
    # matching processing step then returns without doing anything.
    medquad_dir, symptoms_dir = download_datasets()
    process_symptoms(symptoms_dir)
    process_medquad(medquad_dir)

    # Runs after processing so it only fills in index files that are still missing.
    generate_professional_fallback()
    create_normalization_map()
    print("Preprocessing complete!")