Spaces:
Runtime error
Runtime error
| import kagglehub | |
| import pandas as pd | |
| import json | |
| import os | |
| import re | |
| from typing import Dict, List | |
# Project layout: BASE_DIR is the parent of the directory holding this file,
# and both data directories live under BASE_DIR/datasets.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
BASE_DIR = os.path.dirname(_SCRIPT_DIR)
_DATASETS_DIR = os.path.join(BASE_DIR, "datasets")
RAW_DIR = os.path.join(_DATASETS_DIR, "raw")
PROCESSED_DIR = os.path.join(_DATASETS_DIR, "processed")

# Create both data directories at import time so later writes cannot fail on
# a missing folder; already-existing directories are left alone.
for _directory in (RAW_DIR, PROCESSED_DIR):
    os.makedirs(_directory, exist_ok=True)
def download_datasets():
    """Download the MedQuAD and symptom datasets through kagglehub.

    Each logical dataset has an ordered list of candidate Kaggle dataset ids.
    Candidates are tried in order and the first one that downloads without
    raising is kept; a failure is printed and the next candidate is tried.

    Returns:
        A ``(medquad_path, symptoms_path)`` tuple. Each element is the value
        returned by ``kagglehub.dataset_download`` for the first candidate
        that succeeded, or ``None`` when every candidate failed.
    """
    candidates_by_name = {
        "medquad": [
            "vishweshsalodkar/medquad-medical-q-a-dataset",
            "itachi9604/medquad-dataset",
        ],
        "symptoms": [
            "kaushil268/disease-prediction-using-machine-learning",
            "itachi9604/disease-symptom-description-dataset",
        ],
    }
    # Every slot starts unresolved and stays None if all its mirrors fail.
    resolved = dict.fromkeys(("medquad", "symptoms"))

    for name, candidate_ids in candidates_by_name.items():
        for candidate in candidate_ids:
            try:
                print(f"Attempting to download {name} from {candidate}...")
                local_path = kagglehub.dataset_download(candidate)
                resolved[name] = local_path
                print(f"Successfully downloaded {name} to {local_path}")
                # First working mirror wins; skip the remaining candidates.
                break
            except Exception as err:
                # Best-effort: report the failure and move on to the next mirror.
                print(f"Failed to download {candidate}: {err}")

    medquad_path = resolved["medquad"]
    symptoms_path = resolved["symptoms"]
    return medquad_path, symptoms_path
def generate_professional_fallback(output_dir=None):
    """Write small built-in indexes for any index file that does not exist yet.

    Two files are considered: ``symcat_index.json`` (symptom -> list of
    conditions) and ``medquad_index.json`` (list of question/answer records).
    A file that already exists is never overwritten, so data produced from a
    real dataset takes precedence over the fallback.

    Args:
        output_dir: Directory that receives the index files. Defaults to the
            module-level ``PROCESSED_DIR``. The directory is created if needed.

    Returns:
        None.
    """
    print("Generating professional fallback data...")

    # Symptom index fallback.
    fallback_symptoms = {
        "chest pain": ["Myocardial Infarction", "Angina", "Pericarditis"],
        "fever": ["Infection", "Influenza", "COVID-19"],
        "headache": ["Migraine", "Tension Headache", "Cluster Headache"],
        "shortness of breath": ["Dyspnea", "Asthma", "Pulmonary Embolism"],
        "sore throat": ["Pharyngitis", "Tonsillitis"],
        "fatigue": ["Anemia", "Chronic Fatigue Syndrome", "Diabetes"],
        "nausea": ["Gastroenteritis", "Food Poisoning"],
        "abdominal pain": ["Appendicitis", "Gastritis", "IBS"],
    }

    # MedQuAD fallback.
    fallback_medquad = [
        {
            "question": "What is Diabetes?",
            "answer": "Diabetes is a chronic disease that occurs when the pancreas is no longer able to make insulin, or when the body cannot make good use of the insulin it produces.",
            "source": "Fallback",
        },
        {
            "question": "Symptoms of Influenza?",
            "answer": "Common symptoms include fever, cough, sore throat, muscle aches, headache, fatigue, and runny or stuffy nose.",
            "source": "Fallback",
        },
        {
            "question": "How to treat a fever?",
            "answer": "Rest, drink plenty of fluids, and use over-the-counter medications like acetaminophen or ibuprofen if needed. Consult a doctor for high or persistent fever.",
            "source": "Fallback",
        },
    ]

    # Resolve the module default lazily so an explicit output_dir never
    # depends on the module-level constant.
    target_dir = PROCESSED_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)

    fallbacks = (
        ("symcat_index.json", fallback_symptoms),
        ("medquad_index.json", fallback_medquad),
    )
    for filename, payload in fallbacks:
        index_path = os.path.join(target_dir, filename)
        if os.path.exists(index_path):
            # Keep whatever is already there; the fallback only fills gaps.
            continue
        with open(index_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2)
def process_symptoms(symptom_path: str, output_dir=None):
    """Build a symptom -> diseases index from a symptom CSV and save it as JSON.

    The CSV is read with the disease name in its first column and symptom
    names in every remaining column. Symptom names have underscores replaced
    by spaces and runs of whitespace collapsed, so ``" skin_rash"`` and
    ``"skin rash"`` land on the same key. Rows without a disease and blank
    symptom cells are ignored.

    NOTE(review): a dataset that encodes symptoms as 0/1 indicator columns
    would not match this layout; confirm which dataset format is expected.

    Args:
        symptom_path: Directory containing the dataset CSV files. A falsy
            value (for example ``None`` after a failed download) makes the
            call a no-op.
        output_dir: Directory that receives ``symcat_index.json``. Defaults
            to the module-level ``PROCESSED_DIR``. Created if needed.

    Returns:
        None. When no CSV is found, the CSV cannot be parsed, or no
        symptom/disease pair is extracted, nothing is written so an existing
        index (or a later fallback) is not replaced by an empty file.
    """
    if not symptom_path:
        return
    print("Processing symptom-disease mapping...")

    # Prefer the well-known file names, in this order.
    preferred_names = ['dataset.csv', 'dataset_symptoms.csv', 'symptoms_df.csv']
    found_file = next(
        (
            os.path.join(symptom_path, name)
            for name in preferred_names
            if os.path.exists(os.path.join(symptom_path, name))
        ),
        None,
    )
    if found_file is None:
        # os.listdir() order is arbitrary; sort so the same file is picked
        # on every run and every platform.
        csv_files = sorted(f for f in os.listdir(symptom_path) if f.endswith('.csv'))
        if csv_files:
            found_file = os.path.join(symptom_path, csv_files[0])
    if found_file is None:
        print("No suitable CSV found in symptom dataset")
        return

    try:
        df = pd.read_csv(found_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        # Stay best-effort like the download step: report and let the rest of
        # the pipeline continue.
        print(f"Could not read symptom CSV {found_file}: {e}")
        return

    mapping = {}
    # Generic logic for symptom-disease CSVs (disease in 1st col, symptoms in others).
    for disease_cell, *symptom_cells in df.itertuples(index=False, name=None):
        if pd.isna(disease_cell):
            continue
        disease = str(disease_cell).strip()
        if not disease:
            continue
        for cell in symptom_cells:
            if pd.isna(cell):
                continue
            # Underscores become spaces; split/join trims and collapses
            # whitespace so "dischromic _patches" -> "dischromic patches".
            symptom = " ".join(str(cell).replace('_', ' ').split())
            if not symptom or symptom.lower() == 'nan':
                continue
            diseases = mapping.setdefault(symptom, [])
            if disease not in diseases:
                diseases.append(disease)

    if not mapping:
        print("No symptom-disease pairs found; symptom index not written.")
        return

    # Resolve the module default lazily so an explicit output_dir never
    # depends on the module-level constant.
    target_dir = PROCESSED_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)
    with open(os.path.join(target_dir, "symcat_index.json"), "w", encoding="utf-8") as f:
        json.dump(mapping, f, indent=2)
    print(f"Symptom index saved with {len(mapping)} symptoms.")
def _find_column(columns, keyword):
    """Return the column named *keyword* (case-insensitive), else the first
    column whose name contains it, else ``None``."""
    for column in columns:
        if str(column).strip().lower() == keyword:
            return column
    return next((c for c in columns if keyword in str(c).lower()), None)


def process_medquad(medquad_path: str, output_dir=None):
    """Extract question/answer pairs from a MedQuAD CSV and save them as JSON.

    The first CSV (in sorted name order) inside ``medquad_path`` is read. The
    question and answer columns are located by name: an exact
    case-insensitive match for ``question`` / ``answer`` is preferred, so a
    column such as ``question_id`` is not mistaken for the question text; a
    substring match is used only as a fallback. Rows where either value is
    missing or blank are skipped and the text is stripped of surrounding
    whitespace. Every record is tagged with ``"source": "MedQuAD"``.

    Args:
        medquad_path: Directory containing the dataset CSV files. A falsy
            value (for example ``None`` after a failed download) makes the
            call a no-op.
        output_dir: Directory that receives ``medquad_index.json``. Defaults
            to the module-level ``PROCESSED_DIR``. Created if needed.

    Returns:
        None. When no CSV is found, the CSV cannot be parsed, or no Q&A pair
        is extracted, nothing is written so an existing index (or a later
        fallback) is not replaced by an empty file.
    """
    if not medquad_path:
        return
    print("Processing MedQuAD Q&A...")

    # os.listdir() order is arbitrary; sort so the same file is picked on
    # every run and every platform.
    csv_files = sorted(f for f in os.listdir(medquad_path) if f.endswith('.csv'))
    if not csv_files:
        return

    csv_path = os.path.join(medquad_path, csv_files[0])
    try:
        df = pd.read_csv(csv_path)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        # Stay best-effort like the download step: report and let the rest of
        # the pipeline continue.
        print(f"Could not read MedQuAD CSV {csv_path}: {e}")
        return

    # Identify Q and A columns.
    q_col = _find_column(df.columns, 'question')
    a_col = _find_column(df.columns, 'answer')

    qa_data = []
    if q_col is not None and a_col is not None:
        pairs = df[[q_col, a_col]].itertuples(index=False, name=None)
        for question, answer in pairs:
            if pd.isna(question) or pd.isna(answer):
                continue
            question_text = str(question).strip()
            answer_text = str(answer).strip()
            if not question_text or not answer_text:
                continue
            qa_data.append({
                "question": question_text,
                "answer": answer_text,
                "source": "MedQuAD",
            })

    if not qa_data:
        print("No Q&A pairs found; MedQuAD index not written.")
        return

    # Resolve the module default lazily so an explicit output_dir never
    # depends on the module-level constant.
    target_dir = PROCESSED_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)
    with open(os.path.join(target_dir, "medquad_index.json"), "w", encoding="utf-8") as f:
        json.dump(qa_data, f, indent=2)
    print(f"MedQuAD index saved with {len(qa_data)} Q&A pairs.")
def create_normalization_map(output_dir=None):
    """Write the symptom synonym table to ``normalization_map.json``.

    Each key is a canonical symptom name and each value is a list of
    alternative phrasings that should be treated as that symptom. The lists
    mix English variants with transliterated non-English terms (presumably
    Urdu/Hindi; confirm with the data owner). The file is overwritten on
    every call.

    Args:
        output_dir: Directory that receives ``normalization_map.json``.
            Defaults to the module-level ``PROCESSED_DIR``. Created if needed.

    Returns:
        None.
    """
    print("Creating normalization map...")
    normalization = {
        "headache": ["migraine", "head pain", "throbbing head", "sar dard"],
        "fever": ["high temperature", "feverish", "bukhar", "tap"],
        "cough": ["coughing", "dry cough", "khansi"],
        "chest pain": ["pain in chest", "chest tightness", "seenay mein dard"],
        "shortness of breath": ["difficulty breathing", "breathless", "saans lene mein dushwari", "saans phoolna"],
        "sore throat": ["throat pain", "gala kharab", "gulay mein dard"],
        "fatigue": ["tiredness", "exhaustion", "weakness", "thakawat", "kamzori"],
        "nausea": ["feeling sick", "vomit sensation", "matli"],
        "diarrhea": ["loose motions", "paichish"],
        "abdominal pain": ["stomach ache", "belly pain", "pait mein dard"],
    }

    # Resolve the module default lazily so an explicit output_dir never
    # depends on the module-level constant.
    target_dir = PROCESSED_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)
    with open(os.path.join(target_dir, "normalization_map.json"), "w", encoding="utf-8") as f:
        json.dump(normalization, f, indent=2)
    print("Normalization map saved.")
def main():
    """Run the preprocessing pipeline end to end.

    Downloads the datasets, builds the symptom and MedQuAD indexes from
    whatever was downloaded, fills in fallback data for any index file that
    is still missing, and finally writes the normalization map.
    """
    medquad_dir, symptoms_dir = download_datasets()
    process_symptoms(symptoms_dir)
    process_medquad(medquad_dir)
    # Runs after the processors so it only fills in index files they did not
    # produce: ensure something is there.
    generate_professional_fallback()
    create_normalization_map()
    print("Preprocessing complete!")


if __name__ == "__main__":
    main()