Spaces:
Runtime error
Runtime error
| import os | |
| import xml.etree.ElementTree as ET | |
| import json | |
| import logging | |
# Logging setup: INFO and above, each record prefixed with a timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Dataset locations, resolved from the directory one level above this
# script's own folder.
BASE_DIR = os.path.dirname(os.path.split(os.path.abspath(__file__))[0])
MEDQUAD_DIR = os.path.join(BASE_DIR, *("datasets", "raw", "medquad"))
OUTPUT_FILE = os.path.join(MEDQUAD_DIR, "medquad_all.json")
def _extract_qa_records(root):
    """Return the question/answer records found in one parsed MedQuAD document.

    Args:
        root: Root element of a parsed XML document. Records are read from
            ``QAPairs/QAPair`` children; the document-level ``Focus`` text is
            copied onto every record.

    Returns:
        A list of dicts with the keys ``question``, ``answer``, ``focus`` and
        ``source``. Pairs whose question or answer is missing or blank are
        skipped.
    """
    qa_pairs_elem = root.find('QAPairs')
    if qa_pairs_elem is None:
        return []

    focus = (root.findtext('Focus') or "").strip()
    records = []
    for qa_pair in qa_pairs_elem.findall('QAPair'):
        # Strip *before* the emptiness test so whitespace-only fields do not
        # produce records with empty strings.
        question = (qa_pair.findtext('Question') or "").strip()
        answer = (qa_pair.findtext('Answer') or "").strip()
        if question and answer:
            records.append({
                "question": question,
                "answer": answer,
                "focus": focus,
                "source": "MedQuAD",
            })
    return records


def consolidate_xmls(source_dir=None, output_file=None):
    """Merge all ``.xml`` files of a directory into a single JSON file.

    Files that cannot be parsed are logged and skipped; the remaining files
    are still consolidated.

    Args:
        source_dir: Directory holding the XML files. Defaults to
            ``MEDQUAD_DIR``.
        output_file: Path of the JSON file to write. Defaults to
            ``OUTPUT_FILE`` when ``source_dir`` is not given, otherwise to
            ``medquad_all.json`` inside ``source_dir``.

    Returns:
        ``True`` if at least one record was written, ``False`` if the
        directory is missing or no records were found.

    Raises:
        OSError: If the output file cannot be written.
    """
    if source_dir is None:
        source_dir = MEDQUAD_DIR
        default_output = OUTPUT_FILE
    else:
        default_output = os.path.join(source_dir, "medquad_all.json")
    if output_file is None:
        output_file = default_output

    if not os.path.isdir(source_dir):
        logger.error(f"Directory not found: {source_dir}")
        # Always return a bool so callers get a consistent result type.
        return False

    # os.listdir() order is arbitrary; sort so the output is reproducible.
    xml_files = sorted(f for f in os.listdir(source_dir) if f.endswith('.xml'))
    logger.info(f"π Found {len(xml_files)} XML files. Starting consolidation...")

    all_records = []
    for count, filename in enumerate(xml_files, start=1):
        try:
            root = ET.parse(os.path.join(source_dir, filename)).getroot()
        except Exception as e:
            # Best effort per file: one unreadable document must not abort
            # the whole run.
            logger.error(f"Failed to parse {filename}: {e}")
        else:
            all_records.extend(_extract_qa_records(root))

        if count % 500 == 0:
            logger.info(f" Processed {count} files...")

    if not all_records:
        logger.warning("No records found to consolidate.")
        return False

    logger.info(f"π Saving {len(all_records)} records to {output_file}...")
    # Write to a sibling temp file and rename it into place, so a failure
    # midway never leaves a truncated JSON at the final path.
    tmp_path = f"{output_file}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(all_records, f, indent=2)
        os.replace(tmp_path, output_file)
    finally:
        # Only still present if the write or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    logger.info("β Consolidation complete!")
    return True
def cleanup_xmls(source_dir=None):
    """Delete every ``.xml`` file directly inside a directory.

    Deletion is best effort: a file that cannot be removed is logged and
    skipped, and the final log line reports only the files actually deleted.

    Args:
        source_dir: Directory to clean. Defaults to ``MEDQUAD_DIR``.
    """
    if source_dir is None:
        source_dir = MEDQUAD_DIR

    logger.info("ποΈ Cleaning up XML files...")
    if not os.path.isdir(source_dir):
        # Nothing to clean; avoid crashing on os.listdir().
        logger.error(f"Directory not found: {source_dir}")
        return

    xml_files = [f for f in os.listdir(source_dir) if f.endswith('.xml')]
    deleted = 0
    for filename in xml_files:
        try:
            os.remove(os.path.join(source_dir, filename))
        except OSError as e:
            logger.error(f"Failed to delete {filename}: {e}")
        else:
            deleted += 1
    # Report successful removals only, not the number of candidates.
    logger.info(f"β Deleted {deleted} XML files.")
if __name__ == "__main__":
    # The source XML files are removed only after the consolidated JSON is
    # confirmed to exist on disk and to be larger than 1000 bytes.
    consolidated = consolidate_xmls()
    if consolidated:
        json_ready = os.path.exists(OUTPUT_FILE) and os.path.getsize(OUTPUT_FILE) > 1000
        if not json_ready:
            logger.error("β Verification failed. JSON file is missing or too small. Aborting cleanup.")
        else:
            cleanup_xmls()