# customeragent-api / server / scripts / consolidate_medquad.py
# Hugging Face deploy metadata: anasraza526 — "Clean deploy to Hugging Face" (ac90985)
import os
import xml.etree.ElementTree as ET
import json
import logging

# Setup logging: timestamped INFO-level messages to stderr.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# BASE_DIR is the parent of this script's directory (i.e. the server root,
# since this file lives in server/scripts/).
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Raw MedQuAD XML files live here; the consolidated JSON is written alongside them.
MEDQUAD_DIR = os.path.join(BASE_DIR, "datasets", "raw", "medquad")
OUTPUT_FILE = os.path.join(MEDQUAD_DIR, "medquad_all.json")
def consolidate_xmls():
    """Parse every MedQuAD XML file in MEDQUAD_DIR into one JSON file.

    Each <QAPair> with a non-empty question and answer becomes a record
    {"question", "answer", "focus", "source"} appended to OUTPUT_FILE.

    Returns:
        True if at least one record was written, False otherwise
        (including when MEDQUAD_DIR does not exist).
    """
    if not os.path.exists(MEDQUAD_DIR):
        logger.error(f"Directory not found: {MEDQUAD_DIR}")
        # Return False (not None) so every exit path yields an explicit bool.
        return False

    all_records = []
    xml_files = [f for f in os.listdir(MEDQUAD_DIR) if f.endswith('.xml')]
    logger.info(f"🚀 Found {len(xml_files)} XML files. Starting consolidation...")

    for i, filename in enumerate(xml_files):
        file_path = os.path.join(MEDQUAD_DIR, filename)
        try:
            tree = ET.parse(file_path)
            root = tree.getroot()
            # <Focus> holds the document's medical topic; may be absent.
            focus = root.findtext('Focus') or ""
            qa_pairs_elem = root.find('QAPairs')
            if qa_pairs_elem is not None:
                for qa_pair in qa_pairs_elem.findall('QAPair'):
                    question = qa_pair.findtext('Question')
                    answer = qa_pair.findtext('Answer')
                    # Skip pairs with a missing or empty question/answer.
                    if question and answer:
                        all_records.append({
                            "question": question.strip(),
                            "answer": answer.strip(),
                            "focus": focus.strip(),
                            "source": "MedQuAD"
                        })
        except Exception as e:
            # Name the failing file (was a garbled "(unknown)" placeholder).
            logger.error(f"Failed to parse {filename}: {e}")
        # Progress heartbeat every 500 files.
        if (i + 1) % 500 == 0:
            logger.info(f" Processed {i + 1} files...")

    if all_records:
        logger.info(f"📝 Saving {len(all_records)} records to {OUTPUT_FILE}...")
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII medical text human-readable.
            json.dump(all_records, f, indent=2, ensure_ascii=False)
        logger.info("✅ Consolidation complete!")
        return True
    else:
        logger.warning("No records found to consolidate.")
        return False
def cleanup_xmls():
    """Delete every .xml file in MEDQUAD_DIR.

    Intended to run only after the consolidated JSON has been verified.
    Failures to delete individual files are logged and skipped.
    """
    logger.info("🗑️ Cleaning up XML files...")
    xml_files = [f for f in os.listdir(MEDQUAD_DIR) if f.endswith('.xml')]
    deleted = 0
    for filename in xml_files:
        try:
            os.remove(os.path.join(MEDQUAD_DIR, filename))
            deleted += 1
        except Exception as e:
            # Name the failing file (was a garbled "(unknown)" placeholder).
            logger.error(f"Failed to delete {filename}: {e}")
    # Report the number actually removed, not just the number found.
    logger.info(f"✅ Deleted {deleted} XML files.")
if __name__ == "__main__":
    # Consolidate first; only remove the raw XMLs once the output is verified.
    if consolidate_xmls():
        # Verify JSON exists and has content before deleting
        # (size threshold guards against an empty/truncated write).
        if os.path.exists(OUTPUT_FILE) and os.path.getsize(OUTPUT_FILE) > 1000:
            cleanup_xmls()
        else:
            logger.error("❌ Verification failed. JSON file is missing or too small. Aborting cleanup.")