# customeragent-api / server /scripts /setup_datasets.sh
# anasraza526's picture
# Clean deploy to Hugging Face
# ac90985
#!/bin/bash
# Dataset Setup Script
#
# Creates ./datasets (relative to the current working directory), downloads
# the CLINC150 intent dataset into it, and prints manual setup instructions
# for the datasets that cannot be fetched automatically.
#
# Usage:       run from the directory that should contain datasets/
# Exit status: 0 on success; 1 if the datasets directory cannot be prepared
#              or the CLINC150 download fails.

set -u

readonly CLINC150_URL="https://raw.githubusercontent.com/clinc/oos-eval/master/data/data_full.json"
readonly CLINC150_FILE="clinc150/data.json"

# Set to 1 when the automated download fails; used for the final summary
# and as the script's exit status.
download_failed=0

echo "🚀 Setting up production datasets..."

# Create datasets directory. Abort if it cannot be created or entered:
# every path below is relative to it, so continuing would scatter files
# into the caller's working directory.
if ! mkdir -p datasets; then
  echo "ERROR: could not create the datasets directory" >&2
  exit 1
fi
if ! cd datasets; then
  echo "ERROR: could not enter the datasets directory" >&2
  exit 1
fi

echo ""
echo "📦 1. Creating CourseQ Dataset (Custom)"
mkdir -p courseq
# CourseQ is already created by our system - no download needed
echo "✅ CourseQ ready"

echo ""
echo "📦 2. Downloading CLINC150 Intent Dataset"
mkdir -p clinc150
# --fail makes curl return non-zero on HTTP errors instead of saving the
# error page as data. The download goes to a temporary ".part" file first so
# that a failed or interrupted transfer never replaces or truncates an
# existing data.json.
if curl --fail --location --output "${CLINC150_FILE}.part" "$CLINC150_URL" \
  && mv -f -- "${CLINC150_FILE}.part" "$CLINC150_FILE"; then
  echo "✅ CLINC150 downloaded"
else
  rm -f -- "${CLINC150_FILE}.part"
  echo "ERROR: CLINC150 download failed; datasets/${CLINC150_FILE} was not updated" >&2
  download_failed=1
fi

echo ""
echo "📦 3. Setting up MedQuAD (Manual Download Required)"
echo "⚠️ MedQuAD requires manual setup:"
echo " 1. Visit: https://github.com/abachaa/MedQuAD"
echo " 2. Download the repository"
echo " 3. Extract to: datasets/medquad/"
echo ""

echo ""
echo "📦 4. Setting up SymCAT (Manual Download Required)"
echo "⚠️ SymCAT requires manual setup:"
echo " 1. Visit: https://github.com/symcat/symcat-corpus"
echo " 2. Download symptoms.json"
echo " 3. Place in: datasets/symcat/symptoms.json"
echo ""

echo ""
echo "📦 5. Setting up Roman Urdu Corpus (Manual Download Required)"
echo "⚠️ Roman Urdu corpus requires manual setup:"
echo " 1. Search for Roman Urdu parallel corpus datasets"
echo " 2. Place CSV in: datasets/roman_urdu_corpus/data.csv"
echo ""

echo ""
# Report the real outcome instead of claiming success unconditionally.
if [ "$download_failed" -ne 0 ]; then
  echo "⚠️ Automated setup finished with errors (see messages above)" >&2
else
  echo "✅ Automated setup complete!"
fi
echo ""
echo "📝 Next Steps:"
echo " 1. Follow manual setup instructions above for remaining datasets"
echo " 2. Restart your server"
echo " 3. Run: python test_enhanced_modules.py"
echo ""
echo "💡 Note: System works fine with sample data for development!"

exit "$download_failed"