"""
Test script to check Hugging Face connectivity and provide solutions
"""
|
|
| import requests |
| import os |
| from pathlib import Path |
|
|
def test_huggingface_connectivity():
    """Check whether https://huggingface.co answers an HTTP GET.

    Sends a single GET request with a 10-second timeout and prints a
    one-line status message describing the outcome.

    Returns:
        bool: True only when the response status code is 200. False for any
        other status code, a timeout, a connection error, or any other
        exception raised while making the request.
    """
    print("🔍 Testing Hugging Face connectivity...")

    # Only the network call lives inside the try so that the handlers below
    # can only be triggered by the request itself.
    try:
        response = requests.get("https://huggingface.co", timeout=10)
    except requests.exceptions.Timeout:
        print("❌ Connection to Hugging Face timed out")
        return False
    except requests.exceptions.ConnectionError:
        print("❌ Cannot connect to Hugging Face")
        return False
    except Exception as e:
        # Deliberately broad: this is a diagnostic, so any failure is
        # reported to the user instead of aborting the script.
        print(f"❌ Error connecting to Hugging Face: {e}")
        return False

    if response.status_code == 200:
        print("✅ Hugging Face is accessible")
        return True

    print(f"⚠️ Hugging Face returned status code: {response.status_code}")
    return False
|
|
def check_cached_models():
    """Look for T5 model directories in the local Hugging Face caches.

    The directories inspected are ``~/.cache/huggingface/transformers``,
    ``~/.cache/huggingface/hub`` and, when the ``HF_HOME`` environment
    variable is set to a non-empty value, ``$HF_HOME/hub``. Every immediate
    sub-directory whose name contains "t5" (case-insensitive) is reported
    as a cached model.

    Returns:
        list[str]: Paths of the matching directories, or an empty list when
        nothing was found.
    """
    print("\n📁 Checking for cached models...")

    default_root = Path.home() / ".cache" / "huggingface"
    cache_locations = [default_root / "transformers", default_root / "hub"]
    hf_home = os.environ.get("HF_HOME")
    if hf_home:
        cache_locations.append(Path(hf_home) / "hub")

    found_models = []
    scanned = set()
    for cache_dir in cache_locations:
        # is_dir() (rather than exists()) also skips a plain file that
        # happens to sit at the cache path; iterdir() would raise on it.
        if not cache_dir.is_dir():
            continue

        # HF_HOME may point at the default location, in which case the same
        # hub directory is listed twice; scan each real directory only once
        # so that models are not reported in duplicate.
        real_dir = cache_dir.resolve()
        if real_dir in scanned:
            continue
        scanned.add(real_dir)

        try:
            entries = sorted(cache_dir.iterdir())
        except OSError as exc:
            # An unreadable cache should not abort the whole diagnostic.
            print(f"⚠️ Could not read {cache_dir}: {exc}")
            continue

        for item in entries:
            if item.is_dir() and "t5" in item.name.lower():
                found_models.append(str(item))
                print(f"✅ Found cached model: {item}")

    if not found_models:
        print("❌ No T5 models found in cache")

    return found_models
|
|
def suggest_solutions():
    """Print a numbered list of workarounds for connectivity problems.

    The suggestions cover pre-downloading the model, raising the Hugging
    Face Hub timeouts, enabling offline mode, switching to a mirror
    endpoint, and running a test that needs no model download.

    Returns:
        None. All output is written to stdout.
    """
    lines = [
        "\n💡 Solutions for connectivity issues:",
        "=" * 50,
        "\n1. 📥 **Pre-download the model with better connectivity:**",
        " Run this when you have stable internet:",
        " ```python",
        " from transformers import AutoTokenizer, AutoModelForSeq2SeqLM",
        " tokenizer = AutoTokenizer.from_pretrained('t5-base')",
        " model = AutoModelForSeq2SeqLM.from_pretrained('t5-base')",
        " ```",
        "\n2. 🔄 **Retry with longer timeout:**",
        " Set environment variables:",
        " ```bash",
        # These are the timeout variables huggingface_hub actually reads;
        # HF_HUB_TIMEOUT and REQUESTS_TIMEOUT are not recognised by
        # huggingface_hub or requests, so setting them has no effect.
        " export HF_HUB_DOWNLOAD_TIMEOUT=300",
        " export HF_HUB_ETAG_TIMEOUT=300",
        " ```",
        "\n3. 📁 **Use offline mode (if model is cached):**",
        " ```bash",
        " export TRANSFORMERS_OFFLINE=1",
        " ```",
        "\n4. 🌐 **Alternative: Use different mirror:**",
        " ```bash",
        " export HF_ENDPOINT=https://hf-mirror.com",
        " ```",
        "\n5. 📦 **Local testing without model download:**",
        " Use a smaller test that doesn't require model downloads",
    ]
    print("\n".join(lines))
|
|
| def create_simple_test(): |
| """Create a simple test that doesn't require model downloads""" |
| print("\nπ§ͺ Creating simplified test...") |
| |
| test_script = '''#!/usr/bin/env python3 |
| """ |
| Simple test that only tests data loading and GPU monitoring without model downloads |
| """ |
| |
| import sys |
| import os |
| sys.path.append('src') |
| |
| def test_data_only(): |
| """Test only data loading functionality""" |
| try: |
| import pandas as pd |
| from tevatron.utils.gpu_monitor import GPUMemoryMonitor |
| |
| print("β
Testing data loading...") |
| df = pd.read_csv("data/the_vault/DOC_VAULT_train.tsv", sep='\\t', nrows=5) |
| print(f"β
Loaded {len(df)} samples") |
| |
| print("β
Testing GPU monitor...") |
| monitor = GPUMemoryMonitor(memory_threshold=0.8, check_interval=10) |
| stats = monitor.get_memory_stats() |
| print(f"β
GPU monitor initialized: {stats}") |
| |
| print("π Basic functionality test PASSED!") |
| return True |
| |
| except Exception as e: |
| print(f"β Test failed: {e}") |
| return False |
| |
| if __name__ == "__main__": |
| success = test_data_only() |
| sys.exit(0 if success else 1) |
| ''' |
| |
| with open("scripts/test_basic.py", "w") as f: |
| f.write(test_script) |
| |
| print("β
Created scripts/test_basic.py") |
| print(" Run with: python scripts/test_basic.py") |
|
|
def main():
    """Run every diagnostic step, then print a summary and a recommendation.

    Steps, in order: test connectivity to Hugging Face, look for cached T5
    models, write the simplified test script, and print the list of
    suggested solutions. A failure to write the test script is reported in
    the summary instead of aborting the remaining steps.

    Returns:
        None.
    """
    print("🔍 GLEN Connectivity Diagnostic")
    print("=" * 40)

    connectivity_ok = test_huggingface_connectivity()
    cached_models = check_cached_models()

    # Writing the helper script is optional; an unwritable working directory
    # must not prevent the solutions and the summary from being shown.
    try:
        create_simple_test()
    except OSError as exc:
        print(f"❌ Could not create scripts/test_basic.py: {exc}")
        simple_test_created = False
    else:
        simple_test_created = True

    suggest_solutions()

    print("\n" + "=" * 50)
    print("📋 Summary:")
    print(f" - Hugging Face connectivity: {'✅ OK' if connectivity_ok else '❌ FAILED'}")
    print(f" - Cached models found: {'✅ YES' if cached_models else '❌ NO'}")
    print(f" - Simple test created: {'✅ YES' if simple_test_created else '❌ NO'}")

    if not connectivity_ok and not cached_models:
        print("\n⚠️ **Action needed:** Either fix connectivity or pre-download models")
        if simple_test_created:
            # Only point at the script when it was actually written.
            print(" Try running: python scripts/test_basic.py (for basic functionality)")
    elif cached_models:
        print("\n✅ **Good news:** You have cached models. Try offline mode!")
        print(" Set: export TRANSFORMERS_OFFLINE=1")
    else:
        print("\n✅ **All good:** You should be able to run full training!")
|
|
# Run the diagnostic only when executed as a script, not when imported.
if __name__ == "__main__":
    main()