---
# Docker Compose stack: API + background worker, Redis cache, Postgres,
# nginx edge proxy, and an observability suite (Prometheus, Grafana,
# Jaeger, pgAdmin).
#
# NOTE(review): the top-level `version` key is obsolete and ignored by
# Compose v2; kept only for compatibility with legacy `docker-compose`.
version: "3.8"

services:
  api:
    build:
      context: .
      dockerfile: Dockerfile.prod
    # NOTE(review): `deploy.replicas: 2` plus a fixed host port binding
    # collides outside Swarm mode (both replicas try to bind 8000).
    # Confirm this is deployed via `docker stack deploy`, or drop the host
    # binding and let nginx reach the service over the internal network.
    ports:
      - "8000:8000"
    environment:
      - PORT=8000
      - WORKERS=4
      - REDIS_URL=redis://redis:6379
      # DB password is injected from the host environment; the default
      # preserves the previous hard-coded value. Override in production.
      - DATABASE_URL=postgresql://ml_intern:${POSTGRES_PASSWORD:-ml_intern}@postgres:5432/ml_intern
      - MAX_CONCURRENT_REQUESTS=200
      - DEFAULT_RPM_LIMIT=40
      - REQUEST_TIMEOUT=120
      - CACHE_TTL_SECONDS=300
      - BUDGET_USD_PER_SESSION=10.0
      - CIRCUIT_BREAKER_FAILURE_THRESHOLD=5
      - CIRCUIT_BREAKER_RECOVERY_TIMEOUT=60
      # Provider credentials come from the host environment; all but
      # HF_TOKEN default to empty so the stack starts without them.
      - HF_TOKEN=${HF_TOKEN}
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
      - OPENAI_API_KEY=${OPENAI_API_KEY:-}
      - GROQ_API_KEY=${GROQ_API_KEY:-}
      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
      - LOG_LEVEL=INFO
    depends_on:
      redis:
        condition: service_healthy
      postgres:
        condition: service_healthy
    networks:
      - ml_intern_network
    deploy:
      replicas: 2
      resources:
        limits:
          cpus: '4'
          memory: 4G
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    restart: unless-stopped

  worker:
    build:
      context: .
      dockerfile: Dockerfile.prod
    command: ["python", "-m", "worker"]
    environment:
      - REDIS_URL=redis://redis:6379
      # Same parameterized credential as the api service.
      - DATABASE_URL=postgresql://ml_intern:${POSTGRES_PASSWORD:-ml_intern}@postgres:5432/ml_intern
      - LOG_LEVEL=INFO
    depends_on:
      redis:
        condition: service_healthy
      postgres:
        condition: service_healthy
    networks:
      - ml_intern_network
    deploy:
      replicas: 2
      resources:
        limits:
          cpus: '2'
          memory: 2G
    restart: unless-stopped

  redis:
    image: redis:7-alpine
    # NOTE(review): this publishes Redis on the host with no auth —
    # confirm the host firewall blocks 6379, or remove the binding and
    # rely on the internal network only.
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    # AOF persistence; evict least-recently-used keys once memory hits 1gb.
    command: redis-server --appendonly yes --maxmemory 1gb --maxmemory-policy allkeys-lru
    networks:
      - ml_intern_network
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 3s
      retries: 3
    restart: unless-stopped

  postgres:
    image: postgres:16-alpine
    # NOTE(review): published on the host — same exposure concern as redis.
    ports:
      - "5432:5432"
    environment:
      - POSTGRES_USER=ml_intern
      # Parameterized; the default keeps prior behavior. Override in prod.
      - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-ml_intern}
      - POSTGRES_DB=ml_intern
    volumes:
      - postgres_data:/var/lib/postgresql/data
      # Schema bootstrap runs only on first initialization of the volume.
      - ./init.sql:/docker-entrypoint-initdb.d/init.sql:ro
    networks:
      - ml_intern_network
    healthcheck:
      # `-d ml_intern` probes the application database, not the default one.
      test: ["CMD-SHELL", "pg_isready -U ml_intern -d ml_intern"]
      interval: 10s
      timeout: 3s
      retries: 5
    restart: unless-stopped

  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
      - ./ssl:/etc/nginx/ssl:ro
    depends_on:
      - api
    networks:
      - ml_intern_network
    restart: unless-stopped

  # NOTE(review): the observability images below use `:latest` — pin to
  # specific versions for reproducible deployments.
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
    networks:
      - ml_intern_network
    restart: unless-stopped

  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:-admin}
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards:ro
      - ./grafana/datasources:/etc/grafana/provisioning/datasources:ro
    depends_on:
      - prometheus
    networks:
      - ml_intern_network
    restart: unless-stopped

  jaeger:
    image: jaegertracing/all-in-one:latest
    ports:
      - "16686:16686"
      - "14268:14268"
    environment:
      - COLLECTOR_OTLP_ENABLED=true
    networks:
      - ml_intern_network
    restart: unless-stopped

  pgadmin:
    image: dpage/pgadmin4:latest
    ports:
      - "5050:80"
    environment:
      - PGADMIN_DEFAULT_EMAIL=admin@mlintern.local
      - PGADMIN_DEFAULT_PASSWORD=${PGADMIN_PASSWORD:-admin}
    depends_on:
      - postgres
    networks:
      - ml_intern_network
    restart: unless-stopped

volumes:
  redis_data:
  postgres_data:
  prometheus_data:
  grafana_data:

networks:
  ml_intern_network:
    driver: bridge