---
# Deployment: the ml-intern API server (2 replicas, zero-downtime rolling updates).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-intern-api
  namespace: ml-intern
  labels:
    app: ml-intern-api
    version: v1
spec:
  replicas: 2
  strategy:
    type: RollingUpdate
    rollingUpdate:
      # Surge one new pod before taking any old pod down — no capacity dip.
      maxSurge: 1
      maxUnavailable: 0
  selector:
    matchLabels:
      app: ml-intern-api
  template:
    metadata:
      labels:
        app: ml-intern-api
        version: v1
      annotations:
        # Prometheus scrape hints; annotation values must be strings, hence quotes.
        prometheus.io/scrape: "true"
        prometheus.io/port: "8000"
        prometheus.io/path: "/metrics"
    spec:
      # Pod-level grace period must exceed the preStop sleep (15s) plus drain time.
      # NOTE(review): this key sat next to the container's lifecycle block in the
      # mangled original; pod spec is its only valid location — confirm intent.
      terminationGracePeriodSeconds: 60
      affinity:
        podAntiAffinity:
          # Prefer (not require) spreading replicas across nodes.
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - ml-intern-api
                topologyKey: kubernetes.io/hostname
      containers:
        - name: api
          # NOTE(review): ":latest" + Always makes rollouts non-reproducible;
          # pin an immutable tag or digest when a registry pipeline exists.
          image: ml-intern:latest
          imagePullPolicy: Always
          ports:
            - containerPort: 8000
              name: http
          envFrom:
            - configMapRef:
                name: ml-intern-config
          env:
            - name: DATABASE_URL
              valueFrom:
                secretKeyRef:
                  name: ml-intern-secrets
                  key: DATABASE_URL
            - name: REDIS_URL
              valueFrom:
                secretKeyRef:
                  name: ml-intern-secrets
                  key: REDIS_URL
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: ml-intern-secrets
                  key: HF_TOKEN
            - name: ANTHROPIC_API_KEY
              valueFrom:
                secretKeyRef:
                  name: ml-intern-secrets
                  key: ANTHROPIC_API_KEY
            - name: OPENAI_API_KEY
              valueFrom:
                secretKeyRef:
                  name: ml-intern-secrets
                  key: OPENAI_API_KEY
            - name: GROQ_API_KEY
              valueFrom:
                secretKeyRef:
                  name: ml-intern-secrets
                  key: GROQ_API_KEY
            - name: NVIDIA_API_KEY
              valueFrom:
                secretKeyRef:
                  name: ml-intern-secrets
                  key: NVIDIA_API_KEY
          resources:
            requests:
              memory: "512Mi"
              cpu: "250m"
            limits:
              memory: "2Gi"
              cpu: "2000m"
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 3
          lifecycle:
            preStop:
              # Sleep before SIGTERM so in-flight requests drain while the
              # endpoint is removed from the Service.
              exec:
                command: ["/bin/sh", "-c", "sleep 15"]
---
# Service: stable cluster-internal address for the API pods.
apiVersion: v1
kind: Service
metadata:
  name: ml-intern-api
  namespace: ml-intern
  labels:
    app: ml-intern-api
spec:
  type: ClusterIP
  ports:
    - name: http
      port: 8000
      targetPort: 8000
      protocol: TCP
  selector:
    app: ml-intern-api
---
# HPA: scale the Deployment 2-20 replicas on CPU (70%) and memory (80%).
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: ml-intern-api-hpa
  namespace: ml-intern
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: ml-intern-api
  minReplicas: 2
  maxReplicas: 20
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
  behavior:
    scaleUp:
      # Fast up: may double the replica count each minute.
      stabilizationWindowSeconds: 60
      policies:
        - type: Percent
          value: 100
          periodSeconds: 60
    scaleDown:
      # Slow down: wait 5 min, then shed at most 50% per 2-minute window.
      stabilizationWindowSeconds: 300
      policies:
        - type: Percent
          value: 50
          periodSeconds: 120