# production/k8s/deployment-api.yml
# (from repo ml-intern-local-fork, uploaded by raazkumar, commit a82b7c7)
---
# Deployment for the ml-intern API server: 2+ replicas, rolling updates with
# zero unavailable pods, Prometheus scraping enabled, secrets injected via env.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-intern-api
  namespace: ml-intern
  labels:
    app: ml-intern-api
    version: v1
spec:
  # NOTE(review): ml-intern-api-hpa (below) also manages this Deployment's
  # scale; consider dropping spec.replicas so `kubectl apply` doesn't reset
  # the replica count the autoscaler has chosen.
  replicas: 2
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0  # zero-downtime rollout: never go below desired count
  selector:
    matchLabels:
      app: ml-intern-api
  template:
    metadata:
      labels:
        app: ml-intern-api
        version: v1
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8000"
        prometheus.io/path: "/metrics"
    spec:
      # Soft anti-affinity: prefer spreading replicas across nodes so a
      # single node failure doesn't take down every API pod.
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - ml-intern-api
                topologyKey: kubernetes.io/hostname
      containers:
        - name: api
          # NOTE(review): ":latest" with Always pull is non-reproducible for
          # production — rollbacks and node restarts may fetch different
          # images. Pin an immutable tag or digest. Left unchanged here since
          # the CI pipeline's tagging scheme is not visible from this file.
          image: ml-intern:latest
          imagePullPolicy: Always
          ports:
            - containerPort: 8000
              name: http
          # Non-secret settings come from the shared ConfigMap.
          envFrom:
            - configMapRef:
                name: ml-intern-config
          # Credentials are sourced individually from ml-intern-secrets.
          env:
            - name: DATABASE_URL
              valueFrom:
                secretKeyRef:
                  name: ml-intern-secrets
                  key: DATABASE_URL
            - name: REDIS_URL
              valueFrom:
                secretKeyRef:
                  name: ml-intern-secrets
                  key: REDIS_URL
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: ml-intern-secrets
                  key: HF_TOKEN
            - name: ANTHROPIC_API_KEY
              valueFrom:
                secretKeyRef:
                  name: ml-intern-secrets
                  key: ANTHROPIC_API_KEY
            - name: OPENAI_API_KEY
              valueFrom:
                secretKeyRef:
                  name: ml-intern-secrets
                  key: OPENAI_API_KEY
            - name: GROQ_API_KEY
              valueFrom:
                secretKeyRef:
                  name: ml-intern-secrets
                  key: GROQ_API_KEY
            - name: NVIDIA_API_KEY
              valueFrom:
                secretKeyRef:
                  name: ml-intern-secrets
                  key: NVIDIA_API_KEY
          resources:
            requests:
              memory: "512Mi"
              cpu: "250m"
            limits:
              memory: "2Gi"
              cpu: "2000m"
          # Liveness: restart the container if /health stops answering.
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          # Readiness: gate Service traffic on /health responding.
          readinessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 3
          # Delay SIGTERM handling so in-flight requests drain while the
          # endpoint is removed from the Service.
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh", "-c", "sleep 15"]
      # Must exceed the preStop sleep (15s) plus app shutdown time.
      terminationGracePeriodSeconds: 60
---
# Cluster-internal Service fronting the ml-intern-api Deployment's pods.
apiVersion: v1
kind: Service
metadata:
  name: ml-intern-api
  namespace: ml-intern
  labels:
    app: ml-intern-api
spec:
  type: ClusterIP
  ports:
    - port: 8000
      targetPort: 8000  # matches containerPort "http" on the api container
      protocol: TCP
      name: http
  selector:
    app: ml-intern-api
---
# Autoscaler for ml-intern-api: 2-20 replicas on CPU 70% / memory 80%.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: ml-intern-api-hpa
  namespace: ml-intern
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: ml-intern-api
  minReplicas: 2
  maxReplicas: 20
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
  behavior:
    # Scale up aggressively (double within a minute) but scale down
    # cautiously (halve at most every 2 minutes, after a 5-minute window)
    # to avoid thrashing on bursty traffic.
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
        - type: Percent
          value: 100
          periodSeconds: 60
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Percent
          value: 50
          periodSeconds: 120