Go+Kubernetes 1.30实战:从零构建一个AI推理平台
技术架构
为什么AI推理平台需要Kubernetes?
AI推理服务有独特的运维挑战:GPU资源昂贵、模型加载慢、流量波动大、冷启动代价高。Kubernetes 1.30 + Go是解决这些挑战的最佳组合。
┌─────────────────────────────────────────────────┐
│                 AI推理平台架构                  │
│                                                 │
│   ┌───────────┐  ┌───────────┐  ┌───────────┐   │
│   │  Go API   │  │ Go调度器  │  │  Go监控   │   │
│   │  Gateway  │  │ Scheduler │  │  Monitor  │   │
│   └─────┬─────┘  └─────┬─────┘  └─────┬─────┘   │
│         │              │              │         │
│   ┌─────▼──────────────▼──────────────▼─────┐   │
│   │             Kubernetes 1.30             │   │
│   │    ┌───────┐   ┌───────┐   ┌───────┐    │   │
│   │    │  Pod  │   │  Pod  │   │  Pod  │    │   │
│   │    │ GPU 0 │   │ GPU 1 │   │ GPU 2 │    │   │
│   │    │ Model │   │ Model │   │ Model │    │   │
│   │    │   A   │   │   B   │   │   A   │    │   │
│   │    └───────┘   └───────┘   └───────┘    │   │
│   └─────────────────────────────────────────┘   │
└─────────────────────────────────────────────────┘
Go调度器:GPU感知调度
自定义调度器
package scheduler
import (
"context"
"fmt"
"sort"
v1 "k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// GPUScheduler is the receiver for the GPU-aware Filter and Score methods
// defined in this package.
type GPUScheduler struct {
	// handle is the scheduler framework handle; Score uses it to reach the
	// shared snapshot lister and look up NodeInfo by node name.
	handle framework.Handle
}
// Filter decides whether nodeInfo can host pod with respect to GPUs.
//
// It returns nil (the node passes) when getGPURequest reports zero for the
// pod, or when getAvailableGPU reports at least the requested amount for the
// node. Otherwise it returns an Unschedulable status whose message states
// both the requested and the available GPU counts.
func (s *GPUScheduler) Filter(ctx context.Context, state *framework.CycleState,
	pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
	need := getGPURequest(pod)
	if need == 0 {
		// Pods without a GPU request are never rejected by this plugin.
		return nil
	}

	have := getAvailableGPU(nodeInfo)
	if have >= need {
		return nil
	}

	reason := fmt.Sprintf("insufficient GPU: need %d, available %d", need, have)
	return framework.NewStatus(framework.Unschedulable, reason)
}
// Score ranks nodeName for pod: the lower the node's GPU utilization (as
// reported by getGPUUtilization), the higher the score, which spreads GPU
// load across nodes.
//
// It returns an Error status when the node cannot be found in the scheduler
// snapshot. The returned score is always within
// [framework.MinNodeScore, framework.MaxNodeScore].
func (s *GPUScheduler) Score(ctx context.Context, state *framework.CycleState,
	pod *v1.Pod, nodeName string) (int64, *framework.Status) {
	nodeInfo, err := s.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
	if err != nil {
		// Without this check a missing node would hand a nil NodeInfo to
		// getGPUUtilization.
		return 0, framework.NewStatus(framework.Error,
			fmt.Sprintf("getting node %q from snapshot: %v", nodeName, err))
	}

	// Prefer nodes with low GPU utilization (load balancing).
	score := int64(100 - getGPUUtilization(nodeInfo))

	// Clamp so an out-of-range utilization reading cannot produce a score
	// outside the range the scheduling framework accepts.
	if score < framework.MinNodeScore {
		score = framework.MinNodeScore
	}
	if score > framework.MaxNodeScore {
		score = framework.MaxNodeScore
	}
	return score, nil
}
// getGPURequest returns the total number of "nvidia.com/gpu" devices the pod
// asks for, summed over the limits of all of its regular containers.
//
// It returns 0 when no container declares a GPU limit. Init containers are
// not considered.
func getGPURequest(pod *v1.Pod) int64 {
	const gpuResource = v1.ResourceName("nvidia.com/gpu")

	var total int64
	for i := range pod.Spec.Containers {
		// Sum instead of returning on the first match: a pod whose containers
		// each hold GPUs needs all of them on the same node.
		if limit, ok := pod.Spec.Containers[i].Resources.Limits[gpuResource]; ok {
			total += limit.Value()
		}
	}
	return total
}
模型服务:vLLM + Kubernetes
Deployment配置
# Deployment that runs three vLLM replicas, each holding two GPUs and
# scheduled by the custom "gpu-scheduler".
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm-inference
  labels:
    app: llm-inference
    model: qwen2.5-72b
spec:
  replicas: 3
  selector:
    matchLabels:
      app: llm-inference
  template:
    metadata:
      labels:
        app: llm-inference
    spec:
      schedulerName: gpu-scheduler
      containers:
        - name: vllm
          image: vllm/vllm-openai:latest
          # The vLLM OpenAI server is configured through CLI flags, not through
          # these environment variables, so the values are forwarded with
          # Kubernetes $(VAR) expansion. Without this the settings below would
          # have no effect.
          args:
            - --model=$(MODEL_NAME)
            - --gpu-memory-utilization=$(GPU_MEMORY_UTILIZATION)
            - --max-num-seqs=$(MAX_NUM_SEQS)
            # Shard the model across both GPUs requested below.
            - --tensor-parallel-size=2
            # Store weights on the mounted model cache volume.
            - --download-dir=/models
          resources:
            limits:
              nvidia.com/gpu: "2"
            requests:
              nvidia.com/gpu: "2"
              memory: "32Gi"
          env:
            - name: MODEL_NAME
              value: "Qwen/Qwen2.5-72B-Instruct"
            - name: GPU_MEMORY_UTILIZATION
              value: "0.90"
            - name: MAX_NUM_SEQS
              value: "256"
          ports:
            - containerPort: 8000
          # Model loading is slow, so readiness checks start after two minutes.
          readinessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 120
            periodSeconds: 10
          volumeMounts:
            - name: model-cache
              mountPath: /models
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: model-cache-pvc
      # Allow scheduling onto nodes tainted for GPU workloads.
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
HPA自动扩缩容
# HorizontalPodAutoscaler for the llm-inference Deployment (2 to 10 replicas).
# Both metrics are custom per-pod metrics and therefore require a custom
# metrics adapter (for example prometheus-adapter) to be installed.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: llm-inference-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llm-inference
  minReplicas: 2
  maxReplicas: 10
  metrics:
    # Scale on the average number of in-flight requests per pod.
    - type: Pods
      pods:
        metric:
          name: vllm_num_requests_running
        target:
          type: AverageValue
          averageValue: "128"
    # GPU utilization is not a built-in resource metric ("Resource" metrics
    # only cover cpu and memory), so it is consumed as a Pods metric. The name
    # must match whatever the metrics adapter exposes; DCGM_FI_DEV_GPU_UTIL is
    # the gauge published by NVIDIA's DCGM exporter (0-100).
    - type: Pods
      pods:
        metric:
          name: DCGM_FI_DEV_GPU_UTIL
        target:
          type: AverageValue
          averageValue: "80"
  behavior:
    # Add at most 2 pods per minute, after a 60s stabilization window.
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
        - type: Pods
          value: 2
          periodSeconds: 60
    # Remove at most 1 pod every 2 minutes, after a 5 minute window, so that
    # expensive-to-start replicas are not dropped on short traffic dips.
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Pods
          value: 1
          periodSeconds: 120
Go API网关
模型路由与负载均衡
package gateway
import (
"net/http"
"net/http/httputil"
"net/url"
"sync/atomic"
"github.com/gin-gonic/gin"
)
// ModelRouter maps a model name to its backend endpoints and to the counter
// used to rotate between them. The maps are not guarded by a lock.
type ModelRouter struct {
	// endpoints holds the backend URLs registered for each model name.
	endpoints map[string][]*url.URL
	// counters holds one atomic request counter per model name.
	counters map[string]*atomic.Uint64
}
// NewModelRouter returns a ModelRouter with empty, ready-to-use endpoint and
// counter maps.
func NewModelRouter() *ModelRouter {
	router := new(ModelRouter)
	router.endpoints = map[string][]*url.URL{}
	router.counters = map[string]*atomic.Uint64{}
	return router
}
// RegisterModel appends the given backend URLs to the endpoint list of model
// and makes sure the model has a round-robin counter.
//
// URLs that fail to parse, or that lack a scheme or host, are skipped because
// they cannot be used as a reverse-proxy target. Calling RegisterModel again
// for the same model adds endpoints and keeps the existing counter.
func (r *ModelRouter) RegisterModel(model string, urls ...string) {
	for _, raw := range urls {
		parsed, err := url.Parse(raw)
		if err != nil || parsed.Scheme == "" || parsed.Host == "" {
			// Storing a nil or host-less URL would make the proxy panic or
			// fail on every request routed to it.
			continue
		}
		r.endpoints[model] = append(r.endpoints[model], parsed)
	}

	// Keep an existing counter so that handlers already holding a pointer to
	// it keep rotating over the same sequence.
	if _, ok := r.counters[model]; !ok {
		r.counters[model] = new(atomic.Uint64)
	}
}
// ProxyHandler is a gin handler that forwards the request to one of the
// endpoints registered for the ":model" path parameter.
//
// It answers 404 with a JSON error body when the model has no entry in the
// endpoint map. Otherwise it increments the model's counter, picks the
// endpoint at counter modulo the endpoint count, and serves the request
// through a reverse proxy created for that endpoint on each call.
func (r *ModelRouter) ProxyHandler(c *gin.Context) {
	name := c.Param("model")

	targets, found := r.endpoints[name]
	if !found {
		c.JSON(http.StatusNotFound, gin.H{"error": "model not found"})
		return
	}

	// Round-robin selection driven by the per-model atomic counter.
	next := r.counters[name].Add(1) % uint64(len(targets))
	httputil.NewSingleHostReverseProxy(targets[next]).ServeHTTP(c.Writer, c.Request)
}
金丝雀部署
# Flagger Canary for the llm-inference Deployment: traffic is shifted in 10%
# steps up to 50%, with the analysis evaluated every minute and the rollout
# aborted after 5 failed checks.
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
  name: llm-inference
spec:
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llm-inference
  service:
    port: 8000
  analysis:
    interval: 1m
    threshold: 5
    maxWeight: 50
    stepWeight: 10
    metrics:
      # Minimum request success rate, in percent.
      - name: request-success-rate
        thresholdRange:
          min: 99
        interval: 1m
      # Maximum request duration (milliseconds for Flagger's built-in check).
      - name: request-duration
        thresholdRange:
          max: 500
        interval: 1m
      # NOTE(review): this is not one of Flagger's built-in metrics; it
      # presumably needs a templateRef pointing at a MetricTemplate — confirm.
      - name: vllm-num-requests-waiting
        thresholdRange:
          max: 50
        interval: 1m
监控与可观测性
# Custom Prometheus alerting rules for the inference platform, shipped as a
# ConfigMap: one alert on request queue length and one on GPU utilization.
apiVersion: v1
kind: ConfigMap
metadata:
  name: vllm-metrics-config
data:
  vllm-rules.yml: |
    groups:
      - name: vllm
        rules:
          - alert: HighQueueLength
            expr: vllm_num_requests_waiting > 100
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: "LLM推理队列过长"
          - alert: HighGPUUtilization
            expr: nvidia_gpu_utilization > 95
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "GPU利用率过高"
总结
Go + Kubernetes构建AI推理平台的核心价值:
- Go调度器:GPU感知调度,优化GPU资源利用率
- K8s HPA:基于运行中请求数(vllm_num_requests_running)和GPU利用率自动扩缩容
- 金丝雀部署:模型更新零停机,自动回滚
- 可观测性:GPU利用率、队列长度、推理延迟全链路监控
AI推理平台不是"部署一个模型",而是"运营一套系统"——Kubernetes是基石,Go是胶水。
本站提供浏览器本地工具,免注册即可试用 →
#Go #Kubernetes #AI推理 #GPU调度 #云原生