Go+Kubernetes 1.30实战：从零构建一个AI推理平台

为什么AI推理平台需要Kubernetes？

AI推理服务有独特的运维挑战：GPU资源昂贵、模型加载慢、流量波动大、冷启动代价高。Kubernetes 1.30 + Go是解决这些挑战的最佳组合。

┌──────────────────────────────────────────────────┐
│              AI推理平台架构                         │
│                                                  │
│  ┌─────────┐  ┌─────────┐  ┌─────────┐          │
│  │ Go API  │  │ Go调度器 │  │ Go监控   │          │
│  │ Gateway │  │ Scheduler│  │ Monitor  │          │
│  └────┬────┘  └────┬────┘  └────┬────┘          │
│       │            │            │                 │
│  ┌────▼────────────▼────────────▼────┐           │
│  │        Kubernetes 1.30            │           │
│  │  ┌──────┐ ┌──────┐ ┌──────┐     │           │
│  │  │ Pod  │ │ Pod  │ │ Pod  │     │           │
│  │  │GPU 0 │ │GPU 1 │ │GPU 2 │     │           │
│  │  │Model │ │Model │ │Model │     │           │
│  │  │ A    │ │ B    │ │ A    │     │           │
│  │  └──────┘ └──────┘ └──────┘     │           │
│  └──────────────────────────────────┘           │
└──────────────────────────────────────────────────┘

Go调度器：GPU感知调度

自定义调度器

package scheduler

import (
    "context"
    "fmt"
    "sort"

    v1 "k8s.io/api/core/v1"
    "k8s.io/kubernetes/pkg/scheduler/framework"
)

// GPUScheduler is a scheduler plugin whose Filter and Score methods take a
// node's GPU availability and utilization into account.
type GPUScheduler struct {
    // handle is the scheduling framework handle; Score uses it to look up a
    // node's NodeInfo from the shared snapshot lister.
    handle framework.Handle
}

// Filter decides whether the node described by nodeInfo can host pod.
//
// A pod whose GPU request is zero is accepted on any node. Otherwise the node
// is accepted only when its available GPU count covers the request; if it
// does not, an Unschedulable status carrying the requested and available
// counts is returned. A nil status means the node passed the filter.
func (s *GPUScheduler) Filter(ctx context.Context, state *framework.CycleState,
    pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {

    wanted := getGPURequest(pod)
    if wanted == 0 {
        // Pods that do not ask for a GPU are not constrained by this plugin.
        return nil
    }

    free := getAvailableGPU(nodeInfo)
    if free >= wanted {
        return nil
    }

    reason := fmt.Sprintf("insufficient GPU: need %d, available %d", wanted, free)
    return framework.NewStatus(framework.Unschedulable, reason)
}

// Score ranks the node named nodeName for pod: the lower the node's GPU
// utilization, the higher the score, so that load spreads across nodes.
//
// It returns an error status (and a zero score) when the node cannot be found
// in the scheduler's shared snapshot, instead of passing a nil NodeInfo on to
// getGPUUtilization. The resulting score is clamped to
// [framework.MinNodeScore, framework.MaxNodeScore] because the range of
// getGPUUtilization is not enforced here and the framework only accepts
// scores inside that interval.
func (s *GPUScheduler) Score(ctx context.Context, state *framework.CycleState,
    pod *v1.Pod, nodeName string) (int64, *framework.Status) {

    nodeInfo, err := s.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
    if err != nil {
        return 0, framework.AsStatus(
            fmt.Errorf("getting node %q from snapshot: %w", nodeName, err))
    }

    gpuUtilization := getGPUUtilization(nodeInfo)

    // Prefer nodes with low GPU utilization (load balancing).
    score := int64(100 - gpuUtilization)
    if score < framework.MinNodeScore {
        score = framework.MinNodeScore
    }
    if score > framework.MaxNodeScore {
        score = framework.MaxNodeScore
    }
    return score, nil
}

// getGPURequest returns the number of "nvidia.com/gpu" devices the pod needs,
// derived from container resource limits.
//
// The GPU limits of all regular containers are summed, because those
// containers run side by side. Init containers are then folded in by taking
// the maximum of that sum and each init container's own limit.
// NOTE(review): this treats every init container as a run-to-completion
// container; restartable (sidecar) init containers are not given special
// handling — confirm whether the platform uses them with GPUs.
//
// A pod with no GPU limit on any container yields 0.
func getGPURequest(pod *v1.Pod) int64 {
    const gpuResource = v1.ResourceName("nvidia.com/gpu")

    var total int64
    for i := range pod.Spec.Containers {
        if limit, ok := pod.Spec.Containers[i].Resources.Limits[gpuResource]; ok {
            total += limit.Value()
        }
    }

    for i := range pod.Spec.InitContainers {
        if limit, ok := pod.Spec.InitContainers[i].Resources.Limits[gpuResource]; ok {
            if v := limit.Value(); v > total {
                total = v
            }
        }
    }

    return total
}

模型服务：vLLM + Kubernetes

Deployment配置

# Deployment running the vLLM OpenAI-compatible server for
# Qwen/Qwen2.5-72B-Instruct, scheduled by the custom "gpu-scheduler".
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm-inference
  labels:
    app: llm-inference
    model: qwen2.5-72b
spec:
  replicas: 3
  selector:
    matchLabels:
      app: llm-inference
  template:
    metadata:
      labels:
        app: llm-inference
    spec:
      schedulerName: gpu-scheduler
      containers:
      - name: vllm
        # NOTE(review): ":latest" is a mutable tag; pin a specific vLLM
        # release so that rollouts and rollbacks are reproducible.
        image: vllm/vllm-openai:latest
        # The vLLM server is configured through command-line flags, not
        # through these environment variables, so the values are passed
        # explicitly. Kubernetes expands $(VAR) from the container's env.
        args:
        - "--model"
        - "$(MODEL_NAME)"
        # Store downloaded weights on the mounted model cache volume.
        - "--download-dir"
        - "/models"
        # Must match the nvidia.com/gpu count requested below.
        - "--tensor-parallel-size"
        - "2"
        - "--gpu-memory-utilization"
        - "$(GPU_MEMORY_UTILIZATION)"
        - "--max-num-seqs"
        - "$(MAX_NUM_SEQS)"
        resources:
          # Extended resources such as GPUs require requests == limits.
          limits:
            nvidia.com/gpu: "2"
          requests:
            nvidia.com/gpu: "2"
            memory: "32Gi"
        env:
        - name: MODEL_NAME
          value: "Qwen/Qwen2.5-72B-Instruct"
        - name: GPU_MEMORY_UTILIZATION
          value: "0.90"
        - name: MAX_NUM_SEQS
          value: "256"
        ports:
        - containerPort: 8000
        readinessProbe:
          httpGet:
            path: /health
            port: 8000
          # Model loading is slow; give the server time before probing.
          initialDelaySeconds: 120
          periodSeconds: 10
        volumeMounts:
        - name: model-cache
          mountPath: /models
      volumes:
      - name: model-cache
        persistentVolumeClaim:
          claimName: model-cache-pvc
      tolerations:
      # Allow scheduling onto nodes tainted for GPU workloads.
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule

HPA自动扩缩容

# HorizontalPodAutoscaler for the llm-inference Deployment.
# Both metrics below are per-pod custom metrics, so a custom metrics API
# provider (for example Prometheus Adapter) must expose them under these names.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: llm-inference-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llm-inference
  minReplicas: 2
  maxReplicas: 10
  metrics:
  # Average number of in-flight requests per pod.
  # NOTE(review): vLLM exports this as "vllm:num_requests_running"; the name
  # here assumes the metrics adapter renames it — confirm the adapter rules.
  - type: Pods
    pods:
      metric:
        name: vllm_num_requests_running
      target:
        type: AverageValue
        averageValue: "128"
  # GPU utilization is not a built-in resource metric ("Resource" metrics only
  # cover container resources such as cpu and memory), so it is consumed as a
  # per-pod custom metric instead.
  # NOTE(review): DCGM_FI_DEV_GPU_UTIL is the NVIDIA DCGM exporter's GPU
  # utilization gauge (percent); adjust if a different exporter is deployed.
  - type: Pods
    pods:
      metric:
        name: DCGM_FI_DEV_GPU_UTIL
      target:
        type: AverageValue
        averageValue: "80"
  behavior:
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
      # Add at most 2 pods per 60 seconds.
      - type: Pods
        value: 2
        periodSeconds: 60
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
      # Remove at most 1 pod per 120 seconds.
      - type: Pods
        value: 1
        periodSeconds: 120

Go API网关

模型路由与负载均衡

package gateway

import (
    "net/http"
    "net/http/httputil"
    "net/url"
    "sync/atomic"

    "github.com/gin-gonic/gin"
)

// ModelRouter maps model names to backend endpoints and keeps a per-model
// counter that ProxyHandler uses to rotate requests across those endpoints.
//
// NOTE(review): the maps are not guarded by a lock; this presumably relies on
// all RegisterModel calls finishing before requests are served — confirm.
type ModelRouter struct {
    // endpoints holds, per model name, the backend URLs registered for it.
    endpoints map[string][]*url.URL
    // counters holds one atomic request counter per model name.
    counters  map[string]*atomic.Uint64
}

// NewModelRouter returns a ModelRouter whose endpoint and counter tables are
// allocated and empty, ready for RegisterModel calls.
func NewModelRouter() *ModelRouter {
    router := new(ModelRouter)
    router.endpoints = map[string][]*url.URL{}
    router.counters = map[string]*atomic.Uint64{}
    return router
}

// RegisterModel adds the given backend URLs to model's endpoint list and
// (re)sets the model's round-robin counter to zero.
//
// Entries that fail to parse, or that lack a scheme or host, are skipped:
// the signature has no error return, and storing such an entry would make
// ProxyHandler build a reverse proxy for an unusable (possibly nil) target.
// Callers that need to detect bad input should validate URLs beforehand.
func (r *ModelRouter) RegisterModel(model string, urls ...string) {
    for _, raw := range urls {
        parsed, err := url.Parse(raw)
        if err != nil || parsed.Scheme == "" || parsed.Host == "" {
            continue
        }
        r.endpoints[model] = append(r.endpoints[model], parsed)
    }
    r.counters[model] = &atomic.Uint64{}
}

// ProxyHandler forwards the request to one of the backends registered for
// the model named by the "model" route parameter, rotating through the
// backends with the model's atomic counter.
//
// Responses:
//   - 404 when no endpoints are registered under the model name.
//   - 503 when the model is known but has no usable backend (empty endpoint
//     list, missing counter, or a nil endpoint entry); without these guards
//     the modulo below would divide by zero or the proxy would dereference
//     a nil target.
//   - otherwise, whatever the selected backend returns.
func (r *ModelRouter) ProxyHandler(c *gin.Context) {
    model := c.Param("model")
    endpoints, ok := r.endpoints[model]
    if !ok {
        c.JSON(http.StatusNotFound, gin.H{"error": "model not found"})
        return
    }

    counter := r.counters[model]
    if len(endpoints) == 0 || counter == nil {
        c.JSON(http.StatusServiceUnavailable, gin.H{"error": "no endpoints available"})
        return
    }

    // Add returns the incremented value, so the first request goes to
    // index 1 % len(endpoints); subsequent requests rotate from there.
    idx := counter.Add(1) % uint64(len(endpoints))
    target := endpoints[idx]
    if target == nil {
        c.JSON(http.StatusServiceUnavailable, gin.H{"error": "no endpoints available"})
        return
    }

    proxy := httputil.NewSingleHostReverseProxy(target)
    proxy.ServeHTTP(c.Writer, c.Request)
}

金丝雀部署

# Flagger Canary that targets the llm-inference Deployment and exposes it on
# port 8000. The analysis section lists three metric checks, each evaluated
# over a 1m interval.
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
  name: llm-inference
spec:
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llm-inference
  service:
    port: 8000
  analysis:
    interval: 1m
    threshold: 5
    maxWeight: 50
    stepWeight: 10
    metrics:
    - name: request-success-rate
      thresholdRange:
        min: 99
      interval: 1m
    # NOTE(review): presumably milliseconds; LLM generation requests often
    # take longer than 500 — confirm the unit and the limit.
    - name: request-duration
      thresholdRange:
        max: 500
      interval: 1m
    # NOTE(review): this looks like a custom metric rather than a Flagger
    # built-in; it probably needs a templateRef pointing at a MetricTemplate
    # that defines the query — confirm against the Flagger setup.
    - name: vllm-num-requests-waiting
      thresholdRange:
        max: 50
      interval: 1m

监控与可观测性

# Prometheus alerting rules for the vLLM inference service, shipped as a
# ConfigMap under the key "vllm-rules.yml".
apiVersion: v1
kind: ConfigMap
metadata:
  name: vllm-metrics-config
data:
  vllm-rules.yml: |
    groups:
    - name: vllm
      rules:
      # Fires when more than 100 requests have been queued for 2 minutes.
      # vLLM exports its metrics with a "vllm:" prefix, so the raw series
      # name in Prometheus is vllm:num_requests_waiting.
      - alert: HighQueueLength
        expr: vllm:num_requests_waiting > 100
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "LLM推理队列过长"
      # Fires when GPU utilization stays above 95 for 5 minutes.
      # NOTE(review): the series name depends on the GPU exporter in use
      # (the NVIDIA DCGM exporter publishes DCGM_FI_DEV_GPU_UTIL) — confirm
      # that nvidia_gpu_utilization exists in this Prometheus.
      - alert: HighGPUUtilization
        expr: nvidia_gpu_utilization > 95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "GPU利用率过高"

总结

Go + Kubernetes构建AI推理平台的核心价值：

Go调度器：GPU感知调度，优化GPU资源利用率
K8s HPA：基于运行中的推理请求数和GPU利用率自动扩缩容
金丝雀部署：模型更新零停机，自动回滚
可观测性：GPU利用率、队列长度、推理延迟全链路监控

AI推理平台不是"部署一个模型"，而是"运营一套系统"——Kubernetes是基石，Go是胶水。