Go + Kubernetes 1.30: Building an AI Inference Platform from Scratch

Why Does AI Inference Need Kubernetes?

AI inference has unique operational challenges: GPUs are expensive, models load slowly, traffic is volatile, and cold starts are costly. Kubernetes 1.30 combined with Go is a strong fit for addressing them.

Go Scheduler: GPU-Aware Scheduling

package scheduler

import (
    "context"
    "fmt"

    v1 "k8s.io/api/core/v1"
    "k8s.io/kubernetes/pkg/scheduler/framework"
)

// GPUScheduler is a scheduler plugin whose Filter and Score methods take a
// node's GPU availability and GPU utilization into account.
type GPUScheduler struct {
    // handle is the scheduling framework handle; Score uses it to read node
    // information from the shared snapshot.
    handle framework.Handle
}

// Filter decides whether the node can host the pod with respect to GPUs.
//
// A pod whose GPU request (per getGPURequest) is zero passes without the node
// being inspected. Otherwise the node passes only when the GPU count reported
// by getAvailableGPU covers the request; if it does not, an Unschedulable
// status describing the shortfall is returned. A nil status means the node is
// acceptable.
func (s *GPUScheduler) Filter(ctx context.Context, state *framework.CycleState,
    pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {

    requested := getGPURequest(pod)
    if requested == 0 {
        // Nothing to check for pods that do not ask for a GPU.
        return nil
    }

    if free := getAvailableGPU(nodeInfo); free < requested {
        reason := fmt.Sprintf("insufficient GPU: need %d, available %d", requested, free)
        return framework.NewStatus(framework.Unschedulable, reason)
    }

    return nil
}

// Score ranks a node for the pod: the lower the node's GPU utilization (as
// reported by getGPUUtilization), the higher the returned score.
//
// The node is looked up by name in the scheduler's shared snapshot. If the
// lookup fails or yields no NodeInfo, an error status is returned rather than
// passing a nil NodeInfo on to getGPUUtilization. The result is clamped to
// [framework.MinNodeScore, framework.MaxNodeScore] because the scheduling
// framework treats a plugin score outside that range as an error.
func (s *GPUScheduler) Score(ctx context.Context, state *framework.CycleState,
    pod *v1.Pod, nodeName string) (int64, *framework.Status) {

    nodeInfo, err := s.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
    if err != nil {
        return 0, framework.AsStatus(fmt.Errorf("getting node %q from snapshot: %w", nodeName, err))
    }
    if nodeInfo == nil {
        return 0, framework.NewStatus(framework.Error,
            fmt.Sprintf("node %q not found in snapshot", nodeName))
    }

    // NOTE(review): assumes getGPUUtilization reports a percentage in 0-100 —
    // confirm against the helper; the clamp below covers anything else.
    gpuUtilization := getGPUUtilization(nodeInfo)
    score := int64(100 - gpuUtilization)
    if score < framework.MinNodeScore {
        score = framework.MinNodeScore
    }
    if score > framework.MaxNodeScore {
        score = framework.MaxNodeScore
    }
    return score, nil
}

Model Serving: vLLM + Kubernetes

# Deployment running the vLLM OpenAI-compatible server, placed by the custom
# "gpu-scheduler" scheduler and requesting two NVIDIA GPUs per pod.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm-inference
spec:
  replicas: 3
  # apps/v1 requires spec.selector, and it must match the pod template labels;
  # without it the API server rejects the manifest.
  selector:
    matchLabels:
      app: llm-inference
  template:
    metadata:
      labels:
        app: llm-inference
    spec:
      schedulerName: gpu-scheduler
      containers:
      - name: vllm
        image: vllm/vllm-openai:latest
        # The server takes the model as a command-line flag, so MODEL_NAME is
        # expanded into the args by Kubernetes ($(VAR) syntax). Tensor
        # parallelism is set to the GPU count so both requested GPUs are used.
        args:
        - --model=$(MODEL_NAME)
        - --tensor-parallel-size=2
        resources:
          limits:
            nvidia.com/gpu: "2"
        env:
        - name: MODEL_NAME
          value: "Qwen/Qwen2.5-72B-Instruct"

HPA Auto-scaling

# HorizontalPodAutoscaler that scales the llm-inference Deployment between
# 2 and 10 replicas on a per-pod metric.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: llm-inference-hpa
spec:
  scaleTargetRef:
    # apiVersion is needed so the controller can resolve "Deployment" to the
    # apps group; without it the scale target lookup fails.
    apiVersion: apps/v1
    kind: Deployment
    name: llm-inference
  minReplicas: 2
  maxReplicas: 10
  metrics:
  # Pods metrics are read from the custom metrics API, so a metrics adapter
  # must expose the metric under this exact name.
  - type: Pods
    pods:
      metric:
        name: vllm_num_requests_running
      target:
        # Scale so that the average across pods stays at or below this value.
        type: AverageValue
        averageValue: "128"

Go API Gateway

// ModelRouter maps model names to backend endpoints and keeps a per-model
// counter that ProxyHandler uses to rotate across those endpoints.
type ModelRouter struct {
    // endpoints holds the backend URLs for each model, keyed by model name.
    endpoints map[string][]*url.URL
    // counters holds one atomically incremented request counter per model,
    // keyed by model name.
    counters  map[string]*atomic.Uint64
}

// ProxyHandler forwards the incoming request to one of the backends
// registered for the model named by the "model" route parameter, rotating
// through that model's endpoints round-robin.
//
// Responses written by the handler itself:
//   - 404 when the model has no entry in r.endpoints;
//   - 503 when the model is registered but its endpoint list is empty.
//
// Otherwise the response comes from the reverse proxy to the chosen backend.
func (r *ModelRouter) ProxyHandler(c *gin.Context) {
    model := c.Param("model")
    endpoints, ok := r.endpoints[model]
    if !ok {
        c.JSON(http.StatusNotFound, gin.H{"error": "model not found"})
        return
    }
    if len(endpoints) == 0 {
        // An empty list would otherwise make the modulo below divide by zero.
        c.JSON(http.StatusServiceUnavailable, gin.H{"error": "no endpoints available for model"})
        return
    }

    // A model without a counter entry falls back to its first endpoint
    // instead of dereferencing a nil counter.
    var idx uint64
    if counter := r.counters[model]; counter != nil {
        idx = counter.Add(1) % uint64(len(endpoints))
    }
    target := endpoints[idx]

    proxy := httputil.NewSingleHostReverseProxy(target)
    proxy.ServeHTTP(c.Writer, c.Request)
}

Summary

Go scheduler: GPU-aware scheduling for optimal GPU utilization
K8s HPA: Auto-scaling based on the number of in-flight inference requests per pod
Canary deployment: Zero-downtime model updates with auto-rollback
Observability: Full-chain monitoring of GPU utilization, queue length, and inference latency

An AI inference platform isn't about "deploying a model" — it's about "operating a system." Kubernetes is the foundation, and Go is the glue.