Go OpenTelemetry分散式追蹤:從零接入到全鏈路可觀測的6個關鍵步驟

DevOps

微服務呼叫鏈成了黑箱

使用者回饋「下單慢」,你開啟日誌,看到的是一堆散落在十幾個服務裡的時間戳——訂單服務3ms、庫存服務2ms、支付服務……超時了?還是沒調到?你根本不知道一個請求經過了哪些服務、在哪個環節卡住。分散式追蹤就是解決這個問題的銀彈,而OpenTelemetry(OTel)已經成了事實標準。

本文將從零開始,帶你完成OTel SDK初始化→Trace/Span建立→上下文傳播→自動埋點→Jaeger/Tempo整合→指標關聯的6個關鍵步驟,讓微服務呼叫鏈從黑箱變成透明管道。


OpenTelemetry核心概念

概念 說明
Trace 一次完整請求的追蹤鏈路,由多個Span組成
Span 單個操作單元,包含名稱、耗時、狀態、屬性等
Context 追蹤上下文,包含TraceID/SpanID,跨程序傳播
Propagator 上下文傳播器,負責在HTTP/gRPC標頭注入和提取Context
TracerProvider Tracer工廠,負責建立和管理Tracer實例
SpanProcessor Span處理器,負責Span的批處理、過濾和匯出
Exporter 匯出器,將Span資料傳送到Jaeger/Tempo/OTLP等後端
Resource 資源描述,標識產生遙測資料的服務(服務名、版本等)

追蹤資料流

請求流程:
1. 入口服務收到請求,建立Root Span
2. 呼叫下游服務時,Propagator將Context注入HTTP/gRPC標頭
3. 下游服務從標頭提取Context,建立Child Span
4. Span完成後由SpanProcessor批處理
5. Exporter將Span資料傳送到Jaeger/Tempo
6. 在UI中檢視完整的呼叫鏈路圖

問題分析:分散式追蹤接入的5大挑戰

  1. SDK初始化複雜:TracerProvider、SpanProcessor、Exporter、Resource四者設定順序和依賴關係容易搞混
  2. 上下文傳播遺漏:跨服務呼叫時忘記傳播Context,導致鏈路斷裂
  3. Span粒度失控:粒度太粗看不到瓶頸,太細產生海量資料拖垮後端
  4. 自動埋點與手動埋點衝突:HTTP/gRPC自動埋點和業務手動Span容易重複或巢狀錯誤
  5. 指標與追蹤割裂:Metrics和Traces各自為戰,無法透過指標定位到具體Trace

分步實操:完整OTel接入

Step 1:初始化TracerProvider

package telemetry

import (
	"context"
	"fmt"
	"time"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
	"go.opentelemetry.io/otel/propagation"
	"go.opentelemetry.io/otel/sdk/resource"
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
	semconv "go.opentelemetry.io/otel/semconv/v1.26.0"
)

// Telemetry holds the tracer provider created by InitTelemetry so the caller
// can shut it down later through Shutdown.
type Telemetry struct {
	// provider is the SDK tracer provider that Shutdown delegates to.
	provider *sdktrace.TracerProvider
}

// InitTelemetry builds an OTLP/gRPC trace pipeline and installs it globally.
//
// serviceName and serviceVersion are recorded as resource attributes.
// otlpEndpoint is handed to the exporter unchanged (the article's examples use
// "localhost:4317"); the connection is made without transport security.
//
// On success the provider is registered with otel.SetTracerProvider and a
// composite TraceContext + Baggage propagator is registered with
// otel.SetTextMapPropagator. The returned Telemetry must be shut down by the
// caller. On failure nil and a wrapped error are returned.
func InitTelemetry(ctx context.Context, serviceName, serviceVersion, otlpEndpoint string) (*Telemetry, error) {
	// Build the resource first: it holds nothing that needs releasing, so a
	// failure here cannot leave a half-initialised exporter behind.
	res, err := resource.New(ctx,
		resource.WithAttributes(
			semconv.ServiceNameKey.String(serviceName),
			semconv.ServiceVersionKey.String(serviceVersion),
		),
	)
	if err != nil {
		return nil, fmt.Errorf("create resource: %w", err)
	}

	exporter, err := otlptracegrpc.New(ctx,
		otlptracegrpc.WithEndpoint(otlpEndpoint),
		otlptracegrpc.WithInsecure(),
	)
	if err != nil {
		return nil, fmt.Errorf("create OTLP exporter: %w", err)
	}

	// Spans are exported in batches: at most 512 per export, queued up to
	// 2048, with a 5-second batch timeout.
	bsp := sdktrace.NewBatchSpanProcessor(exporter,
		sdktrace.WithBatchTimeout(5*time.Second),
		sdktrace.WithMaxExportBatchSize(512),
		sdktrace.WithMaxQueueSize(2048),
	)

	provider := sdktrace.NewTracerProvider(
		sdktrace.WithResource(res),
		sdktrace.WithSpanProcessor(bsp),
		// Follow the parent's sampling decision when there is one; otherwise
		// sample by trace ID with a ratio of 0.5.
		sdktrace.WithSampler(sdktrace.ParentBased(
			sdktrace.TraceIDRatioBased(0.5),
		)),
	)

	otel.SetTracerProvider(provider)
	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
		propagation.TraceContext{},
		propagation.Baggage{},
	))

	return &Telemetry{provider: provider}, nil
}

// Shutdown shuts down the underlying tracer provider using ctx and returns
// its error.
//
// It is a no-op returning nil on a nil receiver or a Telemetry without a
// provider, so a deferred Shutdown stays safe when initialisation failed and
// its error was not checked.
func (t *Telemetry) Shutdown(ctx context.Context) error {
	if t == nil || t.provider == nil {
		return nil
	}
	return t.provider.Shutdown(ctx)
}

Step 2:建立Trace和Span

package service

import (
	"context"
	"fmt"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/codes"
	"go.opentelemetry.io/otel/trace"
)

// tracer is the package-level tracer, obtained from the global provider under
// the instrumentation name "order-service", used by this package to start spans.
var tracer = otel.Tracer("order-service")

// ProcessOrder runs the order pipeline (validation, then inventory
// reservation) inside an internal span named "ProcessOrder" tagged with the
// order ID.
//
// The first failing step is recorded on the span, the span status is set to
// Error with the error text, and that error is returned unchanged. When every
// step succeeds the span status is set to Ok and nil is returned.
func ProcessOrder(ctx context.Context, orderID string) error {
	ctx, span := tracer.Start(ctx, "ProcessOrder",
		trace.WithAttributes(
			attribute.String("order.id", orderID),
		),
		trace.WithSpanKind(trace.SpanKindInternal),
	)
	defer span.End()

	// Steps run in order; each receives the span-carrying context so that its
	// own span becomes a child of this one.
	pipeline := []func(context.Context, string) error{
		validateOrder,
		reserveInventory,
	}
	for _, step := range pipeline {
		stepErr := step(ctx, orderID)
		if stepErr == nil {
			continue
		}
		span.RecordError(stepErr)
		span.SetStatus(codes.Error, stepErr.Error())
		return stepErr
	}

	span.SetStatus(codes.Ok, "")
	return nil
}

// validateOrder checks the order ID inside its own "validateOrder" span.
//
// An empty orderID yields an error that is also recorded on the span with an
// Error status. Otherwise a "validation_passed" event is added and nil is
// returned.
func validateOrder(ctx context.Context, orderID string) error {
	// The derived context is intentionally dropped: nothing below starts a
	// child span or makes a downstream call, so keeping it would be a dead
	// assignment.
	_, span := tracer.Start(ctx, "validateOrder",
		trace.WithAttributes(attribute.String("order.id", orderID)),
	)
	defer span.End()

	if orderID == "" {
		err := fmt.Errorf("order ID is empty")
		span.RecordError(err)
		span.SetStatus(codes.Error, err.Error())
		return err
	}

	span.AddEvent("validation_passed", trace.WithAttributes(
		attribute.String("order.id", orderID),
	))
	return nil
}

// reserveInventory is a placeholder step: it opens a "reserveInventory" span
// tagged with the order ID and always returns nil.
func reserveInventory(ctx context.Context, orderID string) error {
	// The attribute is supplied at start, as validateOrder does, so it is
	// already present when the span is created. The derived context is
	// dropped because nothing below uses it.
	_, span := tracer.Start(ctx, "reserveInventory",
		trace.WithAttributes(attribute.String("order.id", orderID)),
	)
	defer span.End()

	return nil
}

Step 3:HTTP上下文傳播

package middleware

import (
	"net/http"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/propagation"
	"go.opentelemetry.io/otel/trace"
)

// HTTPMiddleware wraps next so that every request is served inside a server
// span.
//
// For each request it extracts any propagated trace context from the request
// headers, starts a span named "<METHOD> <path>" carrying the method, URL and
// host as attributes, passes the span-carrying context to next, and finally
// records the response status code on the span.
func HTTPMiddleware(next http.Handler) http.Handler {
	serve := func(w http.ResponseWriter, r *http.Request) {
		// Continue the caller's trace when the headers carry one.
		parentCtx := otel.GetTextMapPropagator().Extract(
			r.Context(),
			propagation.HeaderCarrier(r.Header),
		)

		ctx, span := otel.Tracer("http-server").Start(parentCtx, r.Method+" "+r.URL.Path,
			trace.WithSpanKind(trace.SpanKindServer),
			trace.WithAttributes(
				attribute.String("http.method", r.Method),
				attribute.String("http.url", r.URL.String()),
				attribute.String("http.host", r.Host),
			),
		)
		defer span.End()

		// The status defaults to 200 for handlers that never call WriteHeader.
		recorder := &responseWriter{ResponseWriter: w, statusCode: http.StatusOK}
		next.ServeHTTP(recorder, r.WithContext(ctx))

		span.SetAttributes(attribute.Int("http.status_code", recorder.statusCode))
	}
	return http.HandlerFunc(serve)
}

// responseWriter wraps an http.ResponseWriter and remembers the status code
// of the response that was actually sent, so it can be reported after the
// handler returns.
type responseWriter struct {
	http.ResponseWriter
	// statusCode is the final status; callers initialise it to the implicit
	// default (200) for handlers that never call WriteHeader.
	statusCode int
	// wroteHeader is true once the final status has been committed.
	wroteHeader bool
}

// WriteHeader records code as the response status and forwards the call.
//
// Only the first final status is recorded: later, superfluous calls are still
// forwarded (so the wrapped writer behaves as it would unwrapped) but do not
// overwrite the recorded value with a status the client never received.
func (rw *responseWriter) WriteHeader(code int) {
	// Informational 1xx responses (other than 101) may precede the final
	// header, so they are passed through without being recorded.
	if code >= 100 && code < 200 && code != http.StatusSwitchingProtocols {
		rw.ResponseWriter.WriteHeader(code)
		return
	}
	if !rw.wroteHeader {
		rw.statusCode = code
		rw.wroteHeader = true
	}
	rw.ResponseWriter.WriteHeader(code)
}

// Write forwards b to the wrapped writer. A Write before any WriteHeader
// commits the implicit 200 status, which a later WriteHeader cannot change.
func (rw *responseWriter) Write(b []byte) (int, error) {
	if !rw.wroteHeader {
		rw.statusCode = http.StatusOK
		rw.wroteHeader = true
	}
	return rw.ResponseWriter.Write(b)
}

// Unwrap returns the wrapped writer so http.ResponseController can reach
// optional interfaces that this wrapper does not re-export.
func (rw *responseWriter) Unwrap() http.ResponseWriter {
	return rw.ResponseWriter
}

Step 4:出站HTTP呼叫傳播Context

package client

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"time"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/propagation"
	"go.opentelemetry.io/otel/trace"
)

// InstrumentedClient wraps an *http.Client; its Do method is the traced entry
// point for outbound requests.
type InstrumentedClient struct {
	// client performs the actual HTTP round trips.
	client *http.Client
}

// NewInstrumentedClient returns an InstrumentedClient whose underlying HTTP
// client has a 30-second overall request timeout.
func NewInstrumentedClient() *InstrumentedClient {
	const requestTimeout = 30 * time.Second

	httpClient := &http.Client{Timeout: requestTimeout}
	return &InstrumentedClient{client: httpClient}
}

// Do sends an HTTP request inside a client span and propagates the trace
// context to the callee.
//
// The span is named "<method> <url>" and carries the method and URL as
// attributes. The current propagator injects the span context into the
// outgoing headers before the request is sent. Failures to build or execute
// the request are recorded on the span and returned wrapped; on success the
// response status code is added to the span and the response is returned.
// The caller owns the response and is responsible for closing its body.
func (c *InstrumentedClient) Do(ctx context.Context, method, url string, body io.Reader) (*http.Response, error) {
	ctx, span := otel.Tracer("http-client").Start(ctx, method+" "+url,
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(
			attribute.String("http.method", method),
			attribute.String("http.url", url),
		),
	)
	defer span.End()

	req, buildErr := http.NewRequestWithContext(ctx, method, url, body)
	if buildErr != nil {
		span.RecordError(buildErr)
		return nil, fmt.Errorf("create request: %w", buildErr)
	}

	// Write the trace headers so the downstream service can continue this trace.
	otel.GetTextMapPropagator().Inject(ctx, propagation.HeaderCarrier(req.Header))

	resp, sendErr := c.client.Do(req)
	if sendErr != nil {
		span.RecordError(sendErr)
		return nil, fmt.Errorf("execute request: %w", sendErr)
	}

	span.SetAttributes(attribute.Int("http.status_code", resp.StatusCode))
	return resp, nil
}

Step 5:gRPC自動埋點

package main

import (
	"context"

	"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

// NewGRPCClient dials target and returns a client connection instrumented
// with the otelgrpc client stats handler. The connection uses insecure
// (plaintext) transport credentials.
//
// NOTE(review): recent grpc-go releases deprecate grpc.DialContext in favour
// of grpc.NewClient — confirm against the grpc version this project pins.
func NewGRPCClient(ctx context.Context, target string) (*grpc.ClientConn, error) {
	plaintext := grpc.WithTransportCredentials(insecure.NewCredentials())
	tracing := grpc.WithStatsHandler(otelgrpc.NewClientHandler())

	conn, dialErr := grpc.DialContext(ctx, target, plaintext, tracing)
	if dialErr != nil {
		return nil, dialErr
	}
	return conn, nil
}

// NewGRPCServer returns a gRPC server with the otelgrpc server stats handler
// installed.
func NewGRPCServer() *grpc.Server {
	tracing := grpc.StatsHandler(otelgrpc.NewServerHandler())
	return grpc.NewServer(tracing)
}

Step 6:Metrics與Trace關聯

package telemetry

import (
	"context"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
	"go.opentelemetry.io/otel/sdk/resource"
	semconv "go.opentelemetry.io/otel/semconv/v1.26.0"
)

// MetricsProvider holds the meter provider created by InitMetrics so the
// caller can shut it down later through Shutdown.
type MetricsProvider struct {
	// provider is the SDK meter provider that Shutdown delegates to.
	provider *sdkmetric.MeterProvider
}

// InitMetrics builds an OTLP/gRPC metrics pipeline and installs it as the
// global meter provider.
//
// serviceName is recorded as a resource attribute. otlpEndpoint is handed to
// the exporter unchanged, and the connection is made without transport
// security. Metrics are exported through a periodic reader with its default
// settings. The returned MetricsProvider must be shut down by the caller; on
// failure nil and the underlying error are returned.
func InitMetrics(ctx context.Context, serviceName, otlpEndpoint string) (*MetricsProvider, error) {
	// Build the resource first: it holds nothing that needs releasing, so a
	// failure here cannot leave a half-initialised exporter behind.
	res, err := resource.New(ctx,
		resource.WithAttributes(
			semconv.ServiceNameKey.String(serviceName),
		),
	)
	if err != nil {
		return nil, err
	}

	exporter, err := otlpmetricgrpc.New(ctx,
		otlpmetricgrpc.WithEndpoint(otlpEndpoint),
		otlpmetricgrpc.WithInsecure(),
	)
	if err != nil {
		return nil, err
	}

	provider := sdkmetric.NewMeterProvider(
		sdkmetric.WithResource(res),
		sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter)),
	)

	otel.SetMeterProvider(provider)
	return &MetricsProvider{provider: provider}, nil
}

// Shutdown shuts down the underlying meter provider using ctx and returns its
// error.
//
// It is a no-op returning nil on a nil receiver or a MetricsProvider without
// a provider, so a deferred Shutdown stays safe when initialisation failed.
func (m *MetricsProvider) Shutdown(ctx context.Context) error {
	if m == nil || m.provider == nil {
		return nil
	}
	return m.provider.Shutdown(ctx)
}

避坑指南

坑1:忘記設定全域Propagator

// ❌ 錯誤:沒有設定Propagator,Context無法跨程序傳播
provider := sdktrace.NewTracerProvider(...)
otel.SetTracerProvider(provider)
// 缺少 otel.SetTextMapPropagator(...)

// ✅ 正確:設定Composite Propagator
otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
    propagation.TraceContext{},
    propagation.Baggage{},
))

坑2:Span沒有呼叫End()

// ❌ 錯誤:Span永遠不會結束,不會被匯出
ctx, span := tracer.Start(ctx, "operation")
doWork(ctx)
// 忘記 span.End()

// ✅ 正確:使用defer確保Span結束
ctx, span := tracer.Start(ctx, "operation")
defer span.End()
doWork(ctx)

坑3:取樣率設定不當

// ❌ 錯誤:生產環境AlwaysSample導致海量資料
sdktrace.WithSampler(sdktrace.AlwaysSample())

// ✅ 正確:使用ParentBased+TraceIDRatioBased
sdktrace.WithSampler(sdktrace.ParentBased(
    sdktrace.TraceIDRatioBased(0.1), // 取樣10%
))

坑4:在goroutine中遺失Context

// ❌ 錯誤:goroutine中沒有傳遞ctx
go func() {
    ctx, span := tracer.Start(context.Background(), "async_work")
    defer span.End()
}()

// ✅ 正確:將父Context傳入goroutine
go func(ctx context.Context) {
    ctx, span := tracer.Start(ctx, "async_work")
    defer span.End()
}(ctx)

坑5:Shutdown超時導致資料遺失

// ❌ 錯誤:沒有給Shutdown足夠時間
// main is the "wrong" variant of this pitfall: Shutdown is deferred with a
// context that has no deadline.
// NOTE(review): the error from InitTelemetry is discarded and ctx is not
// declared in this fragment — it is an illustrative sketch, not a complete program.
func main() {
    tel, _ := telemetry.InitTelemetry(ctx, "svc", "1.0", "localhost:4317")
    defer tel.Shutdown(context.Background()) // context.Background() has no deadline, so nothing bounds how long this call may take
}

// ✅ 正確:給Shutdown足夠的超時時間
// main is the "right" variant of this pitfall: the deferred closure gives
// Shutdown its own context with a 10-second timeout.
// NOTE(review): the errors from InitTelemetry and Shutdown are discarded and
// ctx is not declared in this fragment — it is an illustrative sketch.
func main() {
    tel, _ := telemetry.InitTelemetry(ctx, "svc", "1.0", "localhost:4317")
    defer func() {
        // Derived from context.Background() so the 10-second limit is the only deadline on the shutdown call.
        shutdownCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
        defer cancel()
        tel.Shutdown(shutdownCtx)
    }()
}

報錯排查

序號 報錯訊息 原因 解決方法
1 connection refused: localhost:4317 OTLP Collector未啟動 啟動otel-collector容器,檢查連接埠對映
2 traces not showing in Jaeger Exporter設定錯誤或取樣率為0 檢查Exporter目標位址,確認取樣率>0
3 context deadline exceeded Collector回應慢或網路不通 增加超時時間,檢查網路連通性
4 span missing parent 上下文傳播失敗 確認Propagator已設定,檢查HTTP標頭注入
5 resource attributes missing Resource未設定 新增resource.WithAttributes(semconv.ServiceNameKey.String(...))
6 too many open files 重複建立Exporter/gRPC連線(例如每個請求都初始化一次TracerProvider)且未關閉,耗盡檔案描述元 全域只初始化一次TracerProvider並重複使用,程序退出時呼叫Shutdown;必要時調高ulimit -n
7 trace_id not found in baggage Baggage和TraceContext混淆 TraceContext傳播TraceID,Baggage傳播業務資料
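8 grpc: no transport security set 建立gRPC連線時未設定任何傳輸憑證(既沒有TLS,也沒有明確宣告insecure) 開發環境明確加上grpc.WithTransportCredentials(insecure.NewCredentials())(OTLP匯出器則用WithInsecure()),生產環境設定TLS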
9 duplicate span name 多個Span同名導致混淆 為Span新增區分性屬性或使用動態名稱
10 metric reader timeout Metric匯出超時 宣告PeriodicReader時增加Interval和Timeout

進階最佳化

1. 自定義SpanProcessor實現敏感資料過濾

package telemetry

import (
	"context"
	"strings"

	"go.opentelemetry.io/otel/attribute"
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
)

// sanitizingProcessor is a span processor decorator that sits in front of
// another processor and is configured with a list of sensitive key fragments.
type sanitizingProcessor struct {
	// next is the wrapped processor every call is eventually forwarded to.
	next sdktrace.SpanProcessor
	// sensitiveKeys are the fragments isSensitive matches attribute keys against.
	sensitiveKeys []string
}

// NewSanitizingProcessor wraps next in a sanitizingProcessor configured with
// sensitiveKeys. The slice is stored as given, not copied.
func NewSanitizingProcessor(next sdktrace.SpanProcessor, sensitiveKeys []string) sdktrace.SpanProcessor {
	proc := new(sanitizingProcessor)
	proc.next = next
	proc.sensitiveKeys = sensitiveKeys
	return proc
}

// OnStart forwards the span-start notification to the wrapped processor
// unchanged.
func (p *sanitizingProcessor) OnStart(ctx context.Context, s sdktrace.ReadWriteSpan) {
	p.next.OnStart(ctx, s)
}

func (p *sanitizingProcessor) OnEnd(s sdktrace.ReadOnlySpan) {
	attrs := s.Attributes()
	filtered := make([]attribute.KeyValue, 0, len(attrs))
	for _, attr := range attrs {
		if p.isSensitive(string(attr.Key)) {
			filtered = append(filtered, attribute.String(string(attr.Key), "[REDACTED]"))
		} else {
			filtered = append(filtered, attr)
		}
	}
	p.next.OnEnd(s)
}

// isSensitive reports whether key contains any of the configured sensitive
// fragments, compared case-insensitively.
//
// Empty fragments are ignored: every string contains "", so an accidental
// empty entry in the configuration would otherwise mark every key sensitive.
func (p *sanitizingProcessor) isSensitive(key string) bool {
	// Lower-case the key once instead of once per configured fragment.
	lowerKey := strings.ToLower(key)
	for _, sk := range p.sensitiveKeys {
		if sk == "" {
			continue
		}
		if strings.Contains(lowerKey, strings.ToLower(sk)) {
			return true
		}
	}
	return false
}

// ForceFlush delegates to the wrapped processor and returns its error.
func (p *sanitizingProcessor) ForceFlush(ctx context.Context) error {
	return p.next.ForceFlush(ctx)
}

// Shutdown delegates to the wrapped processor and returns its error.
func (p *sanitizingProcessor) Shutdown(ctx context.Context) error {
	return p.next.Shutdown(ctx)
}

2. 基於錯誤率的動態取樣

package telemetry

import (
	"sync/atomic"

	sdktrace "go.opentelemetry.io/otel/sdk/trace"
)

// errorAwareSampler is a sampler that switches between two sampling ratios
// depending on the error rate it has observed so far.
type errorAwareSampler struct {
	// errorCount is the number of sampling requests that carried an "error" attribute.
	errorCount atomic.Int64
	// totalCount is the number of sampling requests seen.
	totalCount atomic.Int64
	// baseRatio is the ratio used while the observed error rate is low.
	baseRatio float64
	// errorRatio is the ratio used once the observed error rate exceeds the threshold.
	errorRatio float64
}

// NewErrorAwareSampler returns a sampler that uses baseRatio normally and
// errorRatio while the observed error rate is elevated. Both counters start
// at zero.
func NewErrorAwareSampler(baseRatio, errorRatio float64) sdktrace.Sampler {
	sampler := new(errorAwareSampler)
	sampler.baseRatio = baseRatio
	sampler.errorRatio = errorRatio
	return sampler
}

// ShouldSample makes a trace-ID-ratio sampling decision whose ratio depends
// on the error rate observed so far.
//
// Each call counts as one request; a request whose start attributes include
// the key "error" also counts as one error, however many times the key
// appears. When errors exceed 1% of all requests seen, errorRatio is used
// instead of baseRatio. The counters are cumulative and are never reset by
// this type, so the rate is a lifetime average rather than a recent window.
func (s *errorAwareSampler) ShouldSample(p sdktrace.SamplingParameters) sdktrace.SamplingResult {
	total := s.totalCount.Add(1)

	// Take one snapshot of the error counter so the comparison and the
	// division below work on the same value.
	errs := s.errorCount.Load()
	for _, attr := range p.Attributes {
		if attr.Key == "error" {
			errs = s.errorCount.Add(1)
			break // one span is at most one error, even with a repeated key
		}
	}

	ratio := s.baseRatio
	if errs > 0 && float64(errs)/float64(total) > 0.01 {
		ratio = s.errorRatio
	}

	return sdktrace.TraceIDRatioBased(ratio).ShouldSample(p)
}

// Description returns the fixed name of this sampler, "ErrorAwareSampler".
func (s *errorAwareSampler) Description() string {
	return "ErrorAwareSampler"
}

3. Trace與Log關聯

package telemetry

import (
	"context"
	"log/slog"

	"go.opentelemetry.io/otel/trace"
)

type traceHandler struct {
	next slog.Handler
}

func NewTraceHandler(next slog.Handler) slog.Handler {
	return &traceHandler{next: next}
}

// Handle adds "trace_id" and "span_id" attributes to the record when ctx
// carries a valid span context, then passes the record to the wrapped
// handler and returns its error. Records logged outside a span are forwarded
// unchanged.
func (h *traceHandler) Handle(ctx context.Context, r slog.Record) error {
	spanCtx := trace.SpanContextFromContext(ctx)
	if !spanCtx.IsValid() {
		return h.next.Handle(ctx, r)
	}

	// Copies of a slog.Record share state, so clone before adding attributes
	// to avoid altering a record another holder may still be using.
	r = r.Clone()
	r.AddAttrs(
		slog.String("trace_id", spanCtx.TraceID().String()),
		slog.String("span_id", spanCtx.SpanID().String()),
	)
	return h.next.Handle(ctx, r)
}

// Enabled reports whatever the wrapped handler reports for level.
func (h *traceHandler) Enabled(ctx context.Context, level slog.Level) bool {
	return h.next.Enabled(ctx, level)
}

// WithAttrs returns a new traceHandler wrapping the handler that the wrapped
// handler derives for attrs, so the derived logger keeps adding trace IDs.
func (h *traceHandler) WithAttrs(attrs []slog.Attr) slog.Handler {
	derived := h.next.WithAttrs(attrs)
	return &traceHandler{next: derived}
}

// WithGroup returns a new traceHandler wrapping the handler that the wrapped
// handler derives for the group name, so the derived logger keeps adding
// trace IDs.
//
// An empty name returns the receiver itself, as the slog.Handler contract
// requires.
func (h *traceHandler) WithGroup(name string) slog.Handler {
	if name == "" {
		return h
	}
	return &traceHandler{next: h.next.WithGroup(name)}
}

對比分析

維度 OpenTelemetry Jaeger Client Zipkin Brave SkyWalking Datadog APM
廠商中立 ✅CNCF標準 ⚠️僅Jaeger ⚠️僅Zipkin ❌Apache但生態封閉 ❌商業
多語言支援 ✅11+語言 ⚠️6種 ⚠️Java為主 ⚠️8種 ✅10+
Metrics整合 ✅原生 ❌需Prometheus
自動埋點 ✅HTTP/gRPC ⚠️有限
取樣策略 ✅靈活 ⚠️簡單 ⚠️簡單 ❌固定
社群活躍度 ⭐極高 ⭐高 ⭐中 ⭐高 ⭐商業
成本 免費 免費 免費 免費 $31/月起

總結:OpenTelemetry不是又一個APM工具,而是可觀測性的基礎設施層。它的核心價值在於:一次接入、多後端匯出、Trace/Metrics/Logs三合一。2026年的最佳實踐:用OTel SDK統一埋點→OTLP協定傳送到Collector→Collector路由到Jaeger(Trace)+Prometheus(Metrics)+Loki(Log)。關鍵是在SDK初始化時就設定好Propagator和取樣策略,避免後期鏈路斷裂或資料洪峰。


線上工具推薦

本站提供瀏覽器本地工具,免註冊即可試用 →

#Go#OpenTelemetry#分布式追踪#可观测性#链路追踪#2026#DevOps