A Practical Guide to Monitoring Go Microservices
Introduction
Monitoring is essential in a microservices architecture. This article walks through building a complete monitoring stack for Go microservices using Prometheus, Grafana, and related tools.
Monitoring Metrics
Collecting Basic Metrics
```go
package metrics

import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
)

var (
    // RequestCounter counts HTTP requests by method, endpoint, and status code.
    RequestCounter = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "http_requests_total",
            Help: "Total number of HTTP requests",
        },
        []string{"method", "endpoint", "status"},
    )

    // RequestDuration is a histogram of request latency in seconds.
    RequestDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "http_request_duration_seconds",
            Help:    "HTTP request duration in seconds",
            Buckets: []float64{0.1, 0.3, 0.5, 0.7, 1, 2, 5},
        },
        []string{"method", "endpoint"},
    )

    // GoroutineCount tracks the current number of goroutines.
    GoroutineCount = promauto.NewGauge(
        prometheus.GaugeOpts{
            Name: "goroutine_count",
            Help: "Number of goroutines",
        },
    )
)
```
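The GoroutineCount gauge above is only declared; something still has to refresh it. Below is a minimal sketch of one way to do that in the same metrics package. The function name StartRuntimeCollector and the ticker-based approach are my own assumptions, not part of the original code:

```go
package metrics

import (
    "runtime"
    "time"
)

// StartRuntimeCollector periodically refreshes GoroutineCount in a
// background goroutine. Call it once during service startup; the interval
// is an arbitrary choice, typically matched to the Prometheus scrape interval.
func StartRuntimeCollector(interval time.Duration) {
    go func() {
        ticker := time.NewTicker(interval)
        defer ticker.Stop()
        for range ticker.C {
            GoroutineCount.Set(float64(runtime.NumGoroutine()))
        }
    }()
}
```

Note that client_golang's default registry already ships a Go collector that exposes go_goroutines, so a hand-rolled gauge like this mainly matters when you want a custom metric name or a custom registry.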
Middleware Implementation
```go
package metrics

import (
    "strconv"
    "time"

    "github.com/gin-gonic/gin"
)

// MetricsMiddleware records the request count and latency for every request.
func MetricsMiddleware() gin.HandlerFunc {
    return func(c *gin.Context) {
        start := time.Now()
        // Note: c.FullPath() avoids label-cardinality blowups on parameterised routes.
        path := c.Request.URL.Path
        method := c.Request.Method

        c.Next()

        // Record the request latency.
        duration := time.Since(start).Seconds()
        RequestDuration.WithLabelValues(method, path).Observe(duration)

        // Record the request count by status code.
        status := strconv.Itoa(c.Writer.Status())
        RequestCounter.WithLabelValues(method, path, status).Inc()
    }
}
```
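To make these metrics scrapeable, both the middleware and the Prometheus handler need to be attached to the router. A minimal wiring sketch, assuming the metrics package above lives at an import path of your choosing; the example.com path, the port, and the /ping route are placeholders:

```go
package main

import (
    "net/http"

    "github.com/gin-gonic/gin"
    "github.com/prometheus/client_golang/prometheus/promhttp"

    // Adjust to wherever the metrics package above lives in your module.
    "example.com/myservice/metrics"
)

func main() {
    r := gin.New()

    // Record request count and latency for every route.
    r.Use(metrics.MetricsMiddleware())

    // Expose the Prometheus scrape endpoint.
    r.GET("/metrics", gin.WrapH(promhttp.Handler()))

    r.GET("/ping", func(c *gin.Context) {
        c.String(http.StatusOK, "pong")
    })

    _ = r.Run(":8080")
}
```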
Distributed Tracing
Jaeger Integration
```go
package tracing

import (
    "io"

    "github.com/opentracing/opentracing-go"
    "github.com/uber/jaeger-client-go"
    jaegercfg "github.com/uber/jaeger-client-go/config"
)

// InitTracer builds a Jaeger tracer, registers it as the global tracer, and
// returns it together with a closer that must be closed on shutdown.
func InitTracer(serviceName string) (opentracing.Tracer, io.Closer, error) {
    cfg := jaegercfg.Configuration{
        ServiceName: serviceName,
        Sampler: &jaegercfg.SamplerConfig{
            Type:  jaeger.SamplerTypeConst, // sample every trace; lower this in production
            Param: 1,
        },
        Reporter: &jaegercfg.ReporterConfig{
            LogSpans:          true,
            CollectorEndpoint: "http://jaeger:14268/api/traces",
        },
    }

    tracer, closer, err := cfg.NewTracer(jaegercfg.Logger(jaeger.StdLogger))
    if err != nil {
        return nil, nil, err
    }

    opentracing.SetGlobalTracer(tracer)
    return tracer, closer, nil
}
```
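A short usage sketch: initialize the tracer once at startup, keep the returned closer alive for the lifetime of the process, and start child spans from the request context. The example.com import path, the "order-service" name, the queryOrders operation, and the user.id tag are all illustrative assumptions:

```go
package main

import (
    "context"
    "log"

    "github.com/opentracing/opentracing-go"

    // Adjust to wherever the tracing package above lives in your module.
    "example.com/myservice/tracing"
)

func main() {
    _, closer, err := tracing.InitTracer("order-service")
    if err != nil {
        log.Fatalf("init tracer: %v", err)
    }
    // Flush any buffered spans on shutdown.
    defer closer.Close()

    _ = queryOrders(context.Background(), "42")
}

// queryOrders derives a child span from the span carried in ctx, if any;
// otherwise it starts a new root span via the global tracer.
func queryOrders(ctx context.Context, userID string) error {
    span, ctx := opentracing.StartSpanFromContext(ctx, "queryOrders")
    defer span.Finish()
    span.SetTag("user.id", userID)

    // ... perform the actual DB/RPC call with ctx ...
    _ = ctx
    return nil
}
```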
Log Management
Structured Logging
```go
package logging

import (
    "go.uber.org/zap"
    "go.uber.org/zap/zapcore"
)

// Logger wraps zap's SugaredLogger for structured, leveled logging.
type Logger struct {
    *zap.SugaredLogger
}

// NewLogger builds a production JSON logger with ISO 8601 timestamps.
func NewLogger() (*Logger, error) {
    config := zap.NewProductionConfig()
    config.EncoderConfig.TimeKey = "timestamp"
    config.EncoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder

    logger, err := config.Build()
    if err != nil {
        return nil, err
    }
    return &Logger{logger.Sugar()}, nil
}
```
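A short usage sketch with the SugaredLogger's key-value style. The example.com import path and the field names are placeholders, not part of the original code:

```go
package main

import (
    "log"

    // Adjust to wherever the logging package above lives in your module.
    "example.com/myservice/logging"
)

func main() {
    logger, err := logging.NewLogger()
    if err != nil {
        log.Fatalf("init logger: %v", err)
    }
    defer logger.Sync()

    // Key-value pairs become structured JSON fields in production mode.
    logger.Infow("order created",
        "order_id", "A-1001",
        "user_id", 42,
        "amount_cents", 1999,
    )
}
```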
Health Checks
Service Health Check
```go
type HealthCheck struct {
    db     *gorm.DB
    redis  *redis.Client
    logger *Logger
}

// Check reports the health of the service and its dependencies.
func (h *HealthCheck) Check(c *gin.Context) {
    status := map[string]string{
        "status": "healthy",
        "db":     "up",
        "redis":  "up",
    }

    // Check the database connection. Exec actually runs the statement,
    // whereas Raw alone only builds it without hitting the database.
    if err := h.db.Exec("SELECT 1").Error; err != nil {
        status["db"] = "down"
        status["status"] = "unhealthy"
    }

    // Check the Redis connection.
    if err := h.redis.Ping(c).Err(); err != nil {
        status["redis"] = "down"
        status["status"] = "unhealthy"
    }

    if status["status"] == "healthy" {
        c.JSON(http.StatusOK, status)
    } else {
        c.JSON(http.StatusServiceUnavailable, status)
    }
}
```
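A short wiring sketch, assuming it sits in the same package as HealthCheck and that the *gorm.DB, *redis.Client, and *Logger values are constructed elsewhere during startup; the helper name and the /healthz path are arbitrary choices:

```go
// RegisterHealthRoute wires the health handler into the Gin router.
func RegisterHealthRoute(r *gin.Engine, db *gorm.DB, rdb *redis.Client, logger *Logger) {
    hc := &HealthCheck{db: db, redis: rdb, logger: logger}
    r.GET("/healthz", hc.Check)
}
```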
Alerting Configuration
Prometheus Alerting Rules
```yaml
groups:
  - name: service_alerts
    rules:
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
            /
          sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: High error rate detected
          description: Error rate is above 5% for the last 5 minutes
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Service is down
          description: Service has been down for more than 1 minute
```
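For these rules to fire, Prometheus also has to load the rule file and scrape the service's /metrics endpoint. A minimal prometheus.yml sketch; the file name, job name, and target address are assumptions to adapt to your deployment:

```yaml
global:
  scrape_interval: 15s

rule_files:
  - alert_rules.yml            # the rules file shown above

scrape_configs:
  - job_name: my-go-service    # illustrative job name
    static_configs:
      - targets: ["my-go-service:8080"]   # host:port serving /metrics
```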
Monitoring Dashboards
Grafana Dashboard
```json
{
  "dashboard": {
    "title": "Service Monitoring",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total[5m])) by (endpoint)",
            "legendFormat": "{{endpoint}}"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) by (endpoint) / sum(rate(http_requests_total[5m])) by (endpoint)",
            "legendFormat": "{{endpoint}}"
          }
        ]
      }
    ]
  }
}
```
Best Practices
Comprehensive monitoring coverage
- Infrastructure monitoring
- Application performance monitoring
- Business metrics monitoring
Sensible alert thresholds
- Avoid false positives
- Tiered alerting by severity
- Alert grouping and deduplication
Logging best practices
- A consistent log format across services
- Appropriate log levels
- Masking of sensitive fields
Distributed tracing
- Tune the sampling rate for production traffic
- Instrument key code paths
- Analyze anomalous traces
Performance optimization
- Keep metrics collection overhead low
- Tune storage and retention policies
- Optimize query performance
Summary
A well-built monitoring stack is key to keeping microservices running reliably. With Prometheus and Jaeger configured properly, combined with alerting and log management, you can assemble a comprehensive monitoring system.
References
- Prometheus official documentation
- Jaeger distributed tracing
- Google SRE book
- Grafana visualization platform