Prometheus 监控:让你的应用可观测
在生产环境中,了解应用的运行状态至关重要。Prometheus 是一个开源的监控系统,配合 Grafana 可以实现强大的指标收集和可视化。
本文将介绍如何在 Go 应用中集成 Prometheus 监控。
什么是 Prometheus?
Prometheus 是一个时间序列数据库和监控系统,具有以下特点:
- 多维数据模型:通过指标名称和键值对组织数据
- PromQL:强大的查询语言
- 拉取模式:主动从目标抓取指标
- 服务发现:自动发现监控目标
- 告警:基于规则的告警机制
安装依赖
go get github.com/prometheus/client_golang/prometheus
go get github.com/prometheus/client_golang/prometheus/promhttp
基础指标暴露
package main
import (
"log"
"net/http"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
// main serves the Prometheus metrics endpoint and a plain-text greeting on
// port 8080 using the default ServeMux.
func main() {
	// Expose the Prometheus metrics endpoint.
	http.Handle("/metrics", promhttp.Handler())
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		if _, err := w.Write([]byte("Hello, World!")); err != nil {
			log.Printf("writing response: %v", err)
		}
	})

	log.Println("Server starting on :8080")
	// ListenAndServe only returns when the server fails (for example when the
	// port is already in use), so report that error instead of exiting silently.
	log.Fatal(http.ListenAndServe(":8080", nil))
}
访问 http://localhost:8080/metrics 可以看到默认的 Go 运行时指标。
自定义指标
Counter(计数器)
只能递增的计数器:
package main
import (
	"net/http"
	"strconv"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)
var (
// requestTotal is a counter vector named "http_requests_total" that counts
// HTTP requests, partitioned by the "method", "endpoint" and "status" labels.
requestTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "http_requests_total",
Help: "Total number of HTTP requests",
},
[]string{"method", "endpoint", "status"},
)
)
// init adds requestTotal to the default Prometheus registry; registration
// panics if it fails.
func init() {
	prometheus.DefaultRegisterer.MustRegister(requestTotal)
}
// requestMiddleware wraps next and, after each request has been served,
// increments requestTotal labelled with the request method, the URL path and
// the numeric HTTP status code of the response.
func requestMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Wrap the ResponseWriter to capture the status code. It starts at 200
		// because that is what is sent when the handler never calls WriteHeader.
		wrapped := &responseWriter{ResponseWriter: w, statusCode: http.StatusOK}
		next.ServeHTTP(wrapped, r)

		// Record the numeric code ("200", "500") rather than the reason phrase
		// ("OK", "Internal Server Error") so that label matchers such as
		// status=~"5.." can select error responses.
		requestTotal.WithLabelValues(
			r.Method,
			r.URL.Path,
			strconv.Itoa(wrapped.statusCode),
		).Inc()
	})
}
// responseWriter wraps an http.ResponseWriter and remembers the status code
// of the response so that middleware can read it after the handler returns.
type responseWriter struct {
	http.ResponseWriter
	statusCode  int  // status to report; callers initialise it to 200
	wroteHeader bool // true once a final (non-1xx) status has been recorded
}

// WriteHeader records code and forwards the call to the wrapped writer.
//
// Only the first final status is recorded: net/http ignores later WriteHeader
// calls, so recording them would report a status that was never sent.
// Informational 1xx codes (other than 101) may precede the final status and
// are forwarded without being recorded.
func (rw *responseWriter) WriteHeader(code int) {
	informational := code >= 100 && code < 200 && code != http.StatusSwitchingProtocols
	if !informational && !rw.wroteHeader {
		rw.statusCode = code
		rw.wroteHeader = true
	}
	rw.ResponseWriter.WriteHeader(code)
}

// Write sends b to the wrapped writer. A Write without a prior WriteHeader
// implies status 200, so that status is recorded first; a later WriteHeader
// call can then no longer change the recorded code.
func (rw *responseWriter) Write(b []byte) (int, error) {
	if !rw.wroteHeader {
		rw.WriteHeader(http.StatusOK)
	}
	return rw.ResponseWriter.Write(b)
}
// main builds a mux with a single greeting route, wraps it in
// requestMiddleware, and serves it together with the /metrics endpoint on
// port 8080 through the default ServeMux.
func main() {
	greet := func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte("Hello"))
	}

	routes := http.NewServeMux()
	routes.HandleFunc("/", greet)
	counted := requestMiddleware(routes)

	// /metrics is registered directly, so scrapes are not counted.
	http.Handle("/metrics", promhttp.Handler())
	http.Handle("/", counted)
	http.ListenAndServe(":8080", nil)
}
Gauge(仪表盘)
可增可减的指标:
var (
// onlineUsers is a gauge named "online_users" holding the current number of
// online users.
onlineUsers = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "online_users",
Help: "Current number of online users",
},
)
// memoryUsage is a gauge vector named "memory_usage_bytes" holding memory
// usage in bytes, partitioned by the "type" label.
memoryUsage = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "memory_usage_bytes",
Help: "Memory usage in bytes",
},
[]string{"type"},
)
)
// init adds both gauges to the default Prometheus registry; registration
// panics if it fails.
func init() {
	prometheus.DefaultRegisterer.MustRegister(onlineUsers, memoryUsage)
}
// onUserLogin raises the online-user gauge by one when a user logs in.
func onUserLogin() {
	onlineUsers.Add(1)
}
// onUserLogout lowers the online-user gauge by one when a user logs out.
func onUserLogout() {
	onlineUsers.Sub(1)
}
// updateMemoryUsage reads the current runtime memory statistics and publishes
// Alloc, Sys and HeapAlloc through the memoryUsage gauge under the "alloc",
// "sys" and "heap" label values respectively.
func updateMemoryUsage() {
	var stats runtime.MemStats
	runtime.ReadMemStats(&stats)

	samples := []struct {
		kind  string
		bytes uint64
	}{
		{"alloc", stats.Alloc},
		{"sys", stats.Sys},
		{"heap", stats.HeapAlloc},
	}
	for _, s := range samples {
		memoryUsage.WithLabelValues(s.kind).Set(float64(s.bytes))
	}
}
Histogram(直方图)
统计分布:
var (
// requestDuration is a histogram vector named
// "http_request_duration_seconds" recording request latency in seconds,
// partitioned by the "method" and "endpoint" labels. It uses the client
// library's default bucket boundaries (prometheus.DefBuckets).
requestDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "http_request_duration_seconds",
Help: "HTTP request duration in seconds",
Buckets: prometheus.DefBuckets,
},
[]string{"method", "endpoint"},
)
)
// init adds requestDuration to the default Prometheus registry; registration
// panics if it fails.
func init() {
	prometheus.DefaultRegisterer.MustRegister(requestDuration)
}
// durationMiddleware wraps next and observes, in seconds, how long each
// request took in requestDuration, labelled by request method and URL path.
func durationMiddleware(next http.Handler) http.Handler {
	timed := func(w http.ResponseWriter, r *http.Request) {
		began := time.Now()
		next.ServeHTTP(w, r)
		elapsed := time.Since(began)

		observer := requestDuration.WithLabelValues(r.Method, r.URL.Path)
		observer.Observe(elapsed.Seconds())
	}
	return http.HandlerFunc(timed)
}
Summary(摘要)
类似直方图,但分位数由客户端(应用进程内)直接计算,而不是在查询时由 Prometheus 根据桶估算:
var (
// responseSize is a summary vector named "http_response_size_bytes"
// recording response sizes in bytes, partitioned by the "endpoint" label.
responseSize = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "http_response_size_bytes",
Help: "HTTP response size in bytes",
// Objectives maps each tracked quantile (0.5, 0.9, 0.99) to its
// tolerated error.
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
},
[]string{"endpoint"},
)
)
完整的监控示例
package main
import (
"log"
"net/http"
"runtime"
"strconv"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
var (
// httpRequestsTotal is a counter vector named "http_requests_total",
// partitioned by the "method", "endpoint" and "status" labels.
httpRequestsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "http_requests_total",
Help: "Total number of HTTP requests",
},
[]string{"method", "endpoint", "status"},
)
// activeConnections is a gauge named "active_connections" holding the
// number of active connections.
activeConnections = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "active_connections",
Help: "Number of active connections",
},
)
// httpRequestDuration is a histogram vector named
// "http_request_duration_seconds", partitioned by the "method" and
// "endpoint" labels, with explicit buckets from 1 ms to 5 s.
httpRequestDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "http_request_duration_seconds",
Help: "HTTP request duration in seconds",
Buckets: []float64{0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5},
},
[]string{"method", "endpoint"},
)
// httpResponseSize is a summary vector named "http_response_size_bytes",
// partitioned by the "endpoint" label.
httpResponseSize = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "http_response_size_bytes",
Help: "HTTP response size in bytes",
// Objectives maps each tracked quantile (0.5, 0.9, 0.99) to its
// tolerated error.
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
MaxAge: 5 * time.Minute,
},
[]string{"endpoint"},
)
)
// init registers the four custom metrics with the default Prometheus registry
// and starts a background goroutine that logs the goroutine count and the
// allocated memory (in MB) every ten seconds.
func init() {
	// Register the custom metrics.
	prometheus.MustRegister(httpRequestsTotal, activeConnections, httpRequestDuration, httpResponseSize)

	// Periodically sample runtime statistics. The goroutine has no stop
	// signal and runs for the life of the process.
	go func() {
		const bytesPerMB = 1024 * 1024

		tick := time.NewTicker(10 * time.Second)
		defer tick.Stop()

		for {
			<-tick.C

			var stats runtime.MemStats
			runtime.ReadMemStats(&stats)
			// These values could also be exported through a custom Gauge.
			log.Printf("Goroutines: %d, Alloc: %d MB", runtime.NumGoroutine(), stats.Alloc/bytesPerMB)
		}
	}()
}
// responseWriter wraps an http.ResponseWriter and remembers the status code
// and the number of body bytes written, so that middleware can read both
// after the handler returns.
type responseWriter struct {
	http.ResponseWriter
	statusCode  int  // status to report; callers initialise it to 200
	size        int  // total body bytes accepted by the wrapped writer
	wroteHeader bool // true once a final (non-1xx) status has been recorded
}

// WriteHeader records code and forwards the call to the wrapped writer.
//
// Only the first final status is recorded: net/http ignores later WriteHeader
// calls, so recording them would report a status that was never sent.
// Informational 1xx codes (other than 101) may precede the final status and
// are forwarded without being recorded.
func (rw *responseWriter) WriteHeader(code int) {
	informational := code >= 100 && code < 200 && code != http.StatusSwitchingProtocols
	if !informational && !rw.wroteHeader {
		rw.statusCode = code
		rw.wroteHeader = true
	}
	rw.ResponseWriter.WriteHeader(code)
}

// Write sends b to the wrapped writer and adds the number of bytes actually
// written to size. A Write without a prior WriteHeader implies status 200, so
// that status is recorded first; a later WriteHeader call can then no longer
// change the recorded code.
func (rw *responseWriter) Write(b []byte) (int, error) {
	if !rw.wroteHeader {
		rw.WriteHeader(http.StatusOK)
	}
	n, err := rw.ResponseWriter.Write(b)
	rw.size += n
	return n, err
}
// monitoringMiddleware wraps next with request instrumentation. For every
// request it keeps activeConnections raised while the handler runs and then
// records the request count, the latency in seconds and the response size in
// bytes, labelled by method, URL path and numeric status code as appropriate.
func monitoringMiddleware(next http.Handler) http.Handler {
	instrumented := func(w http.ResponseWriter, r *http.Request) {
		began := time.Now()

		// Count this request as active until the function returns.
		activeConnections.Inc()
		defer activeConnections.Dec()

		// Capture status and size; the status starts at 200.
		recorder := &responseWriter{ResponseWriter: w, statusCode: 200}

		next.ServeHTTP(recorder, r)

		elapsed := time.Since(began).Seconds()
		method, path := r.Method, r.URL.Path
		status := strconv.Itoa(recorder.statusCode)

		httpRequestsTotal.WithLabelValues(method, path, status).Inc()
		httpRequestDuration.WithLabelValues(method, path).Observe(elapsed)
		httpResponseSize.WithLabelValues(path).Observe(float64(recorder.size))
	}
	return http.HandlerFunc(instrumented)
}
// main registers three demo routes on a private mux, wraps that mux in
// monitoringMiddleware, and serves it together with the /metrics endpoint on
// port 8080 through the default ServeMux.
func main() {
	mux := http.NewServeMux()

	// Application routes.
	mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		if _, err := w.Write([]byte("Hello, World!")); err != nil {
			log.Printf("writing response: %v", err)
		}
	})
	mux.HandleFunc("/api/users", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		if _, err := w.Write([]byte(`{"users": []}`)); err != nil {
			log.Printf("writing response: %v", err)
		}
	})
	mux.HandleFunc("/slow", func(w http.ResponseWriter, r *http.Request) {
		// Artificial delay so the latency histogram has a slow sample.
		time.Sleep(2 * time.Second)
		if _, err := w.Write([]byte("Slow response")); err != nil {
			log.Printf("writing response: %v", err)
		}
	})

	// Apply the middleware to the application routes only.
	handler := monitoringMiddleware(mux)

	// Prometheus metrics endpoint; registered directly, so scrapes are not
	// instrumented.
	http.Handle("/metrics", promhttp.Handler())
	http.Handle("/", handler)

	log.Println("Server starting on :8080")
	log.Println("Metrics available at :8080/metrics")
	// ListenAndServe only returns when the server fails (for example when the
	// port is already in use), so report that error instead of exiting silently.
	log.Fatal(http.ListenAndServe(":8080", nil))
}
Prometheus 配置
创建 prometheus.yml:
# Defaults for every scrape job and for rule evaluation.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # Scrape the Go application's /metrics endpoint every 5 seconds,
  # overriding the global scrape interval for this job.
  - job_name: 'go-app'
    static_configs:
      - targets: ['localhost:8080']
    metrics_path: '/metrics'
    scrape_interval: 5s
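启动 Prometheus(注意:容器内的 localhost 指向容器自身;如果 Go 应用运行在宿主机上,需要把 targets 改为容器可以访问到的宿主机地址,例如 Docker Desktop 下的 host.docker.internal:8080,或者在 Linux 上使用 --network host 启动容器):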
docker run -d \
--name prometheus \
-p 9090:9090 \
-v $(pwd)/prometheus.yml:/etc/prometheus/prometheus.yml \
prom/prometheus
Grafana 可视化
启动 Grafana
docker run -d \
--name grafana \
-p 3000:3000 \
grafana/grafana
常用查询
# Request rate (per second)
rate(http_requests_total[5m])
# Error ratio. Both sides are aggregated with sum(); without it each 5xx
# series is divided by the series with identical labels (itself), which
# always yields 1.
sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))
# Average latency
rate(http_request_duration_seconds_sum[5m]) / rate(http_request_duration_seconds_count[5m])
# P99 latency
histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))
# Active connections
active_connections
# Memory usage (MB)
go_memstats_alloc_bytes / 1024 / 1024
告警规则
创建 alert.rules,并在 prometheus.yml 中通过 rule_files 引用该文件(例如 rule_files: ['alert.rules']),否则这些规则不会被加载:
groups:
  - name: app-alerts
    rules:
      # Fires when more than 5% of all requests return a 5xx status.
      # Both sides are aggregated with sum(); without it each 5xx series is
      # divided by itself and the ratio is always 1.
      - alert: HighErrorRate
        expr: sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is above 5% for more than 5 minutes"
      # Fires when the 99th percentile latency stays above one second.
      - alert: HighLatency
        expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High latency detected"
          description: "P99 latency is above 1 second"
总结
Prometheus 监控让你的应用具备可观测性:
- Counter:统计请求总数、错误总数
- Gauge:记录当前状态(连接数、内存使用)
- Histogram:统计分布(延迟、响应大小)
- Summary:计算分位数
最佳实践:
- 使用 RED 方法:Rate(速率)、Errors(错误)、Duration(延迟)
- 标签不要太多,避免高基数问题
- 设置合理的告警阈值
- 定期审查和优化指标
记住:没有监控的应用就像在黑暗中航行,Prometheus 是你的灯塔。
继续阅读
探索更多技术文章
浏览归档,发现更多关于系统设计、工具链和工程实践的内容。