Building Network Observability
1. The Problem: Troubleshooting as a Black Box
1.1 A Real Case: Mysterious Intermittent Timeouts
Scenario: a microservice architecture in which the order service times out intermittently.
Pain points:
┌────────────────────────────────────────────────────────────────┐
│ Symptoms:                                                      │
│ - ~1% of requests time out every day (latency > 5s)            │
│ - No discernible pattern; hard to reproduce                    │
│ - Logs scattered across services, impossible to correlate      │
├────────────────────────────────────────────────────────────────┤
│ Traditional troubleshooting (failed):                          │
│ - Application logs: nothing abnormal                           │
│ - Server monitoring: CPU/memory normal                         │
│ - Database: no slow queries                                    │
│ - Root cause could not be located                              │
├────────────────────────────────────────────────────────────────┤
│ After the observability overhaul:                              │
│ - Distributed tracing: one service in the call chain           │
│   showed sporadic latency                                      │
│ - Network monitoring: jitter during specific time windows      │
│ - Root cause: an upstream CDN node refreshing its cache        │
│   on the hour, causing latency spikes                          │
└────────────────────────────────────────────────────────────────┘
2. The Three Pillars of Observability
The three pillars at a glance:
┌──────────────────────────────────────────────────────────────┐
│                                                              │
│   ┌─────────────┐   ┌─────────────┐   ┌─────────────┐        │
│   │    Logs     │   │   Metrics   │   │   Traces    │        │
│   │  (Logging)  │   │  (Metrics)  │   │  (Tracing)  │        │
│   └──────┬──────┘   └──────┬──────┘   └──────┬──────┘        │
│          │                 │                 │               │
│          ▼                 ▼                 ▼               │
│   ┌──────────────────────────────────────────────────────┐   │
│   │ Logs: discrete event records                         │   │
│   │ - Error logs, access logs                            │   │
│   │ - Structured logging (JSON)                          │   │
│   │ - Log aggregation (ELK/Loki)                         │   │
│   ├──────────────────────────────────────────────────────┤   │
│   │ Metrics: aggregatable numeric values                 │   │
│   │ - Counter/Gauge/Histogram                            │   │
│   │ - Time-series databases (Prometheus)                 │   │
│   │ - Alerting rules                                     │   │
│   ├──────────────────────────────────────────────────────┤   │
│   │ Traces: request call chains                          │   │
│   │ - Distributed tracing (OpenTelemetry)                │   │
│   │ - Call-chain analysis                                │   │
│   │ - Locating performance bottlenecks                   │   │
│   └──────────────────────────────────────────────────────┘   │
│                                                              │
└──────────────────────────────────────────────────────────────┘
3. Metrics Monitoring
3.1 The RED Method
The RED method (for request-driven services):
┌──────────────────────────────────────────────────────────────┐
│                                                              │
│  R - Rate (request rate)                                     │
│      Requests handled per second                             │
│      e.g. http_requests_total                                │
│                                                              │
│  E - Errors (error rate)                                     │
│      Failed requests per second                              │
│      e.g. http_requests_errors_total                         │
│                                                              │
│  D - Duration (latency)                                      │
│      Distribution of request processing time                 │
│      e.g. http_request_duration_seconds                      │
│                                                              │
└──────────────────────────────────────────────────────────────┘
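To make RED concrete, here is a minimal sketch of a servlet filter that records all three signals with Micrometer. The filter class and its wiring are illustrative assumptions (a Jakarta Servlet environment is presumed), not part of any library; the metric names loosely follow the examples above.

import java.io.IOException;
import java.util.concurrent.TimeUnit;

import io.micrometer.core.instrument.MeterRegistry;
import jakarta.servlet.Filter;
import jakarta.servlet.FilterChain;
import jakarta.servlet.ServletException;
import jakarta.servlet.ServletRequest;
import jakarta.servlet.ServletResponse;
import jakarta.servlet.http.HttpServletResponse;

/**
 * Hypothetical filter recording all three RED signals per request.
 */
public class RedMetricsFilter implements Filter {
    private final MeterRegistry registry;

    public RedMetricsFilter(MeterRegistry registry) {
        this.registry = registry;
    }

    @Override
    public void doFilter(ServletRequest req, ServletResponse res, FilterChain chain)
            throws IOException, ServletException {
        long start = System.nanoTime();
        boolean error = false;
        try {
            chain.doFilter(req, res);
            // Treat HTTP 5xx responses as errors for the E in RED
            error = ((HttpServletResponse) res).getStatus() >= 500;
        } catch (Exception e) {
            error = true;
            throw e;
        } finally {
            long elapsed = System.nanoTime() - start;
            registry.counter("http.requests.total").increment();            // R: rate
            if (error) {
                registry.counter("http.requests.errors.total").increment(); // E: errors
            }
            registry.timer("http.request.duration")                         // D: duration
                    .record(elapsed, TimeUnit.NANOSECONDS);
        }
    }
}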
3.2 Prometheus + Grafana
# Prometheus configuration
scrape_configs:
  - job_name: 'api-gateway'
    static_configs:
      - targets: ['gateway:8080']
    metrics_path: '/actuator/prometheus'
    scrape_interval: 15s

  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']
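Once Prometheus is scraping these targets, the same series can also be queried programmatically through its HTTP API (GET /api/v1/query). A minimal sketch using the JDK's built-in HttpClient; the host prometheus:9090 is a placeholder for wherever the server actually runs.

import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;

public class PrometheusQuery {
    public static void main(String[] args) throws Exception {
        // Per-second request rate over the last 5 minutes (the R in RED)
        String promql = "rate(http_requests_total[5m])";
        String url = "http://prometheus:9090/api/v1/query?query="
                + URLEncoder.encode(promql, StandardCharsets.UTF_8);

        HttpResponse<String> response = HttpClient.newHttpClient().send(
                HttpRequest.newBuilder(URI.create(url)).GET().build(),
                HttpResponse.BodyHandlers.ofString());

        // The body is JSON: {"status":"success","data":{"result":[...]}}
        System.out.println(response.body());
    }
}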
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.Gauge;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Timer;
import org.springframework.stereotype.Component;

/**
 * Metric instrumentation with Micrometer
 */
@Component
public class MetricsService {
    private final Counter requestCounter;
    private final Timer requestTimer;
    // Gauges observe state; this AtomicInteger holds the live value
    private final AtomicInteger activeConnections = new AtomicInteger();

    public MetricsService(MeterRegistry registry) {
        this.requestCounter = Counter.builder("http.requests.total")
                .description("Total HTTP requests")
                .register(registry);
        this.requestTimer = Timer.builder("http.request.duration")
                .description("HTTP request duration")
                .publishPercentiles(0.5, 0.95, 0.99)
                .register(registry);
        // Gauge.builder takes the state object and accessor function up front
        Gauge.builder("http.connections.active", activeConnections, AtomicInteger::doubleValue)
                .description("Active HTTP connections")
                .register(registry);
    }

    public void recordRequest(String path, long durationMs, boolean success) {
        requestCounter.increment();
        requestTimer.record(durationMs, TimeUnit.MILLISECONDS);
    }
}
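The gauge above only ever reads activeConnections; something still has to maintain that value. A sketch of the mutators, which would live inside MetricsService; where they are called from (a connection pool or server lifecycle listener) is deployment-specific and assumed here.

// Inside MetricsService: mutators for the gauge's backing value.
// The callers (e.g. connection lifecycle hooks) are hypothetical.
public void connectionOpened() {
    activeConnections.incrementAndGet();
}

public void connectionClosed() {
    activeConnections.decrementAndGet();
}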
4. Distributed Tracing
4.1 OpenTelemetry
import io.opentelemetry.api.OpenTelemetry;
import io.opentelemetry.api.trace.Span;
import io.opentelemetry.api.trace.SpanKind;
import io.opentelemetry.api.trace.StatusCode;
import io.opentelemetry.api.trace.Tracer;
import io.opentelemetry.context.Context;
import io.opentelemetry.context.Scope;
import org.springframework.stereotype.Component;

/**
 * Tracing with OpenTelemetry
 */
@Component
public class TracingService {
    private final Tracer tracer;
    private final OrderRepository orderRepository;
    private final InventoryService inventoryService;

    public TracingService(OpenTelemetry openTelemetry,
                          OrderRepository orderRepository,
                          InventoryService inventoryService) {
        this.tracer = openTelemetry.getTracer("order-service");
        this.orderRepository = orderRepository;
        this.inventoryService = inventoryService;
    }

    public Order createOrder(CreateOrderRequest request) {
        Span span = tracer.spanBuilder("createOrder")
                .setSpanKind(SpanKind.SERVER)
                .startSpan();
        try (Scope scope = span.makeCurrent()) {
            span.setAttribute("order.userId", request.getUserId());
            span.setAttribute("order.amount", request.getAmount());
            // Validate inventory in a child span
            validateInventory(request, span);
            // Create the order
            Order order = orderRepository.save(request);
            span.setAttribute("order.id", order.getId());
            span.setStatus(StatusCode.OK);
            return order;
        } catch (Exception e) {
            span.recordException(e);
            span.setStatus(StatusCode.ERROR, e.getMessage());
            throw e;
        } finally {
            span.end();
        }
    }

    private void validateInventory(CreateOrderRequest request, Span parentSpan) {
        // The parent is made explicit here; inside makeCurrent() the
        // current context would already carry it implicitly.
        Span span = tracer.spanBuilder("validateInventory")
                .setParent(Context.current().with(parentSpan))
                .startSpan();
        try {
            // Inventory check logic
            inventoryService.checkStock(request.getItems());
        } finally {
            span.end();
        }
    }
}
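For spans from different services to join a single trace, each service must extract the caller's context from incoming request headers (the W3C traceparent header). A minimal sketch using OpenTelemetry's propagation API; TraceContextExtractor is a made-up helper name, and the Servlet types are an assumption about the transport.

import java.util.Collections;

import io.opentelemetry.api.OpenTelemetry;
import io.opentelemetry.context.Context;
import io.opentelemetry.context.propagation.TextMapGetter;
import jakarta.servlet.http.HttpServletRequest;

public class TraceContextExtractor {
    // Adapter that lets the propagator read HTTP headers as a carrier
    private static final TextMapGetter<HttpServletRequest> GETTER =
            new TextMapGetter<>() {
                @Override
                public Iterable<String> keys(HttpServletRequest carrier) {
                    return Collections.list(carrier.getHeaderNames());
                }
                @Override
                public String get(HttpServletRequest carrier, String key) {
                    return carrier.getHeader(key);
                }
            };

    /** Restores the caller's span context so new spans join its trace. */
    public static Context extract(OpenTelemetry otel, HttpServletRequest request) {
        return otel.getPropagators().getTextMapPropagator()
                .extract(Context.current(), request, GETTER);
    }
}

A span built with tracer.spanBuilder(...).setParent(extractedContext) then becomes a child of the remote caller's span.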
4.2 Deploying Jaeger
# docker-compose.yml
version: '3'
services:
  jaeger:
    image: jaegertracing/all-in-one:1.45
    ports:
      - "16686:16686"   # UI
      - "14268:14268"   # Collector (Jaeger Thrift)
      - "4317:4317"     # OTLP gRPC
    environment:
      - COLLECTOR_OTLP_ENABLED=true
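To actually ship the spans from section 4.1 to this Jaeger instance, the OpenTelemetry SDK needs an OTLP exporter. A sketch of manual SDK wiring; OtelConfig is a hypothetical class name, and the endpoint targets the 4317 OTLP gRPC port mapped above.

import io.opentelemetry.api.OpenTelemetry;
import io.opentelemetry.api.common.AttributeKey;
import io.opentelemetry.api.common.Attributes;
import io.opentelemetry.api.trace.propagation.W3CTraceContextPropagator;
import io.opentelemetry.context.propagation.ContextPropagators;
import io.opentelemetry.exporter.otlp.trace.OtlpGrpcSpanExporter;
import io.opentelemetry.sdk.OpenTelemetrySdk;
import io.opentelemetry.sdk.resources.Resource;
import io.opentelemetry.sdk.trace.SdkTracerProvider;
import io.opentelemetry.sdk.trace.export.BatchSpanProcessor;

public class OtelConfig {
    public static OpenTelemetry init() {
        // Identify this service in the Jaeger UI
        Resource resource = Resource.getDefault().merge(Resource.create(
                Attributes.of(AttributeKey.stringKey("service.name"), "order-service")));

        SdkTracerProvider tracerProvider = SdkTracerProvider.builder()
                .setResource(resource)
                // Batch spans and export them over OTLP/gRPC to Jaeger
                .addSpanProcessor(BatchSpanProcessor.builder(
                        OtlpGrpcSpanExporter.builder()
                                .setEndpoint("http://jaeger:4317")
                                .build())
                        .build())
                .build();

        return OpenTelemetrySdk.builder()
                .setTracerProvider(tracerProvider)
                // W3C traceparent propagation for cross-service context
                .setPropagators(ContextPropagators.create(
                        W3CTraceContextPropagator.getInstance()))
                .build();
    }
}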
5. Log Aggregation
5.1 ELK Stack
# Filebeat configuration
filebeat.inputs:
  - type: log
    enabled: true
    paths:
      - /var/log/app/*.log
    fields:
      service: order-service
    multiline.pattern: '^\['
    multiline.negate: true
    multiline.match: after

output.elasticsearch:
  hosts: ["elasticsearch:9200"]
  index: "app-logs-%{+yyyy.MM.dd}"
5.2 Structured Logging
import java.time.Instant;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import jakarta.servlet.http.HttpServletRequest;
import lombok.Builder;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.slf4j.MDC;
import org.springframework.http.HttpStatus;
import org.springframework.stereotype.Component;

/**
 * Structured logging: one JSON object per event, so the
 * aggregation pipeline can index and query individual fields.
 */
@Slf4j
@Component
public class StructuredLogging {
    // JavaTimeModule is required so Jackson can serialize Instant
    private static final ObjectMapper mapper =
            new ObjectMapper().registerModule(new JavaTimeModule());

    public void logRequest(HttpServletRequest request, long duration) {
        try {
            LogEntry entry = LogEntry.builder()
                    .timestamp(Instant.now())
                    .level("INFO")
                    .service("order-service")
                    .traceId(MDC.get("traceId"))
                    .spanId(MDC.get("spanId"))
                    .method(request.getMethod())
                    .path(request.getRequestURI())
                    .durationMs(duration)
                    .statusCode(HttpStatus.OK.value())
                    .build();
            log.info(mapper.writeValueAsString(entry));
        } catch (JsonProcessingException e) {
            log.error("Failed to serialize log entry", e);
        }
    }

    @Data
    @Builder
    public static class LogEntry {
        private Instant timestamp;
        private String level;
        private String service;
        private String traceId;
        private String spanId;
        private String method;
        private String path;
        private long durationMs;
        private int statusCode;
    }
}
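The traceId and spanId fields above assume someone has populated the MDC. One way to do that, sketched below, is a filter that copies the identifiers from the current OpenTelemetry span into the MDC and clears them afterwards; MdcTraceFilter is illustrative, not from a library.

import java.io.IOException;

import io.opentelemetry.api.trace.Span;
import io.opentelemetry.api.trace.SpanContext;
import jakarta.servlet.Filter;
import jakarta.servlet.FilterChain;
import jakarta.servlet.ServletException;
import jakarta.servlet.ServletRequest;
import jakarta.servlet.ServletResponse;
import org.slf4j.MDC;

public class MdcTraceFilter implements Filter {
    @Override
    public void doFilter(ServletRequest req, ServletResponse res, FilterChain chain)
            throws IOException, ServletException {
        SpanContext ctx = Span.current().getSpanContext();
        if (ctx.isValid()) {
            MDC.put("traceId", ctx.getTraceId());
            MDC.put("spanId", ctx.getSpanId());
        }
        try {
            chain.doFilter(req, res);
        } finally {
            // Avoid leaking IDs into unrelated requests on a pooled thread
            MDC.remove("traceId");
            MDC.remove("spanId");
        }
    }
}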
6. Network-Layer Observability
6.1 eBPF Network Monitoring
# eBPF network monitoring (BCC tool)
from time import sleep

from bcc import BPF

# Measure the in-kernel duration of tcp_v4_connect() calls.
# Note: for non-blocking sockets this is the cost of initiating the
# connection, not the full TCP handshake round-trip.
bpf_code = """
#include <uapi/linux/ptrace.h>
#include <net/sock.h>
#include <bcc/proto.h>

BPF_HASH(start, u32);
BPF_HISTOGRAM(dist);

int trace_tcp_connect(struct pt_regs *ctx, struct sock *sk) {
    u32 pid = bpf_get_current_pid_tgid();
    u64 ts = bpf_ktime_get_ns();
    start.update(&pid, &ts);
    return 0;
}

int trace_tcp_connect_ret(struct pt_regs *ctx) {
    u32 pid = bpf_get_current_pid_tgid();
    u64 *tsp = start.lookup(&pid);
    if (tsp != 0) {
        u64 delta = bpf_ktime_get_ns() - *tsp;
        dist.increment(bpf_log2l(delta / 1000000));  // ns -> ms
        start.delete(&pid);
    }
    return 0;
}
"""

b = BPF(text=bpf_code)
b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_tcp_connect")
b.attach_kretprobe(event="tcp_v4_connect", fn_name="trace_tcp_connect_ret")

print("Tracing TCP connect latency... Hit Ctrl-C to end.")
try:
    sleep(99999999)  # keep tracing until interrupted
except KeyboardInterrupt:
    pass
b["dist"].print_log2_hist("ms")
6.2 Network Topology Visualization
# Cilium Hubble: enabled via Helm values on the cilium/cilium chart
# (hubble-relay and the Hubble UI are deployed by Cilium itself,
# not through a standalone CRD)
hubble:
  enabled: true
  relay:
    enabled: true
  ui:
    enabled: true
7. Alerting and Response
# Prometheus alerting rules
groups:
  - name: network-alerts
    rules:
      - alert: HighLatency
        expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High latency detected"
          description: "99th percentile latency is {{ $value }}s"

      - alert: HighErrorRate
        expr: rate(http_requests_errors_total[5m]) / rate(http_requests_total[5m]) > 0.01
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate"
          description: "Error rate is {{ $value | humanizePercentage }}"