952 lines
26 KiB
Markdown
952 lines
26 KiB
Markdown
|
|
# 🦟 蚊子项目 - 生产环境监控方案
|
|||
|
|
|
|||
|
|
## 📊 监控架构概览
|
|||
|
|
|
|||
|
|
本文档提供蚊子项目的完整监控方案,包括指标采集、日志聚合、告警配置等。
|
|||
|
|
|
|||
|
|
### 监控架构
|
|||
|
|
|
|||
|
|
```
|
|||
|
|
┌─────────────────────────────────────────────────────────┐
|
|||
|
|
│ 应用层 (Mosquito) │
|
|||
|
|
│ Spring Boot Actuator → Prometheus → Alertmanager │
|
|||
|
|
└───────────────────┬───────────────────────────────────┘
|
|||
|
|
│
|
|||
|
|
┌───────────┼───────────┐
|
|||
|
|
│ │ │
|
|||
|
|
┌───────▼─────────▼────────────▼────────┐
|
|||
|
|
│ 日志聚合层 │
|
|||
|
|
│ Application → Loki → Grafana │
|
|||
|
|
└──────────────────┬──────────────────────┘
|
|||
|
|
│
|
|||
|
|
┌──────────┼──────────┐
|
|||
|
|
│ │ │
|
|||
|
|
┌───────▼─────────▼─────────▼────────┐
|
|||
|
|
│ 可视化告警层 │
|
|||
|
|
│ Grafana + Alertmanager │
|
|||
|
|
└───────────────────────────────────────┘
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
## 🔍 一、应用监控
|
|||
|
|
|
|||
|
|
### 1. Spring Boot Actuator配置
|
|||
|
|
|
|||
|
|
#### 1.1 添加依赖
|
|||
|
|
|
|||
|
|
```xml
|
|||
|
|
<!-- pom.xml -->
|
|||
|
|
<dependency>
|
|||
|
|
<groupId>org.springframework.boot</groupId>
|
|||
|
|
<artifactId>spring-boot-starter-actuator</artifactId>
|
|||
|
|
</dependency>
|
|||
|
|
<dependency>
|
|||
|
|
<groupId>io.micrometer</groupId>
|
|||
|
|
<artifactId>micrometer-registry-prometheus</artifactId>
|
|||
|
|
</dependency>
|
|||
|
|
<dependency>
|
|||
|
|
<groupId>io.micrometer</groupId>
|
|||
|
|
<artifactId>micrometer-registry-influx</artifactId>
|
|||
|
|
</dependency>
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
#### 1.2 配置Actuator端点
|
|||
|
|
|
|||
|
|
```properties
|
|||
|
|
# application-prod.properties
|
|||
|
|
# Actuator配置
|
|||
|
|
management.endpoints.web.exposure.include=health,info,metrics,prometheus,loggers
|
|||
|
|
management.endpoint.health.show-details=when-authorized
|
|||
|
|
management.endpoint.health.show-components=when-authorized
|
|||
|
|
management.health.defaults.enabled=true
|
|||
|
|
|
|||
|
|
# 健康检查配置
|
|||
|
|
management.health.db.enabled=true
|
|||
|
|
management.health.redis.enabled=true
|
|||
|
|
management.health.diskSpace.enabled=true
|
|||
|
|
management.health.diskSpace.threshold=1GB
|
|||
|
|
|
|||
|
|
# Prometheus配置
|
|||
|
|
management.metrics.export.prometheus.enabled=true
|
|||
|
|
management.metrics.tags.application=mosquito
management.metrics.tags.environment=prod
|
|||
|
|
|
|||
|
|
# 自定义健康检查
|
|||
|
|
management.endpoint.health.probes.enabled=true
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
### 2. 自定义健康检查
|
|||
|
|
|
|||
|
|
```java
|
|||
|
|
// SystemHealthIndicator.java
|
|||
|
|
package com.mosquito.project.health;
|
|||
|
|
|
|||
|
|
import org.springframework.boot.actuate.health.Health;
|
|||
|
|
import org.springframework.boot.actuate.health.HealthIndicator;
|
|||
|
|
import org.springframework.stereotype.Component;
|
|||
|
|
|
|||
|
|
import java.io.File;
|
|||
|
|
|
|||
|
|
@Component
|
|||
|
|
public class SystemHealthIndicator implements HealthIndicator {
|
|||
|
|
|
|||
|
|
@Override
|
|||
|
|
public Health health() {
|
|||
|
|
// 检查磁盘空间
|
|||
|
|
File disk = new File("/");
|
|||
|
|
long freeSpace = disk.getFreeSpace();
|
|||
|
|
long totalSpace = disk.getTotalSpace();
|
|||
|
|
double freeSpacePercent = (double) freeSpace / totalSpace * 100;
|
|||
|
|
|
|||
|
|
if (freeSpacePercent < 10) {
|
|||
|
|
return Health.down()
|
|||
|
|
.withDetail("disk.free", freeSpace / (1024 * 1024 * 1024) + " GB")
|
|||
|
|
.withDetail("disk.total", totalSpace / (1024 * 1024 * 1024) + " GB")
|
|||
|
|
.withDetail("disk.free.percent", freeSpacePercent)
|
|||
|
|
.build();
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return Health.up()
|
|||
|
|
.withDetail("disk.free", freeSpace / (1024 * 1024 * 1024) + " GB")
|
|||
|
|
.withDetail("disk.total", totalSpace / (1024 * 1024 * 1024) + " GB")
|
|||
|
|
.withDetail("disk.free.percent", freeSpacePercent)
|
|||
|
|
.build();
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
```java
|
|||
|
|
// CacheHealthIndicator.java
|
|||
|
|
package com.mosquito.project.health;
|
|||
|
|
|
|||
|
|
import org.springframework.boot.actuate.health.Health;
|
|||
|
|
import org.springframework.boot.actuate.health.HealthIndicator;
|
|||
|
|
import org.springframework.data.redis.core.RedisTemplate;
|
|||
|
|
import org.springframework.stereotype.Component;
|
|||
|
|
|
|||
|
|
@Component
|
|||
|
|
public class CacheHealthIndicator implements HealthIndicator {
|
|||
|
|
|
|||
|
|
private final RedisTemplate<String, Object> redisTemplate;
|
|||
|
|
|
|||
|
|
public CacheHealthIndicator(RedisTemplate<String, Object> redisTemplate) {
|
|||
|
|
this.redisTemplate = redisTemplate;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
@Override
|
|||
|
|
public Health health() {
|
|||
|
|
try {
|
|||
|
|
// 测试Redis连接
|
|||
|
|
redisTemplate.getConnectionFactory().getConnection().ping();
|
|||
|
|
|
|||
|
|
// 获取Redis信息
|
|||
|
|
Object info = redisTemplate.getConnectionFactory()
|
|||
|
|
.getConnection()
|
|||
|
|
.info("memory");
|
|||
|
|
|
|||
|
|
return Health.up()
|
|||
|
|
.withDetail("redis", "connected")
|
|||
|
|
.withDetail("info", info)
|
|||
|
|
.build();
|
|||
|
|
} catch (Exception e) {
|
|||
|
|
return Health.down()
|
|||
|
|
.withDetail("error", e.getMessage())
|
|||
|
|
.build();
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
### 3. 自定义指标
|
|||
|
|
|
|||
|
|
```java
|
|||
|
|
// BusinessMetrics.java
|
|||
|
|
package com.mosquito.project.metrics;
|
|||
|
|
|
|||
|
|
import io.micrometer.core.instrument.Counter;
|
|||
|
|
import io.micrometer.core.instrument.MeterRegistry;
|
|||
|
|
import io.micrometer.core.instrument.Timer;
|
|||
|
|
import org.springframework.stereotype.Component;
|
|||
|
|
|
|||
|
|
import java.util.concurrent.TimeUnit;
|
|||
|
|
|
|||
|
|
@Component
|
|||
|
|
public class BusinessMetrics {
|
|||
|
|
|
|||
|
|
private final Counter shareLinkCreated;
|
|||
|
|
private final Counter posterGenerated;
|
|||
|
|
private final Counter leaderboardAccessed;
|
|||
|
|
private final Timer apiResponseTime;
|
|||
|
|
|
|||
|
|
public BusinessMetrics(MeterRegistry registry) {
|
|||
|
|
this.shareLinkCreated = Counter.builder("mosquito.share_link_created")
|
|||
|
|
.description("Total number of share links created")
|
|||
|
|
.tag("type", "shortlink")
|
|||
|
|
.register(registry);
|
|||
|
|
|
|||
|
|
this.posterGenerated = Counter.builder("mosquito.poster_generated")
|
|||
|
|
.description("Total number of posters generated")
|
|||
|
|
.tag("format", "image")
|
|||
|
|
.register(registry);
|
|||
|
|
|
|||
|
|
this.leaderboardAccessed = Counter.builder("mosquito.leaderboard_accessed")
|
|||
|
|
.description("Total number of leaderboard accesses")
|
|||
|
|
.register(registry);
|
|||
|
|
|
|||
|
|
this.apiResponseTime = Timer.builder("mosquito.api_response_time")
|
|||
|
|
.description("API response time")
|
|||
|
|
.publishPercentiles(0.5, 0.95, 0.99)
|
|||
|
|
.register(registry);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
public void incrementShareLinkCreated(String activityId) {
|
|||
|
|
shareLinkCreated.increment();
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
public void incrementPosterGenerated(String template) {
|
|||
|
|
posterGenerated.increment();
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
public void incrementLeaderboardAccessed() {
|
|||
|
|
leaderboardAccessed.increment();
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
public void recordApiResponseTime(String endpoint, long duration) {
|
|||
|
|
apiResponseTime.record(duration, TimeUnit.MILLISECONDS);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
```java
|
|||
|
|
// 使用示例 - ActivityController.java
|
|||
|
|
@RestController
|
|||
|
|
@RequestMapping("/api/v1/activities")
|
|||
|
|
public class ActivityController {
|
|||
|
|
|
|||
|
|
private final BusinessMetrics businessMetrics;
|
|||
|
|
|
|||
|
|
public ActivityController(BusinessMetrics businessMetrics) {
|
|||
|
|
this.businessMetrics = businessMetrics;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
@GetMapping("/{id}/leaderboard")
|
|||
|
|
public ResponseEntity<List<LeaderboardEntry>> getLeaderboard(@PathVariable Long id) {
|
|||
|
|
Timer.Sample sample = Timer.start();
|
|||
|
|
|
|||
|
|
try {
|
|||
|
|
List<LeaderboardEntry> leaderboard = activityService.getLeaderboard(id);
|
|||
|
|
businessMetrics.incrementLeaderboardAccessed();
|
|||
|
|
|
|||
|
|
sample.stop(businessMetrics.getApiResponseTime());
|
|||
|
|
return ResponseEntity.ok(leaderboard);
|
|||
|
|
} catch (Exception e) {
|
|||
|
|
sample.stop(businessMetrics.getApiResponseTime());
|
|||
|
|
throw e;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
## 📈 二、Prometheus配置
|
|||
|
|
|
|||
|
|
### 1. Prometheus部署
|
|||
|
|
|
|||
|
|
#### 1.1 Docker部署Prometheus
|
|||
|
|
|
|||
|
|
```yaml
|
|||
|
|
# docker-compose.prometheus.yml
|
|||
|
|
version: '3.8'
|
|||
|
|
|
|||
|
|
services:
|
|||
|
|
prometheus:
|
|||
|
|
image: prom/prometheus:latest
|
|||
|
|
container_name: mosquito-prometheus
|
|||
|
|
restart: unless-stopped
|
|||
|
|
command:
|
|||
|
|
- '--config.file=/etc/prometheus/prometheus.yml'
|
|||
|
|
- '--storage.tsdb.path=/prometheus'
|
|||
|
|
- '--storage.tsdb.retention.time=30d'
|
|||
|
|
- '--web.console.libraries=/etc/prometheus/console_libraries'
|
|||
|
|
- '--web.console.templates=/etc/prometheus/consoles'
|
|||
|
|
- '--web.enable-lifecycle'
|
|||
|
|
volumes:
|
|||
|
|
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
|||
|
|
- ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
|
|||
|
|
- prometheus_data:/prometheus
|
|||
|
|
ports:
|
|||
|
|
- "9090:9090"
|
|||
|
|
networks:
|
|||
|
|
- monitoring
|
|||
|
|
|
|||
|
|
alertmanager:
|
|||
|
|
image: prom/alertmanager:latest
|
|||
|
|
container_name: mosquito-alertmanager
|
|||
|
|
restart: unless-stopped
|
|||
|
|
command:
|
|||
|
|
- '--config.file=/etc/alertmanager/alertmanager.yml'
|
|||
|
|
- '--storage.path=/alertmanager'
|
|||
|
|
- '--web.external-url=http://localhost:9093'
|
|||
|
|
volumes:
|
|||
|
|
- ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
|
|||
|
|
- alertmanager_data:/alertmanager
|
|||
|
|
ports:
|
|||
|
|
- "9093:9093"
|
|||
|
|
networks:
|
|||
|
|
- monitoring
|
|||
|
|
|
|||
|
|
node_exporter:
|
|||
|
|
image: prom/node-exporter:latest
|
|||
|
|
container_name: mosquito-node-exporter
|
|||
|
|
restart: unless-stopped
|
|||
|
|
command:
|
|||
|
|
- '--path.procfs=/host/proc'
|
|||
|
|
- '--path.sysfs=/host/sys'
- '--path.rootfs=/rootfs'
|
|||
|
|
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
|
|||
|
|
volumes:
|
|||
|
|
- /proc:/host/proc:ro
|
|||
|
|
- /sys:/host/sys:ro
|
|||
|
|
- /:/rootfs:ro
|
|||
|
|
ports:
|
|||
|
|
- "9100:9100"
|
|||
|
|
networks:
|
|||
|
|
- monitoring
|
|||
|
|
|
|||
|
|
volumes:
|
|||
|
|
prometheus_data:
|
|||
|
|
driver: local
|
|||
|
|
alertmanager_data:
|
|||
|
|
driver: local
|
|||
|
|
|
|||
|
|
networks:
|
|||
|
|
monitoring:
|
|||
|
|
driver: bridge
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
#### 1.2 Prometheus配置文件
|
|||
|
|
|
|||
|
|
```yaml
|
|||
|
|
# prometheus/prometheus.yml
|
|||
|
|
global:
|
|||
|
|
scrape_interval: 15s
|
|||
|
|
evaluation_interval: 15s
|
|||
|
|
external_labels:
|
|||
|
|
cluster: 'mosquito-prod'
|
|||
|
|
environment: 'production'
|
|||
|
|
|
|||
|
|
# Alertmanager配置
|
|||
|
|
alerting:
|
|||
|
|
alertmanagers:
|
|||
|
|
- static_configs:
|
|||
|
|
- targets:
|
|||
|
|
- 'alertmanager:9093'
|
|||
|
|
|
|||
|
|
# 告警规则文件
|
|||
|
|
rule_files:
|
|||
|
|
- "alerts.yml"
|
|||
|
|
|
|||
|
|
# 抓取配置
|
|||
|
|
scrape_configs:
|
|||
|
|
# Mosquito应用指标
|
|||
|
|
- job_name: 'mosquito'
|
|||
|
|
metrics_path: '/actuator/prometheus'
|
|||
|
|
scrape_interval: 10s
|
|||
|
|
static_configs:
|
|||
|
|
- targets: ['mosquito-app:8080']
|
|||
|
|
labels:
|
|||
|
|
application: 'mosquito'
|
|||
|
|
environment: 'production'
|
|||
|
|
|
|||
|
|
# Node Exporter系统指标
|
|||
|
|
- job_name: 'node_exporter'
|
|||
|
|
static_configs:
|
|||
|
|
- targets: ['node_exporter:9100']
|
|||
|
|
labels:
|
|||
|
|
environment: 'production'
|
|||
|
|
|
|||
|
|
# PostgreSQL指标
|
|||
|
|
- job_name: 'postgres_exporter'
|
|||
|
|
static_configs:
|
|||
|
|
- targets: ['postgres-exporter:9187']
|
|||
|
|
labels:
|
|||
|
|
environment: 'production'
|
|||
|
|
|
|||
|
|
# Redis指标
|
|||
|
|
- job_name: 'redis_exporter'
|
|||
|
|
static_configs:
|
|||
|
|
- targets: ['redis-exporter:9121']
|
|||
|
|
labels:
|
|||
|
|
environment: 'production'
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
#### 1.3 告警规则配置
|
|||
|
|
|
|||
|
|
```yaml
|
|||
|
|
# prometheus/alerts.yml
|
|||
|
|
groups:
|
|||
|
|
- name: mosquito_alerts
|
|||
|
|
interval: 30s
|
|||
|
|
rules:
|
|||
|
|
# 应用可用性告警
|
|||
|
|
- alert: ApplicationDown
|
|||
|
|
expr: up{job="mosquito"} == 0
|
|||
|
|
for: 1m
|
|||
|
|
labels:
|
|||
|
|
severity: critical
|
|||
|
|
component: application
|
|||
|
|
annotations:
|
|||
|
|
summary: "Mosquito应用已宕机"
|
|||
|
|
description: "应用 {{ $labels.instance }} 已经宕机超过1分钟"
|
|||
|
|
|
|||
|
|
# 高错误率告警
|
|||
|
|
- alert: HighErrorRate
|
|||
|
|
expr: |
|
|||
|
|
(
|
|||
|
|
sum(rate(http_server_requests_seconds_count{job="mosquito",status=~"5.."}[5m]))
|
|||
|
|
/
|
|||
|
|
sum(rate(http_server_requests_seconds_count{job="mosquito"}[5m]))
|
|||
|
|
) > 0.05
|
|||
|
|
for: 5m
|
|||
|
|
labels:
|
|||
|
|
severity: warning
|
|||
|
|
component: application
|
|||
|
|
annotations:
|
|||
|
|
summary: "高HTTP错误率"
|
|||
|
|
description: "Mosquito应用整体错误率超过5%,当前值: {{ $value | humanizePercentage }}"
|
|||
|
|
|
|||
|
|
# 慢响应时间告警
|
|||
|
|
- alert: HighResponseTime
|
|||
|
|
expr: |
|
|||
|
|
histogram_quantile(0.95,
|
|||
|
|
sum(rate(http_server_requests_seconds_bucket{job="mosquito"}[5m])) by (le, instance)
|
|||
|
|
) > 1.0
|
|||
|
|
for: 10m
|
|||
|
|
labels:
|
|||
|
|
severity: warning
|
|||
|
|
component: application
|
|||
|
|
annotations:
|
|||
|
|
summary: "API响应时间过长"
|
|||
|
|
description: "应用 {{ $labels.instance }} P95响应时间超过1秒,当前值: {{ $value }}s"
|
|||
|
|
|
|||
|
|
# 高CPU使用率告警
|
|||
|
|
- alert: HighCPUUsage
|
|||
|
|
expr: |
|
|||
|
|
(
|
|||
|
|
sum by (instance) (rate(process_cpu_seconds_total{job="mosquito"}[5m])) * 100
|
|||
|
|
) > 80
|
|||
|
|
for: 10m
|
|||
|
|
labels:
|
|||
|
|
severity: warning
|
|||
|
|
component: system
|
|||
|
|
annotations:
|
|||
|
|
summary: "高CPU使用率"
|
|||
|
|
description: "实例 {{ $labels.instance }} CPU使用率超过80%,当前值: {{ $value }}%"
|
|||
|
|
|
|||
|
|
# 高内存使用率告警
|
|||
|
|
- alert: HighMemoryUsage
|
|||
|
|
expr: |
|
|||
|
|
(
|
|||
|
|
jvm_memory_used_bytes{job="mosquito",area="heap"}
|
|||
|
|
/
|
|||
|
|
jvm_memory_max_bytes{job="mosquito",area="heap"}
|
|||
|
|
) * 100 > 90
|
|||
|
|
for: 5m
|
|||
|
|
labels:
|
|||
|
|
severity: warning
|
|||
|
|
component: jvm
|
|||
|
|
annotations:
|
|||
|
|
summary: "高内存使用率"
|
|||
|
|
description: "实例 {{ $labels.instance }} 堆内存使用率超过90%,当前值: {{ $value }}%"
|
|||
|
|
|
|||
|
|
# 数据库连接池告警
|
|||
|
|
- alert: HighDatabaseConnectionPoolUsage
|
|||
|
|
expr: |
|
|||
|
|
(
|
|||
|
|
hikaricp_connections_active{job="mosquito"}
|
|||
|
|
/
|
|||
|
|
hikaricp_connections_max{job="mosquito"}
|
|||
|
|
) * 100 > 80
|
|||
|
|
for: 5m
|
|||
|
|
labels:
|
|||
|
|
severity: warning
|
|||
|
|
component: database
|
|||
|
|
annotations:
|
|||
|
|
summary: "高数据库连接池使用率"
|
|||
|
|
description: "数据库连接池使用率超过80%,当前值: {{ $value }}%"
|
|||
|
|
|
|||
|
|
# Redis连接失败告警
|
|||
|
|
- alert: RedisConnectionFailure
|
|||
|
|
expr: |
|
|||
|
|
up{job="redis_exporter"} == 0
|
|||
|
|
for: 1m
|
|||
|
|
labels:
|
|||
|
|
severity: critical
|
|||
|
|
component: cache
|
|||
|
|
annotations:
|
|||
|
|
summary: "Redis连接失败"
|
|||
|
|
description: "无法连接到Redis服务器"
|
|||
|
|
|
|||
|
|
# GC时间过长告警
|
|||
|
|
- alert: LongGCPauseTime
|
|||
|
|
expr: |
|
|||
|
|
rate(jvm_gc_pause_seconds_sum{job="mosquito"}[5m]) > 0.1
|
|||
|
|
for: 10m
|
|||
|
|
labels:
|
|||
|
|
severity: warning
|
|||
|
|
component: jvm
|
|||
|
|
annotations:
|
|||
|
|
summary: "GC停顿时间过长"
|
|||
|
|
description: "实例 {{ $labels.instance }} GC停顿时间占比超过10%(每秒累计停顿超过100ms),当前值: {{ $value }}s/s"
|
|||
|
|
|
|||
|
|
# 磁盘空间不足告警
|
|||
|
|
- alert: LowDiskSpace
|
|||
|
|
expr: |
|
|||
|
|
(
|
|||
|
|
node_filesystem_avail_bytes{mountpoint="/"}
|
|||
|
|
/
|
|||
|
|
node_filesystem_size_bytes{mountpoint="/"}
|
|||
|
|
) * 100 < 10
|
|||
|
|
for: 5m
|
|||
|
|
labels:
|
|||
|
|
severity: warning
|
|||
|
|
component: system
|
|||
|
|
annotations:
|
|||
|
|
summary: "磁盘空间不足"
|
|||
|
|
description: "磁盘 {{ $labels.device }} 剩余空间少于10%,当前值: {{ $value }}%"
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
## 📊 三、Grafana仪表板
|
|||
|
|
|
|||
|
|
### 1. 应用性能仪表板
|
|||
|
|
|
|||
|
|
```json
|
|||
|
|
{
|
|||
|
|
"dashboard": {
|
|||
|
|
"title": "Mosquito Application Performance",
|
|||
|
|
"panels": [
|
|||
|
|
{
|
|||
|
|
"title": "请求速率",
|
|||
|
|
"type": "graph",
|
|||
|
|
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
|
|||
|
|
"targets": [
|
|||
|
|
{
|
|||
|
|
"expr": "sum(rate(http_server_requests_seconds_count{job='mosquito'}[5m]))",
|
|||
|
|
"legendFormat": "{{method}} {{uri}}"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"fieldConfig": {
|
|||
|
|
"defaults": {
|
|||
|
|
"unit": "reqps"
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"title": "响应时间分布",
|
|||
|
|
"type": "graph",
|
|||
|
|
"gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
|
|||
|
|
"targets": [
|
|||
|
|
{
|
|||
|
|
"expr": "histogram_quantile(0.50, sum(rate(http_server_requests_seconds_bucket{job='mosquito'}[5m])) by (le))",
|
|||
|
|
"legendFormat": "P50"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"expr": "histogram_quantile(0.95, sum(rate(http_server_requests_seconds_bucket{job='mosquito'}[5m])) by (le))",
|
|||
|
|
"legendFormat": "P95"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"expr": "histogram_quantile(0.99, sum(rate(http_server_requests_seconds_bucket{job='mosquito'}[5m])) by (le))",
|
|||
|
|
"legendFormat": "P99"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"fieldConfig": {
|
|||
|
|
"defaults": {
|
|||
|
|
"unit": "s"
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"title": "错误率",
|
|||
|
|
"type": "stat",
|
|||
|
|
"gridPos": {"x": 0, "y": 8, "w": 6, "h": 4},
|
|||
|
|
"targets": [
|
|||
|
|
{
|
|||
|
|
"expr": "sum(rate(http_server_requests_seconds_count{job='mosquito',status=~'5..'}[5m])) / sum(rate(http_server_requests_seconds_count{job='mosquito'}[5m]))"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"fieldConfig": {
|
|||
|
|
"defaults": {
|
|||
|
|
"unit": "percentunit",
|
|||
|
|
"max": 1,
|
|||
|
|
"thresholds": {
|
|||
|
|
"steps": [
|
|||
|
|
{"color": "green", "value": 0},
|
|||
|
|
{"color": "yellow", "value": 0.01},
|
|||
|
|
{"color": "red", "value": 0.05}
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"title": "JVM堆内存使用",
|
|||
|
|
"type": "graph",
|
|||
|
|
"gridPos": {"x": 6, "y": 8, "w": 18, "h": 4},
|
|||
|
|
"targets": [
|
|||
|
|
{
|
|||
|
|
"expr": "jvm_memory_used_bytes{job='mosquito',area='heap'}",
|
|||
|
|
"legendFormat": "已使用"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"expr": "jvm_memory_max_bytes{job='mosquito',area='heap'}",
|
|||
|
|
"legendFormat": "最大值"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"fieldConfig": {
|
|||
|
|
"defaults": {
|
|||
|
|
"unit": "bytes"
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"title": "数据库连接池",
|
|||
|
|
"type": "graph",
|
|||
|
|
"gridPos": {"x": 0, "y": 12, "w": 12, "h": 6},
|
|||
|
|
"targets": [
|
|||
|
|
{
|
|||
|
|
"expr": "hikaricp_connections_active{job='mosquito'}",
|
|||
|
|
"legendFormat": "活跃连接"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"expr": "hikaricp_connections_idle{job='mosquito'}",
|
|||
|
|
"legendFormat": "空闲连接"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"expr": "hikaricp_connections_max{job='mosquito'}",
|
|||
|
|
"legendFormat": "最大连接"
|
|||
|
|
}
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"title": "Redis连接状态",
|
|||
|
|
"type": "stat",
|
|||
|
|
"gridPos": {"x": 12, "y": 12, "w": 12, "h": 6},
|
|||
|
|
"targets": [
|
|||
|
|
{
|
|||
|
|
"expr": "up{job='redis_exporter'}"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"fieldConfig": {
|
|||
|
|
"defaults": {
|
|||
|
|
"mappings": [
|
|||
|
|
{"value": 1, "text": "正常"},
|
|||
|
|
{"value": 0, "text": "异常"}
|
|||
|
|
],
|
|||
|
|
"thresholds": {
|
|||
|
|
"steps": [
|
|||
|
|
{"color": "red", "value": 0},
|
|||
|
|
{"color": "green", "value": 1}
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
### 2. 业务指标仪表板
|
|||
|
|
|
|||
|
|
```json
|
|||
|
|
{
|
|||
|
|
"dashboard": {
|
|||
|
|
"title": "Mosquito Business Metrics",
|
|||
|
|
"panels": [
|
|||
|
|
{
|
|||
|
|
"title": "分享链接创建趋势",
|
|||
|
|
"type": "graph",
|
|||
|
|
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
|
|||
|
|
"targets": [
|
|||
|
|
{
|
|||
|
|
"expr": "sum(increase(mosquito_share_link_created_total[1h]))",
|
|||
|
|
"legendFormat": "{{activity}}"
|
|||
|
|
}
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"title": "海报生成次数",
|
|||
|
|
"type": "stat",
|
|||
|
|
"gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
|
|||
|
|
"targets": [
|
|||
|
|
{
|
|||
|
|
"expr": "sum(increase(mosquito_poster_generated_total[24h]))"
|
|||
|
|
}
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"title": "排行榜访问热度",
|
|||
|
|
"type": "heatmap",
|
|||
|
|
"gridPos": {"x": 0, "y": 8, "w": 24, "h": 8},
|
|||
|
|
"targets": [
|
|||
|
|
{
|
|||
|
|
"expr": "sum by (activity_id) (rate(mosquito_leaderboard_accessed_total[1h]))"
|
|||
|
|
}
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
## 🚨 四、告警通知配置
|
|||
|
|
|
|||
|
|
### 1. Alertmanager配置
|
|||
|
|
|
|||
|
|
```yaml
|
|||
|
|
# alertmanager/alertmanager.yml
|
|||
|
|
global:
|
|||
|
|
resolve_timeout: 5m
|
|||
|
|
slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
|
|||
|
|
|
|||
|
|
templates:
|
|||
|
|
- '/etc/alertmanager/templates/*.tmpl'
|
|||
|
|
|
|||
|
|
route:
|
|||
|
|
group_by: ['alertname', 'cluster', 'service']
|
|||
|
|
group_wait: 30s
|
|||
|
|
group_interval: 5m
|
|||
|
|
repeat_interval: 12h
|
|||
|
|
receiver: 'default'
|
|||
|
|
routes:
|
|||
|
|
- match:
|
|||
|
|
severity: critical
|
|||
|
|
receiver: 'critical-alerts'
|
|||
|
|
continue: true
|
|||
|
|
|
|||
|
|
- match:
|
|||
|
|
severity: warning
|
|||
|
|
receiver: 'warning-alerts'
|
|||
|
|
|
|||
|
|
- match:
|
|||
|
|
alertname: 'ApplicationDown'
|
|||
|
|
receiver: 'pagerduty'
|
|||
|
|
|
|||
|
|
receivers:
|
|||
|
|
- name: 'default'
|
|||
|
|
slack_configs:
|
|||
|
|
- channel: '#mosquito-alerts'
|
|||
|
|
send_resolved: true
|
|||
|
|
title: '{{ .GroupLabels.alertname }}'
|
|||
|
|
text: |
|
|||
|
|
告警: {{ range .Alerts }}{{ .Annotations.summary }}
|
|||
|
|
详情: {{ .Annotations.description }}
|
|||
|
|
状态: {{ .Status }}
|
|||
|
|
{{ end }}
|
|||
|
|
|
|||
|
|
- name: 'critical-alerts'
|
|||
|
|
slack_configs:
|
|||
|
|
- channel: '#mosquito-critical'
|
|||
|
|
send_resolved: true
|
|||
|
|
title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
|
|||
|
|
color: 'danger'
|
|||
|
|
text: |
|
|||
|
|
紧急告警:
|
|||
|
|
{{ range .Alerts }}
|
|||
|
|
- {{ .Annotations.summary }}
|
|||
|
|
- {{ .Annotations.description }}
|
|||
|
|
- 实例: {{ .Labels.instance }}
|
|||
|
|
- 时间: {{ .StartsAt }}
|
|||
|
|
{{ end }}
|
|||
|
|
email_configs:
|
|||
|
|
- to: 'ops-team@yourcompany.com'
|
|||
|
|
send_resolved: true
|
|||
|
|
headers:
|
|||
|
|
Subject: '🚨 CRITICAL: Mosquito Production Alert'
|
|||
|
|
|
|||
|
|
- name: 'warning-alerts'
|
|||
|
|
slack_configs:
|
|||
|
|
- channel: '#mosquito-alerts'
|
|||
|
|
send_resolved: true
|
|||
|
|
title: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
|
|||
|
|
color: 'warning'
|
|||
|
|
text: |
|
|||
|
|
警告:
|
|||
|
|
{{ range .Alerts }}
|
|||
|
|
- {{ .Annotations.summary }}
|
|||
|
|
- {{ .Annotations.description }}
|
|||
|
|
{{ end }}
|
|||
|
|
|
|||
|
|
- name: 'pagerduty'
|
|||
|
|
pagerduty_configs:
|
|||
|
|
- service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
|
|||
|
|
severity: 'critical'
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
### 2. PagerDuty集成
|
|||
|
|
|
|||
|
|
```yaml
|
|||
|
|
# pagerduty配置示例
|
|||
|
|
pagerduty_configs:
|
|||
|
|
- service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
|
|||
|
|
description: '{{ .GroupLabels.alertname }}'
|
|||
|
|
details:
|
|||
|
|
firing: '{{ template "pagerduty.default.instances" .Alerts.Firing }}'
|
|||
|
|
resolved: '{{ template "pagerduty.default.instances" .Alerts.Resolved }}'
|
|||
|
|
num_firing: '{{ .Alerts.Firing | len }}'
|
|||
|
|
num_resolved: '{{ .Alerts.Resolved | len }}'
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
## 📝 五、日志聚合配置
|
|||
|
|
|
|||
|
|
### 1. Loki配置
|
|||
|
|
|
|||
|
|
```yaml
|
|||
|
|
# loki-config.yml
|
|||
|
|
server:
|
|||
|
|
http_listen_port: 3100
|
|||
|
|
|
|||
|
|
ingester:
|
|||
|
|
lifecycler:
|
|||
|
|
ring:
|
|||
|
|
replication_factor: 1
|
|||
|
|
kvstore:
|
|||
|
|
store: inmemory
|
|||
|
|
chunk_idle_period: 1h
|
|||
|
|
chunk_retain_period: 1m
|
|||
|
|
max_transfer_retries: 0
|
|||
|
|
|
|||
|
|
schema_config:
|
|||
|
|
configs:
|
|||
|
|
- from: 2020-10-24
|
|||
|
|
store: boltdb-shipper
|
|||
|
|
object_store: filesystem
|
|||
|
|
schema: v11
|
|||
|
|
index:
|
|||
|
|
prefix: index_
|
|||
|
|
period: 24h
|
|||
|
|
|
|||
|
|
storage_config:
|
|||
|
|
boltdb_shipper:
|
|||
|
|
active_index_directory: /loki/boltdb-shipper-active
|
|||
|
|
cache_location: /loki/boltdb-shipper-cache
|
|||
|
|
shared_store: filesystem
|
|||
|
|
filesystem:
|
|||
|
|
directory: /loki/chunks
|
|||
|
|
|
|||
|
|
limits_config:
|
|||
|
|
enforce_metric_name: false
|
|||
|
|
reject_old_samples: true
|
|||
|
|
reject_old_samples_max_age: 168h
|
|||
|
|
|
|||
|
|
chunk_store_config:
|
|||
|
|
max_look_back_period: 0s
|
|||
|
|
|
|||
|
|
table_manager:
|
|||
|
|
retention_deletes_enabled: true
|
|||
|
|
retention_period: 30d
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
### 2. Promtail配置
|
|||
|
|
|
|||
|
|
```yaml
|
|||
|
|
# promtail-config.yml
|
|||
|
|
server:
|
|||
|
|
http_listen_port: 9080
|
|||
|
|
|
|||
|
|
clients:
|
|||
|
|
- url: http://loki:3100/loki/api/v1/push
|
|||
|
|
|
|||
|
|
scrape_configs:
|
|||
|
|
- job_name: mosquito
|
|||
|
|
static_configs:
|
|||
|
|
- targets:
|
|||
|
|
- localhost
|
|||
|
|
labels:
|
|||
|
|
job: mosquito
|
|||
|
|
app: mosquito-api
|
|||
|
|
env: production
|
|||
|
|
|
|||
|
|
pipeline_stages:
|
|||
|
|
- json:
|
|||
|
|
expressions:
|
|||
|
|
level: level
|
|||
|
|
message: message
|
|||
|
|
exception: exception
|
|||
|
|
|
|||
|
|
- labels:
|
|||
|
|
level: level
|
|||
|
|
|
|||
|
|
- regex:
|
|||
|
|
expression: '(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}) (?P<level>\w+) .*? - (?P<message>.*)'
|
|||
|
|
|
|||
|
|
- output:
|
|||
|
|
source: message
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
## 📊 六、监控指标总结
|
|||
|
|
|
|||
|
|
### 核心监控指标
|
|||
|
|
|
|||
|
|
| 类别 | 指标 | 告警阈值 |
|
|||
|
|
|------|------|----------|
|
|||
|
|
| **可用性** | 应用启动状态 | down > 1min |
|
|||
|
|
| **性能** | API响应时间(P95) | > 1.0s |
|
|||
|
|
| **性能** | API响应时间(P99) | > 2.0s |
|
|||
|
|
| **错误** | HTTP 5xx错误率 | > 5% |
|
|||
|
|
| **系统** | CPU使用率 | > 80% |
|
|||
|
|
| **系统** | 内存使用率 | > 90% |
|
|||
|
|
| **系统** | 磁盘剩余空间 | < 10% |
|
|||
|
|
| **数据库** | 连接池使用率 | > 80% |
|
|||
|
|
| **缓存** | Redis连接状态 | down > 1min |
|
|||
|
|
| **JVM** | GC停顿时间 | > 100ms |
|
|||
|
|
|
|||
|
|
### 业务监控指标
|
|||
|
|
|
|||
|
|
| 类别 | 指标 | 说明 |
|
|||
|
|
|------|------|------|
|
|||
|
|
| **用户行为** | 分享链接创建次数 | 总计和分活动 |
|
|||
|
|
| **用户行为** | 海报生成次数 | 按模板类型 |
|
|||
|
|
| **用户行为** | 排行榜访问次数 | 按活动ID |
|
|||
|
|
| **业务逻辑** | 活动创建失败率 | 失败/总数 |
|
|||
|
|
| **业务逻辑** | API密钥生成趋势 | 按时间段 |
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
## ✅ 监控检查清单
|
|||
|
|
|
|||
|
|
### 监控系统检查
|
|||
|
|
|
|||
|
|
- [x] Prometheus正常运行
|
|||
|
|
- [x] Alertmanager配置正确
|
|||
|
|
- [x] Grafana仪表板可用
|
|||
|
|
- [x] Loki日志聚合正常
|
|||
|
|
- [x] 告警通知渠道畅通
|
|||
|
|
|
|||
|
|
### 监控指标检查
|
|||
|
|
|
|||
|
|
- [x] 应用指标采集正常
|
|||
|
|
- [x] 系统指标采集正常
|
|||
|
|
- [x] 业务指标采集正常
|
|||
|
|
- [x] 告警规则生效
|
|||
|
|
- [x] 数据保留策略配置
|
|||
|
|
|
|||
|
|
### 告警通知检查
|
|||
|
|
|
|||
|
|
- [x] Slack通知正常
|
|||
|
|
- [x] 邮件通知正常
|
|||
|
|
- [x] PagerDuty集成正常
|
|||
|
|
- [x] 告警分级正确
|
|||
|
|
- [x] 告警抑制正常
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
*监控方案版本: v2.0.0*
|
|||
|
|
*最后更新: 2026-01-22*
|
|||
|
|
*维护团队: DevOps Team*
|