Files
user-system/docs/DEPLOYMENT.md

23 KiB
Raw Permalink Blame History

部署和运维指南

概述

本文档描述用户管理系统的部署方案和运维规范,包括容器化部署、集群部署、监控告警、日志管理等。


1. 部署方案

1.1 系统架构

┌─────────────────────────────────────────────────────────┐
│                      负载均衡 (Nginx)                   │
└────────────────────┬────────────────────────────────────┘
                     │
        ┌────────────┴────────────┐
        │                         │
┌───────▼────────┐     ┌────────▼────────┐
│   应用实例 1   │     │   应用实例 N   │
│   (Port 8080)  │     │   (Port 8080)  │
└───────┬────────┘     └────────┬────────┘
        │                       │
        └───────────┬───────────┘
                    │
        ┌───────────┴───────────┐
        │                       │
┌───────▼────────┐    ┌────────▼────────┐
│   MySQL        │    │   Redis         │
│   (主从复制)   │    │   (哨兵模式)    │
└────────────────┘    └─────────────────┘

1.2 Docker 部署

单机 Docker 部署

docker-compose.yml(单机版)

# Single-host deployment: one application container using a local SQLite file.
version: "3.8"

services:
  user-management:
    image: "user-management-system:1.0.0"
    container_name: user-ms
    restart: unless-stopped
    ports:
      - "8080:8080"
    # Bind mounts keep the database, configuration and logs on the host.
    volumes:
      - ./data:/app/data
      - ./config:/app/config
      - ./logs:/app/logs
    environment:
      SPRING_PROFILES_ACTIVE: docker
      DATABASE_TYPE: sqlite
      DATABASE_PATH: /app/data/user_management.db
    # Probe the readiness endpoint from inside the container.
    healthcheck:
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:8080/health/ready"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

启动命令

# Start in the background
docker-compose up -d

# Follow the logs
docker-compose logs -f

# Stop and remove the containers
docker-compose down

# Stop and also remove volumes (-v).
# NOTE(review): -v only removes named/anonymous volumes. The single-host compose
# file above uses bind mounts (./data, ./config, ./logs), so those host
# directories are presumably left in place - delete them manually if the data
# really has to go.
docker-compose down -v

集群 Docker 部署

目录结构

deployment/
├── docker/
│   ├── auth-service/
│   │   └── Dockerfile
│   ├── user-service/
│   │   └── Dockerfile
│   ├── permission-service/
│   │   └── Dockerfile
│   └── gateway/
│       └── Dockerfile
├── docker-compose.yml
├── docker-compose.prod.yml
└── init/
    └── init.sql

Dockerfile 示例(Go)

# Build stage: compile a static Linux/amd64 binary with the Go toolchain.
FROM golang:1.21-alpine AS builder

WORKDIR /app

# Copy the module files first so the dependency download layer is reused
# as long as go.mod / go.sum do not change.
COPY go.mod go.sum ./
RUN go mod download

# Copy the rest of the source tree.
COPY . .

# CGO is disabled so the binary has no libc dependency; -w -s strips debug info.
RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \
    -ldflags="-w -s" \
    -o user-service \
    ./cmd/user-service

# Runtime stage: pinned base image (instead of the moving "latest" tag) so
# rebuilds are reproducible.
FROM alpine:3.19

# CA certificates for outbound TLS, tzdata for time zones, and an unprivileged
# account so the service does not run as root.
RUN apk --no-cache add ca-certificates tzdata \
    && addgroup -S app \
    && adduser -S -G app app

WORKDIR /app

COPY --from=builder /app/user-service .

USER app

EXPOSE 8080

CMD ["./user-service"]

docker-compose.yml

# Cluster stack: MySQL, Redis, three backend services, the gateway, and
# Prometheus/Grafana for monitoring.
#
# Credentials can be overridden from the shell or an .env file
# (MYSQL_ROOT_PASSWORD, MYSQL_PASSWORD, GRAFANA_ADMIN_PASSWORD). The defaults
# are the example values and must be changed for any real deployment.
#
# Relative bind-mount paths are resolved against the directory that contains
# this file (deployment/ in the layout shown in this guide).
version: '3.8'

services:
  mysql:
    image: mysql:8.0
    container_name: user-ms-mysql
    environment:
      MYSQL_ROOT_PASSWORD: "${MYSQL_ROOT_PASSWORD:-root_password}"
      MYSQL_DATABASE: user_management
      MYSQL_USER: app_user
      MYSQL_PASSWORD: "${MYSQL_PASSWORD:-app_password}"
    ports:
      - "3306:3306"
    volumes:
      - mysql-data:/var/lib/mysql
      - ./init/init.sql:/docker-entrypoint-initdb.d/init.sql:ro
    # Ping over TCP (127.0.0.1) rather than the local socket so the check
    # targets the server's network listener.
    healthcheck:
      test: ["CMD", "mysqladmin", "ping", "-h", "127.0.0.1", "--silent"]
      interval: 10s
      timeout: 5s
      retries: 10
      start_period: 30s
    networks:
      - user-ms-network

  redis:
    image: redis:7-alpine
    container_name: user-ms-redis
    ports:
      - "6379:6379"
    volumes:
      - redis-data:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 3s
      retries: 5
    networks:
      - user-ms-network

  auth-service:
    build:
      context: ./docker/auth-service
    container_name: user-ms-auth-service
    environment:
      - SPRING_PROFILES_ACTIVE=prod
      - DB_HOST=mysql
      - DB_PORT=3306
      - DB_NAME=user_management
      - DB_USER=app_user
      - DB_PASSWORD=${MYSQL_PASSWORD:-app_password}
      - REDIS_HOST=redis
      - REDIS_PORT=6379
    ports:
      - "8081:8080"
    # Wait until the dependencies pass their health checks; the short list form
    # only waits for the containers to be started.
    depends_on:
      mysql:
        condition: service_healthy
      redis:
        condition: service_healthy
    networks:
      - user-ms-network

  user-service:
    build:
      context: ./docker/user-service
    container_name: user-ms-user-service
    environment:
      - SPRING_PROFILES_ACTIVE=prod
      - DB_HOST=mysql
      - DB_PORT=3306
      - DB_NAME=user_management
      - DB_USER=app_user
      - DB_PASSWORD=${MYSQL_PASSWORD:-app_password}
      - REDIS_HOST=redis
      - REDIS_PORT=6379
    ports:
      - "8082:8080"
    depends_on:
      mysql:
        condition: service_healthy
      redis:
        condition: service_healthy
    networks:
      - user-ms-network

  permission-service:
    build:
      context: ./docker/permission-service
    container_name: user-ms-permission-service
    environment:
      - SPRING_PROFILES_ACTIVE=prod
      - DB_HOST=mysql
      - DB_PORT=3306
      - DB_NAME=user_management
      - DB_USER=app_user
      - DB_PASSWORD=${MYSQL_PASSWORD:-app_password}
      - REDIS_HOST=redis
      - REDIS_PORT=6379
    ports:
      - "8083:8080"
    depends_on:
      mysql:
        condition: service_healthy
      redis:
        condition: service_healthy
    networks:
      - user-ms-network

  gateway:
    build:
      context: ./docker/gateway
    container_name: user-ms-gateway
    environment:
      - AUTH_SERVICE_URL=http://auth-service:8080
      - USER_SERVICE_URL=http://user-service:8080
      - PERMISSION_SERVICE_URL=http://permission-service:8080
    ports:
      - "8080:8080"
    depends_on:
      - auth-service
      - user-service
      - permission-service
    networks:
      - user-ms-network

  prometheus:
    # NOTE(review): "latest" is a moving tag; consider pinning a release.
    image: prom/prometheus:latest
    container_name: user-ms-prometheus
    ports:
      - "9090:9090"
    volumes:
      # Path is relative to this file's directory, like ./init and ./docker
      # above. NOTE(review): assumes prometheus/ sits next to this file - confirm.
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus
    networks:
      - user-ms-network

  grafana:
    # NOTE(review): "latest" is a moving tag; consider pinning a release.
    image: grafana/grafana:latest
    container_name: user-ms-grafana
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
    volumes:
      - grafana-data:/var/lib/grafana
      # NOTE(review): assumes grafana/ sits next to this file - confirm.
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
    networks:
      - user-ms-network

volumes:
  mysql-data:
  redis-data:
  prometheus-data:
  grafana-data:

networks:
  user-ms-network:
    driver: bridge

启动命令

# 开发环境
docker-compose up -d

# 生产环境
docker-compose -f docker-compose.prod.yml up -d

# 查看日志
docker-compose logs -f

# 停止服务
docker-compose down

# 清理数据
docker-compose down -v

1.3 Kubernetes 部署

Helm Charts 结构

deployment/kubernetes/helm/user-management-system/
├── Chart.yaml
├── values.yaml
├── values-prod.yaml
└── templates/
    ├── _helpers.tpl
    ├── deployment.yaml
    ├── service.yaml
    ├── ingress.yaml
    ├── configmap.yaml
    ├── secret.yaml
    ├── hpa.yaml
    └── pdb.yaml

values.yaml

# 默认配置
replicaCount: 2

image:
  repository: example.com/user-management-system
  pullPolicy: IfNotPresent
  tag: "1.0.0"

imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""

serviceAccount:
  create: true
  annotations: {}
  name: ""

podAnnotations: {}

podSecurityContext: {}
  # fsGroup: 2000

securityContext: {}
  # capabilities:
  #   drop:
  #   - ALL
  # readOnlyRootFilesystem: true
  # runAsNonRoot: true
  # runAsUser: 1000

service:
  type: ClusterIP
  port: 8080

ingress:
  enabled: true
  className: "nginx"
  annotations: {}
    # kubernetes.io/ingress.class: nginx
    # cert-manager.io/cluster-issuer: letsencrypt-prod
  hosts:
    - host: api.example.com
      paths:
        - path: /
          pathType: Prefix
  tls: []
  #  - secretName: user-ms-tls
  #    hosts:
  #      - api.example.com

resources:
  limits:
    cpu: 1000m
    memory: 512Mi
  requests:
    cpu: 500m
    memory: 256Mi

autoscaling:
  enabled: true
  minReplicas: 2
  maxReplicas: 10
  targetCPUUtilizationPercentage: 70
  targetMemoryUtilizationPercentage: 80

nodeSelector: {}

tolerations: []

affinity: {}

# Database connection settings
database:
  host: mysql-service
  port: 3306
  name: user_management
  username: app_user
  # NOTE(review): plaintext example credential. Presumably it should be supplied
  # at deploy time (e.g. --set or an existing Secret) rather than committed -
  # confirm how the chart's secret template consumes this value.
  password: app_password

# Redis connection settings
redis:
  host: redis-service
  port: 6379
  # Empty string: no Redis password is configured by default.
  password: ""
  database: 0

# 环境变量
env:
  - name: SPRING_PROFILES_ACTIVE
    value: "prod"
  - name: LOG_LEVEL
    value: "info"

# 健康检查
livenessProbe:
  httpGet:
    path: /health/live
    port: 8080
  initialDelaySeconds: 30
  periodSeconds: 10
  timeoutSeconds: 5
  failureThreshold: 3

readinessProbe:
  httpGet:
    path: /health/ready
    port: 8080
  initialDelaySeconds: 10
  periodSeconds: 5
  timeoutSeconds: 3
  failureThreshold: 3

部署命令

# Install the chart as release "user-ms" (creates the namespace if missing)
helm install user-ms ./user-management-system \
  -f values-prod.yaml \
  --namespace production \
  --create-namespace

# Upgrade the existing release
helm upgrade user-ms ./user-management-system \
  -f values-prod.yaml \
  --namespace production

# Roll back to revision 1.
# NOTE(review): the trailing "1" is a fixed revision number, not "the previous
# release". Check `helm history user-ms --namespace production` for the revision
# you actually want; omitting the number rolls back to the previous revision.
helm rollback user-ms 1 --namespace production

# Uninstall the release
helm uninstall user-ms --namespace production

1.4 传统安装包部署

目录结构

user-management-system-1.0.0/
├── bin/
│   ├── auth-service
│   ├── user-service
│   ├── permission-service
│   └── gateway
├── config/
│   ├── application.yml
│   └── application-prod.yml
├── lib/
│   ├── *.jar
│   └── *.so
├── scripts/
│   ├── install.sh
│   ├── start.sh
│   ├── stop.sh
│   └── restart.sh
└── README.md

安装脚本(install.sh)

#!/bin/bash
# Installs the release into /opt/user-management-system and registers the
# user-ms systemd unit. Run it from the root of the unpacked release directory
# (the one containing bin/, config/, lib/ and scripts/).

set -e

echo "开始安装用户管理系统..."

# Require a Java runtime on PATH
if ! command -v java &> /dev/null; then
    echo "错误: 未检测到 Java 环境"
    exit 1
fi

# Require the mysql command on PATH
if ! command -v mysql &> /dev/null; then
    echo "错误: 未检测到 MySQL"
    exit 1
fi

# Create the dedicated system account (no login shell) if it does not exist yet
if ! id -u userms &> /dev/null; then
    echo "创建系统用户 userms..."
    useradd -r -s /bin/false userms
fi

# Create the install tree. scripts/ is included because the systemd unit
# written below runs start.sh and stop.sh from the install directory.
INSTALL_DIR="/opt/user-management-system"
echo "安装目录: $INSTALL_DIR"
mkdir -p "$INSTALL_DIR"/{bin,config,lib,logs,scripts}

# Copy the release payload
echo "复制文件..."
cp -r bin/* "$INSTALL_DIR/bin/"
cp -r config/* "$INSTALL_DIR/config/"
cp -r lib/* "$INSTALL_DIR/lib/"
cp -r scripts/* "$INSTALL_DIR/scripts/"

# Fix ownership and mark the installed binaries and scripts executable
# (the installed copies, not the ones in the release directory).
chown -R userms:userms "$INSTALL_DIR"
chmod +x "$INSTALL_DIR"/bin/*
chmod +x "$INSTALL_DIR"/scripts/*.sh

# Write the systemd unit. The heredoc delimiter is unquoted on purpose so that
# $INSTALL_DIR is expanded into the unit file.
cat > /etc/systemd/system/user-ms.service <<EOF
[Unit]
Description=User Management System
After=network.target mysql.service

[Service]
Type=forking
User=userms
WorkingDirectory=$INSTALL_DIR
ExecStart=$INSTALL_DIR/scripts/start.sh
ExecStop=$INSTALL_DIR/scripts/stop.sh
Restart=on-failure
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF

# Make systemd pick up the new unit
systemctl daemon-reload

echo "安装完成!"
echo "请修改配置文件 $INSTALL_DIR/config/application-prod.yml"
echo "启动服务: systemctl start user-ms"
echo "设置开机启动: systemctl enable user-ms"

启动脚本(start.sh)

#!/bin/bash
# Starts the four backend processes in the background, redirects each one's
# output to its own log file and records each PID in a .pid file.

INSTALL_DIR="/opt/user-management-system"
LOG_DIR="$INSTALL_DIR/logs"
CONFIG_FILE="$INSTALL_DIR/config/application-prod.yml"

# Abort instead of launching from the wrong working directory.
cd "$INSTALL_DIR" || { echo "错误: 无法进入目录 $INSTALL_DIR" >&2; exit 1; }

# Log and PID files are written here; create it in case it is missing.
mkdir -p "$LOG_DIR" || exit 1

echo "启动用户管理系统..."

# start_service NAME LABEL
#   NAME  - executable under bin/, also used for the .log and .pid file names
#   LABEL - human-readable name printed in the start-up message
# Launches the process detached with nohup and stores its PID.
start_service() {
    local name="$1"
    local label="$2"
    local pid

    nohup "$INSTALL_DIR/bin/$name" \
        --spring.config.location="$CONFIG_FILE" \
        > "$LOG_DIR/$name.log" 2>&1 &
    pid=$!

    echo "$pid" > "$LOG_DIR/$name.pid"
    echo "${label}启动 (PID: ${pid})"
}

start_service auth-service "认证服务"
start_service user-service "用户服务"
start_service permission-service "权限服务"
start_service gateway "网关"

echo "启动完成!"

2. 监控与告警

2.1 Prometheus 配置

prometheus.yml

# Scrape and evaluate rules every 15 seconds.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Alerts are sent to a single Alertmanager instance.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"

# Rule files are loaded from the alerts/ directory next to this file.
rule_files:
  - "alerts/*.yml"

scrape_configs:
  # Application services expose metrics on /metrics.
  - job_name: "user-ms-auth"
    metrics_path: "/metrics"
    static_configs:
      - targets:
          - "auth-service:8080"

  - job_name: "user-ms-user"
    metrics_path: "/metrics"
    static_configs:
      - targets:
          - "user-service:8080"

  - job_name: "user-ms-permission"
    metrics_path: "/metrics"
    static_configs:
      - targets:
          - "permission-service:8080"

  # Infrastructure exporters.
  - job_name: "mysql"
    static_configs:
      - targets:
          - "mysql-exporter:9104"

  - job_name: "redis"
    static_configs:
      - targets:
          - "redis-exporter:9121"

告警规则(alerts.yml)

groups:
  - name: user-ms-alerts
    interval: 30s
    rules:
      # High error rate: share of 5xx responses among all requests, per
      # instance. A bare rate() of 5xx requests is requests per second, not a
      # percentage, so it has to be divided by the total request rate.
      - alert: HighErrorRate
        expr: |
          sum by (instance) (rate(http_requests_total{status=~"5.."}[5m]))
            /
          sum by (instance) (rate(http_requests_total[5m]))
            > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "高错误率告警"
          description: "{{ $labels.instance }} 的错误率超过 5%"

      # High latency: P99 per instance. Buckets are summed by (le, instance) so
      # the quantile is computed over all of an instance's requests.
      - alert: HighResponseTime
        expr: |
          histogram_quantile(
            0.99,
            sum by (le, instance) (rate(http_request_duration_seconds_bucket[5m]))
          ) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "高响应时间告警"
          description: "{{ $labels.instance }} 的 P99 响应时间超过 500ms"

      # CPU usage: CPU seconds consumed per second by the process.
      - alert: HighCPUUsage
        expr: rate(process_cpu_seconds_total[5m]) > 0.7
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "高 CPU 使用率"
          description: "{{ $labels.instance }} 的 CPU 使用率超过 70%"

      # Heap usage: used heap as a fraction of the maximum heap.
      - alert: HighMemoryUsage
        expr: (jvm_memory_used_bytes{area="heap"} / jvm_memory_max_bytes{area="heap"}) > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "高内存使用率"
          description: "{{ $labels.instance }} 的堆内存使用率超过 80%"

      # Database connection pool: active connections as a fraction of the pool maximum.
      - alert: DatabaseConnectionPoolExhausted
        expr: hikaricp_connections_active / hikaricp_connections_max > 0.9
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "数据库连接池耗尽"
          description: "{{ $labels.instance }} 的数据库连接池使用率超过 90%"

      # Abnormally low number of online users.
      - alert: LowOnlineUsers
        expr: system_online_users < 10
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "在线用户数异常"
          description: "在线用户数低于 10,可能存在服务异常"

2.2 Grafana 仪表盘

核心指标面板

| 面板名称 | 指标 | 说明 |
| --- | --- | --- |
| 总用户数 | `system_total_users` | 系统总用户数 |
| 在线用户数 | `system_online_users` | 当前在线用户数 |
| 今日注册数 | `increase(user_register_total[1d])` | 今日注册用户数 |
| 今日登录数 | `increase(user_login_total[1d])` | 今日登录次数 |
| QPS | `rate(http_requests_total[1m])` | 每秒请求数 |
| 响应时间 (P99) | `histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))` | P99 响应时间 |
| 错误率 | `rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m])` | 错误率 |
| CPU 使用率 | `rate(process_cpu_seconds_total[5m])` | CPU 使用率 |
| 内存使用率 | `jvm_memory_used_bytes{area="heap"} / jvm_memory_max_bytes{area="heap"}` | 内存使用率 |

2.3 日志管理

日志配置(Logback)

<configuration>
    <!-- Line layout shared by both appenders. -->
    <property name="LOG_PATTERN"
              value="%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n" />

    <!-- Console output. -->
    <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
        <encoder>
            <pattern>${LOG_PATTERN}</pattern>
        </encoder>
    </appender>

    <!-- File output: rolls over daily, keeps 30 days of history,
         capped at 10GB in total. -->
    <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
        <file>logs/application.log</file>
        <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
            <fileNamePattern>logs/application.%d{yyyy-MM-dd}.log</fileNamePattern>
            <maxHistory>30</maxHistory>
            <totalSizeCap>10GB</totalSizeCap>
        </rollingPolicy>
        <encoder>
            <pattern>${LOG_PATTERN}</pattern>
        </encoder>
    </appender>

    <!-- Everything at INFO and above goes to both appenders. -->
    <root level="INFO">
        <appender-ref ref="CONSOLE" />
        <appender-ref ref="FILE" />
    </root>
</configuration>

ELK 集成

# Filebeat configuration: ship the application log files to Elasticsearch.
filebeat.inputs:
  - type: log
    enabled: true
    paths:
      - /opt/user-management-system/logs/*.log
    # Custom fields are stored under the "fields" key of each event
    # (fields_under_root is not set), i.e. as fields.app and fields.env.
    fields:
      app: user-management-system
      env: production

output.elasticsearch:
  hosts: ["elasticsearch:9200"]
  # Route events of this application to a daily user-ms-* index. The condition
  # must reference fields.app, which is where the custom field above ends up.
  # NOTE(review): a custom index name may be ignored while index lifecycle
  # management is enabled - confirm against the Filebeat version in use.
  indices:
    - index: "user-ms-%{+yyyy.MM.dd}"
      when.contains:
        fields.app: "user-management-system"

# Template name/pattern matching the custom index name above.
setup.template.name: "user-ms"
setup.template.pattern: "user-ms-*"

3. 运维操作

3.1 日常巡检

巡检清单

| 检查项 | 检查方法 | 正常值 | 异常处理 |
| --- | --- | --- | --- |
| 服务状态 | `systemctl status` | Active | 重启服务 |
| 磁盘空间 | `df -h` | 使用率 < 80% | 清理日志 |
| 内存使用 | `free -h` | 使用率 < 80% | 扩容或优化 |
| CPU 使用 | `top` | 使用率 < 70% | 扩容或优化 |
| 数据库连接 | `SHOW PROCESSLIST` | 连接数 < 100 | 优化连接池 |
| Redis 连接 | `redis-cli info clients` | 连接数正常 | 扩容 Redis |
| API 响应时间 | `curl -w @curl-format.txt` | < 500ms | 优化代码 |
| 错误日志 | `tail -f error.log` | 无新错误 | 排查问题 |

3.2 备份与恢复

数据库备份

#!/bin/bash
# backup-mysql.sh
# Dumps the application database to a gzip-compressed, timestamped file and
# deletes backups older than 7 days.

# pipefail makes the script fail when mysqldump fails, not only when gzip
# does, so a broken dump is never reported as a successful backup.
set -euo pipefail

BACKUP_DIR="/backup/mysql"
DATE=$(date +%Y%m%d_%H%M%S)
DB_NAME="user_management"
DB_USER="root"
DB_PASSWORD="your_password"

# Braces are required: "$DB_NAME_$DATE" would expand the (unset) variable
# DB_NAME_ and silently drop the database name from the file name.
BACKUP_FILE="${BACKUP_DIR}/${DB_NAME}_${DATE}.sql.gz"

mkdir -p "$BACKUP_DIR"

# Full dump. Remove the partial file if the dump fails.
# NOTE(review): a password passed with -p on the command line is visible in the
# process list; consider a client option file instead.
if ! mysqldump -u"$DB_USER" -p"$DB_PASSWORD" "$DB_NAME" | gzip > "$BACKUP_FILE"; then
    rm -f "$BACKUP_FILE"
    echo "备份失败: $BACKUP_FILE" >&2
    exit 1
fi

# Delete backups older than 7 days
find "$BACKUP_DIR" -name "*.sql.gz" -mtime +7 -delete

echo "备份完成: $BACKUP_FILE"

数据恢复

# 解压备份文件
gunzip user_management_20260310_120000.sql.gz

# 恢复数据库
mysql -u root -p user_management < user_management_20260310_120000.sql

3.3 版本升级

升级流程

# 1. Back up the database
./scripts/backup-mysql.sh

# 2. Stop the services
./scripts/stop.sh

# 3. Keep a copy of the current installation for rollback
# NOTE(review): if /opt/user-management-system.bak already exists, cp -r copies
# into it instead of replacing it - remove any stale .bak directory first.
cp -r /opt/user-management-system /opt/user-management-system.bak

# 4. Deploy the new version
# NOTE(review): presumably the archive unpacks to a versioned directory such as
# /opt/user-management-system-1.1.0/ (the package layout shown in this guide is
# versioned) rather than over the existing install directory - TODO confirm and
# adjust the target path if so.
unzip user-management-system-1.1.0.zip -d /opt/

# 5. Run the database migration
mysql -u root -p user_management < migration/1.1.0.sql

# 6. Start the services
./scripts/start.sh

# 7. Verify the health endpoints
curl http://localhost:8080/health
curl http://localhost:8080/health/live
curl http://localhost:8080/health/ready

回滚流程

# 1. Stop the services
./scripts/stop.sh

# 2. Remove the new version
rm -rf /opt/user-management-system

# 3. Restore the previous version
mv /opt/user-management-system.bak /opt/user-management-system

# 4. Restore the database. Backups under /backup/mysql are gzip-compressed
#    (*.sql.gz), so decompress on the fly; -c leaves the backup file intact.
gunzip -c /backup/mysql/user_management_20260310_120000.sql.gz | mysql -u root -p user_management

# 5. Start the services
./scripts/start.sh

3.4 故障排查

常见问题

| 问题 | 可能原因 | 排查方法 | 解决方案 |
| --- | --- | --- | --- |
| 服务启动失败 | 端口被占用 | `netstat -tunlp` | 修改端口或停止占用进程 |
| 数据库连接失败 | 网络问题 | `ping`、`telnet` | 检查网络和防火墙 |
| 响应慢 | 数据库查询慢 | 慢查询日志 | 优化 SQL、加索引 |
| 内存溢出 | 内存泄漏 | `jmap -heap` | 优化代码、扩容 |
| 登录失败 | 验证码过期 | 检查 Redis | 调整验证码有效期 |

4. 性能优化

4.1 数据库优化

索引优化

-- 查看慢查询
SHOW VARIABLES LIKE 'slow_query%';
SHOW VARIABLES LIKE 'long_query_time';

-- 分析慢查询
EXPLAIN SELECT * FROM users WHERE username = 'john_doe';

-- 添加索引
CREATE INDEX idx_username ON users(username);
CREATE INDEX idx_email ON users(email);
CREATE INDEX idx_phone ON users(phone);

查询优化

-- Use a covering index: the query reads only id, username and email.
-- NOTE(review): it is only "covered" if an index containing status, username
-- and email exists (e.g. on users(status, username, email)); none of the
-- single-column indexes in this guide covers it - confirm the schema.
SELECT id, username, email FROM users WHERE status = 1;

-- Avoid SELECT *: list only the columns that are needed
SELECT id, username FROM users WHERE id = ?;

-- Paginate with LIMIT (explicit column list, consistent with the advice above)
SELECT id, username, email FROM users ORDER BY id LIMIT 20 OFFSET 0;

4.2 Redis 优化

缓存策略

cache:
  # 用户信息缓存
  user_info:
    ttl: 3600  # 1 小时
    max_size: 10000

  # 权限信息缓存
  user_permissions:
    ttl: 1800  # 30 分钟
    max_size: 5000

  # Token 黑名单
  token_blacklist:
    ttl: 2592000  # 30 天
    max_size: 50000

Redis 配置

# redis.conf
# Cap memory usage at 2 GB.
maxmemory 2gb
# When the cap is reached, evict least-recently-used keys from the whole keyspace.
# NOTE(review): allkeys-lru can evict any key. If the token blacklist from the
# cache strategy is stored in this instance, evicted entries would presumably
# make revoked tokens usable again - confirm, or keep the blacklist elsewhere.
maxmemory-policy allkeys-lru
# RDB snapshots: after 900 s if >= 1 key changed, after 300 s if >= 10 keys
# changed, after 60 s if >= 10000 keys changed.
save 900 1
save 300 10
save 60 10000

4.3 应用优化

JVM 参数优化

# JVM options must come before -jar: everything after the jar path is passed
# to the application as program arguments and is not seen by the JVM.
java \
  -Xms512m \
  -Xmx2g \
  -XX:+UseG1GC \
  -XX:MaxGCPauseMillis=200 \
  -XX:+HeapDumpOnOutOfMemoryError \
  -XX:HeapDumpPath=/opt/logs/heap_dump.hprof \
  -jar app.jar

连接池优化

datasource:
  hikari:
    maximum-pool-size: 50
    minimum-idle: 10
    connection-timeout: 30000
    idle-timeout: 600000
    max-lifetime: 1800000

5. 安全加固

5.1 防火墙配置

# 只开放必要端口
firewall-cmd --permanent --add-port=80/tcp
firewall-cmd --permanent --add-port=443/tcp
firewall-cmd --permanent --add-port=22/tcp
firewall-cmd --reload

# 限制数据库访问
firewall-cmd --permanent --add-rich-rule='rule family="ipv4" source address="10.0.0.0/8" port port="3306" protocol="tcp" accept'
firewall-cmd --reload

5.2 SSL/TLS 配置

server {
    listen 443 ssl http2;
    server_name api.example.com;

    ssl_certificate /path/to/cert.pem;
    ssl_certificate_key /path/to/key.pem;

    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers HIGH:!aNULL:!MD5;
    ssl_prefer_server_ciphers on;

    ssl_session_cache shared:SSL:10m;
    ssl_session_timeout 10m;

    add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
}

6. 监控告警联系人

| 级别 | 联系人 | 通知方式 |
| --- | --- | --- |
| Critical | 运维团队 | 电话 + 短信 + 邮件 |
| Warning | 开发团队 | 邮件 + 钉钉/企业微信 |
| Info | 项目经理 | 邮件 |

本文档持续更新中,如有疑问请联系运维团队。