Shell 脚本自动化运维:从入门到实用

Shell 脚本是运维工程师的瑞士军刀,本文整理最实用的脚本技巧和模板。

脚本规范

#!/usr/bin/env bash
set -euo pipefail   # exit on error, error on unset variables, fail a pipeline if any stage fails

# Script description
# Usage: ./script.sh [args]

# Directory containing this script. Assigned separately from `readonly` so a
# failing command substitution is not masked by readonly's own exit status.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
# One log file per calendar day.
LOG_FILE="/var/log/ops/$(date +%Y%m%d).log"
readonly LOG_FILE

# log MESSAGE...
# Print a timestamped line to stdout and append the same line to LOG_FILE.
# The append is best-effort: the log directory is created on demand, and if
# the file still cannot be written a warning goes to stderr instead of
# aborting the caller (which `set -e` + `pipefail` would otherwise do).
log() {
    local line
    line="[$(date '+%Y-%m-%d %H:%M:%S')] $*"
    printf '%s\n' "$line"
    if ! { mkdir -p -- "${LOG_FILE%/*}" && printf '%s\n' "$line" >> "$LOG_FILE"; } 2>/dev/null; then
        printf 'log: cannot write to %s\n' "$LOG_FILE" >&2
    fi
}

# die MESSAGE...
# Log MESSAGE as an error (console copy goes to stderr) and exit with status 1.
die() {
    log "ERROR: $*" >&2
    exit 1
}

磁盘使用率告警

#!/usr/bin/env bash
set -euo pipefail

# Warn about every filesystem whose usage exceeds THRESHOLD percent.
# Environment:
#   THRESHOLD       usage percentage above which to alert (default: 85)
#   FEISHU_WEBHOOK  webhook URL; when empty only the local WARN line is printed
THRESHOLD="${THRESHOLD:-85}"
WEBHOOK="${FEISHU_WEBHOOK:-}"

# json_escape STRING
# Print STRING with backslashes and double quotes escaped so it can be
# embedded inside a JSON string literal.
json_escape() {
    local s=$1
    s=${s//\\/\\\\}
    s=${s//\"/\\\"}
    printf '%s' "$s"
}

# check_disk
# Read `df -P` and, for each filesystem above THRESHOLD, print a WARN line to
# stdout and (when WEBHOOK is set) POST a text message to the webhook.
check_disk() {
    local host usage mount msg payload
    host=${HOSTNAME:-$(hostname)}

    # `df -P` guarantees one line per filesystem with fixed columns, so the
    # fifth field is always the capacity and everything after it is the mount
    # point (which may contain spaces).
    while read -r _ _ _ _ usage mount; do
        usage=${usage%\%}
        # Pseudo filesystems can report "-" instead of a percentage.
        [[ "$usage" =~ ^[0-9]+$ ]] || continue
        (( usage > THRESHOLD )) || continue

        if [[ -n "$WEBHOOK" ]]; then
            # The \n sequences are left literal on purpose: they become
            # newlines once the receiver parses the JSON string.
            msg="⚠️ 磁盘告警\n主机: ${host}\n挂载点: $(json_escape "$mount")\n使用率: ${usage}%"
            payload="{\"msg_type\":\"text\",\"content\":{\"text\":\"${msg}\"}}"
            # A failed notification must not stop the remaining filesystems
            # from being checked.
            curl -s --max-time 10 -X POST "$WEBHOOK" \
                -H 'Content-Type: application/json' \
                -d "$payload" \
                || printf 'WARN: failed to send alert for %s\n' "$mount" >&2
        fi
        echo "WARN: ${mount} usage ${usage}%"
    done < <(df -P | awk 'NR > 1 && $1 !~ /^(tmpfs|udev)/')
}

check_disk

批量 SSH 执行命令

#!/usr/bin/env bash
set -euo pipefail

# Run one command on many hosts in parallel over SSH.
# Usage: ./script.sh [hosts_file] [command]
#   hosts_file  one host per line; blank lines and lines whose first
#               non-blank character is '#' are skipped (default: hosts.txt)
#   command     command line executed on every host (default: uptime)

HOSTS_FILE="${1:-hosts.txt}"
COMMAND="${2:-uptime}"
MAX_PARALLEL=10

if [[ ! -r "$HOSTS_FILE" ]]; then
    printf 'ERROR: cannot read hosts file: %s\n' "$HOSTS_FILE" >&2
    exit 1
fi

# run_on_host HOST
# Run $COMMAND on HOST and print one "[OK] host: output" or
# "[FAIL] host: output" line. Always returns 0 so one bad host does not stop
# the others.
run_on_host() {
    local host="$1"
    local result
    # BatchMode=yes makes ssh fail instead of prompting for a password, which
    # would otherwise hang a worker forever. `--` stops a host entry that
    # begins with '-' from being parsed as an ssh option.
    # NOTE(review): StrictHostKeyChecking=no disables host key verification;
    # acceptable only on a trusted network.
    if result=$(ssh -n -o BatchMode=yes -o ConnectTimeout=5 \
                    -o StrictHostKeyChecking=no \
                    -- "$host" "$COMMAND" 2>&1); then
        echo "[OK] ${host}: ${result}"
    else
        echo "[FAIL] ${host}: ${result}"
    fi
}

# The function and COMMAND must be exported because xargs runs each job in a
# fresh `bash -c` process.
export -f run_on_host
export COMMAND

# Run in parallel, at most MAX_PARALLEL hosts at a time. awk drops blank and
# comment lines and passes only the first word of each remaining line.
awk 'NF && $1 !~ /^#/ { print $1 }' "$HOSTS_FILE" \
    | xargs -P "$MAX_PARALLEL" -I{} bash -c 'run_on_host "$@"' _ {}

日志分析脚本

#!/usr/bin/env bash
# Analyze an Nginx access log: top 10 client IPs, top 10 URLs, status code
# distribution and the most recent 5xx responses.
# Usage: ./script.sh [log_file] [num_lines]
#
# Field positions ($1 client, $7 request path, $9 status) presumably match the
# default "combined" log format; adjust them for a custom log_format.
# `set -o pipefail` is deliberately not enabled: `sort | head -10` can end the
# sort stage with SIGPIPE, which pipefail would report as a failure.

LOG_FILE="${1:-/var/log/nginx/access.log}"
# Not named LINES: bash reserves LINES for the terminal height and, with
# checkwinsize enabled, may overwrite it after an external command finishes.
NUM_LINES="${2:-100000}"   # analyze the last N lines

# recent_lines
# Print the last NUM_LINES lines of LOG_FILE.
recent_lines() {
    tail -n "$NUM_LINES" -- "$LOG_FILE"
}

# count_field N
# Read lines on stdin and print every distinct value of whitespace-separated
# field N prefixed by its count, most frequent first.
count_field() {
    awk -v field="$1" '{ print $field }' | sort | uniq -c | sort -rn
}

# main
# Print the four report sections to stdout. Returns 1 if LOG_FILE is unreadable.
main() {
    if [[ ! -r "$LOG_FILE" ]]; then
        printf 'ERROR: cannot read log file: %s\n' "$LOG_FILE" >&2
        return 1
    fi

    echo "=== Top 10 访问 IP ==="
    recent_lines | count_field 1 | head -10

    echo ""
    echo "=== Top 10 访问 URL ==="
    # Strip the query string so /a?x=1 and /a?x=2 are counted together.
    recent_lines \
        | awk '{ sub(/\?.*/, "", $7); print $7 }' \
        | count_field 1 \
        | head -10

    echo ""
    echo "=== HTTP 状态码分布 ==="
    recent_lines | count_field 9

    echo ""
    echo "=== 最近 5xx 错误 ==="
    # Match on the status field only; a plain grep for " 5xx " would also hit
    # other fields such as a response size of 500 bytes.
    awk '$9 ~ /^5[0-9][0-9]$/' "$LOG_FILE" | tail -20
}

main "$@"

服务健康检查

#!/usr/bin/env bash
set -euo pipefail

# Probe a fixed list of local TCP ports and report which services answer.
# Exit status: the number of services that failed the probe (0 = all healthy),
# or 127 when the nc tool itself is missing.

# Entries are "name:port".
services=(
    "nginx:80"
    "mysql:3306"
    "redis:6379"
)

# Without this check a missing nc would be hidden by the 2>/dev/null below and
# every service would be reported as down.
if ! command -v nc >/dev/null; then
    printf 'ERROR: nc not found in PATH\n' >&2
    exit 127
fi

# check_port NAME:PORT
# Try a TCP connect to localhost:PORT with a 3 second timeout.
# Prints a status line; returns 0 when the port accepts a connection, else 1.
check_port() {
    local service="${1%%:*}"
    local port="${1##*:}"
    if nc -z -w3 localhost "$port" 2>/dev/null; then
        echo "✓ ${service}:${port} 正常"
    else
        echo "✗ ${service}:${port} 异常!"
        return 1
    fi
}

failed=0
for svc in "${services[@]}"; do
    # Not `check_port || ((failed++))`: the post-increment evaluates to 0 on
    # the first failure, so (( )) returns 1 and `set -e` would end the script
    # before the remaining services are checked.
    if ! check_port "$svc"; then
        failed=$((failed + 1))
    fi
done

if (( failed == 0 )); then
    echo "所有服务正常"
else
    echo "${failed} 个服务异常"
fi
exit "$failed"

自动备份脚本

#!/usr/bin/env bash
set -euo pipefail

# Dump one MySQL database to a gzip-compressed file and prune old backups.
# Usage: MYSQL_PASSWORD=... ./script.sh <db_name>
# Environment:
#   MYSQL_PASSWORD  password of the MySQL root account (required)

DB_NAME="${1:?请指定数据库名}"
: "${MYSQL_PASSWORD:?请设置环境变量 MYSQL_PASSWORD}"
BACKUP_DIR="/backup/mysql"
RETENTION_DAYS=7
DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="${BACKUP_DIR}/${DB_NAME}_${DATE}.sql.gz"
# The dump is written here first, so an interrupted run never leaves a
# truncated file that looks like a valid *.sql.gz backup.
TMP_FILE="${BACKUP_FILE}.partial"

mkdir -p "$BACKUP_DIR"

# Remove the partial file on every exit path; after a successful mv it no
# longer exists and rm -f is a no-op.
trap 'rm -f -- "$TMP_FILE"' EXIT

# Escape backslashes and double quotes for the quoted option-file value.
escaped_password=${MYSQL_PASSWORD//\\/\\\\}
escaped_password=${escaped_password//\"/\\\"}

# Back up. Credentials are passed through an option file on a process
# substitution instead of -p"...", which would expose the password in the
# process list. --defaults-extra-file must be the first option.
mysqldump \
    --defaults-extra-file=<(printf '[client]\nuser=root\npassword="%s"\n' "$escaped_password") \
    --single-transaction --quick \
    "$DB_NAME" | gzip > "$TMP_FILE"

mv -- "$TMP_FILE" "$BACKUP_FILE"

echo "备份完成: ${BACKUP_FILE} ($(du -sh "$BACKUP_FILE" | cut -f1))"

# Prune old backups. This matches every *.sql.gz under BACKUP_DIR, not only
# those of DB_NAME.
find "$BACKUP_DIR" -type f -name "*.sql.gz" \
    -mtime "+${RETENTION_DAYS}" -delete

echo "已清理 ${RETENTION_DAYS} 天前的备份"

定时任务管理

# Show the current crontab
crontab -l

# Edit the crontab
crontab -e

# Common cron expressions
# Back up every day at 02:00; stdout and stderr are appended to the backup log
0 2 * * * /opt/scripts/backup.sh >> /var/log/backup.log 2>&1

# Check services every 5 minutes
*/5 * * * * /opt/scripts/health-check.sh

# Send the report every Monday at 09:00
0 9 * * 1 /opt/scripts/weekly-report.sh

好的 Shell 脚本应该和其他高质量的代码一样:有注释、有错误处理、可复用、易维护。

← 返回文章列表