Shell 脚本自动化运维:从入门到实用
Shell 脚本是运维工程师的瑞士军刀,本文整理最实用的脚本技巧和模板。
脚本规范
#!/usr/bin/env bash
set -euo pipefail # exit on error, error on unset variables, fail a pipeline when any stage fails
# Script description
# Usage: ./script.sh [args]

# Assign first, then mark readonly: `readonly VAR="$(cmd)"` reports the exit
# status of `readonly`, hiding a failed command substitution.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LOG_FILE="/var/log/ops/$(date +%Y%m%d).log"
readonly SCRIPT_DIR LOG_FILE

# Best effort: the log directory may not exist yet, or may not be creatable
# by the current user. Logging must never be the reason the script dies.
mkdir -p "$(dirname "$LOG_FILE")" 2>/dev/null || true

#######################################
# Print a timestamped message and append it to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: message words, joined with spaces
# Outputs:   the formatted line on stdout; the same line appended to LOG_FILE
#            when the file is writable (silently skipped otherwise)
#######################################
log() {
  local line
  line="[$(date '+%Y-%m-%d %H:%M:%S')] $*"
  printf '%s\n' "$line"
  # Grouped so that a failing redirection is silenced as well.
  { printf '%s\n' "$line" >>"$LOG_FILE"; } 2>/dev/null || true
}

#######################################
# Log an error and terminate the script with status 1.
# Arguments: message words
# Outputs:   the "ERROR: ..." line on stderr (and in LOG_FILE via log)
#######################################
die() {
  log "ERROR: $*" >&2
  exit 1
}
磁盘使用率告警
#!/usr/bin/env bash
# Disk usage alert: strict mode and settings.
set -o errexit -o nounset -o pipefail

# Usage percentage a mount point must exceed to be reported.
THRESHOLD=85
# Webhook URL read from the FEISHU_WEBHOOK environment variable; empty when
# the variable is unset or empty.
WEBHOOK=${FEISHU_WEBHOOK:-}
#######################################
# Report every mounted filesystem whose usage exceeds THRESHOLD.
# Globals:   THRESHOLD (read) - usage percentage that triggers a warning
#            WEBHOOK   (read) - webhook URL; no request is sent when empty
# Arguments: none
# Outputs:   one "WARN: <mount> usage <n>%" line per offending mount on
#            stdout; webhook delivery problems are reported on stderr
# Returns:   0 (a failed webhook call does not abort the scan)
#######################################
check_disk() {
  local host fs pcent mount usage json_mount msg payload
  # Loop-invariant, so resolve it once instead of once per filesystem.
  host=$(hostname)

  # `df -P` prints exactly one line per filesystem with fixed columns
  # (`df -h` may wrap long device names onto two lines). The header is
  # dropped by position, so it is skipped in any locale. The last variable
  # of `read` receives the rest of the line, which keeps mount points that
  # contain spaces intact.
  while read -r fs _ _ _ pcent mount; do
    if [[ "$fs" == tmpfs* || "$fs" == udev* ]]; then
      continue
    fi

    usage=${pcent%\%}
    # Some pseudo filesystems report "-" instead of a percentage.
    if [[ ! "$usage" =~ ^[0-9]+$ ]]; then
      continue
    fi
    if ((usage <= THRESHOLD)); then
      continue
    fi

    if [[ -n "$WEBHOOK" ]]; then
      # Escape backslashes and double quotes so the mount point cannot
      # break the JSON document. The literal \n sequences in msg are JSON
      # newline escapes and are intentionally left as they are.
      json_mount=${mount//\\/\\\\}
      json_mount=${json_mount//\"/\\\"}
      msg="⚠️ 磁盘告警\n主机: ${host}\n挂载点: ${json_mount}\n使用率: ${usage}%"
      payload="{\"msg_type\":\"text\",\"content\":{\"text\":\"${msg}\"}}"
      # The response body is discarded so it does not mix with the WARN
      # lines; a failed or hung request must not stop the remaining checks.
      curl -sS --max-time 10 -X POST "$WEBHOOK" \
        -H 'Content-Type: application/json' \
        -d "$payload" >/dev/null \
        || echo "WARN: webhook notification failed for ${mount}" >&2
    fi
    echo "WARN: ${mount} usage ${usage}%"
  done < <(df -P | tail -n +2)
}
# Entry point: run the disk check once.
check_disk
批量 SSH 执行命令
#!/usr/bin/env bash
# Batch SSH execution: strict mode and settings.
set -o errexit -o nounset -o pipefail

# $1 - file with the target hosts (default: hosts.txt)
HOSTS_FILE=${1:-hosts.txt}
# $2 - command to run on every host (default: uptime)
COMMAND=${2:-uptime}
# Upper bound on concurrently running jobs
MAX_PARALLEL=10
#######################################
# Run COMMAND on a single host over SSH and print a one-line result.
# Globals:   COMMAND (read) - command line executed on the remote host
# Arguments: $1 - host to connect to
# Outputs:   "[OK] <host>: <output>" or "[FAIL] <host>: <output>" on stdout;
#            the remote stderr is folded into <output>
# Returns:   0 in both cases, so one unreachable host does not stop a batch
#######################################
run_on_host() {
  local host="$1"
  local result
  # -n             do not read stdin, so ssh cannot swallow the caller's input
  # BatchMode=yes  fail instead of prompting for a password or passphrase;
  #                prompts from several parallel sessions would block the run
  # NOTE(review): StrictHostKeyChecking=no accepts unknown and changed host
  # keys without verification; kept as in the original, but consider
  # pre-populating known_hosts instead.
  if result=$(ssh -n \
    -o BatchMode=yes \
    -o ConnectTimeout=5 \
    -o StrictHostKeyChecking=no \
    "$host" "$COMMAND" 2>&1); then
    echo "[OK] ${host}: ${result}"
  else
    echo "[FAIL] ${host}: ${result}"
  fi
}
# The bash processes started by xargs only see exported names, so both the
# command string and the function are exported.
export COMMAND
export -f run_on_host

# Parallel execution: xargs reads HOSTS_FILE and starts one bash per input
# line, at most MAX_PARALLEL at a time; the line is passed to run_on_host as
# its first argument.
xargs -I '{}' -P "$MAX_PARALLEL" \
  bash -c 'run_on_host "$1"' _ '{}' <"$HOSTS_FILE"
日志分析脚本
#!/usr/bin/env bash
# Analyze an Nginx access log: top 10 client IPs, top 10 URLs, HTTP status
# distribution and the most recent 5xx responses.
# Usage: ./script.sh [log_file] [lines]
#   log_file  access log to read (default: /var/log/nginx/access.log)
#   lines     number of trailing lines to analyze (default: 100000)
# The field positions used below ($1 client, $7 URL, $9 status) assume
# whitespace-separated fields laid out like Nginx's "combined" format.
set -euo pipefail

LOG_FILE="${1:-/var/log/nginx/access.log}"
# Deliberately not named LINES: bash maintains LINES (terminal height) itself
# and may overwrite it after an external command when checkwinsize is on,
# which would silently shrink the analysis window after the first report.
TAIL_LINES="${2:-100000}" # analyze the last N lines

if [[ ! -r "$LOG_FILE" ]]; then
  echo "无法读取日志文件: ${LOG_FILE}" >&2
  exit 1
fi

# Print the trailing window of the log shared by the ranking reports.
recent_lines() {
  tail -n "$TAIL_LINES" -- "$LOG_FILE"
}

# Count identical lines read from stdin, most frequent first.
rank() {
  sort | uniq -c | sort -rn
}

# Keep the first 10 lines of stdin. Unlike `head`, awk reads its input to the
# end, so the upstream `sort` never gets SIGPIPE and `pipefail` stays quiet.
top10() {
  awk 'NR <= 10'
}

echo "=== Top 10 访问 IP ==="
recent_lines | awk '{print $1}' | rank | top10
echo ""

echo "=== Top 10 访问 URL ==="
# The query string is cut off so the same path is counted once.
recent_lines | awk '{print $7}' | cut -d'?' -f1 | rank | top10
echo ""

echo "=== HTTP 状态码分布 ==="
recent_lines | awk '{print $9}' | rank
echo ""

echo "=== 最近 5xx 错误 ==="
# Match on the status field itself; a bare ' 5xx ' pattern would also hit
# other numeric fields such as a 3-digit response size.
awk '$9 ~ /^5[0-9][0-9]$/' "$LOG_FILE" | tail -n 20
服务健康检查
#!/usr/bin/env bash
# Service health check: strict mode and the list of targets.
set -o errexit -o nounset -o pipefail

# One "name:port" entry per service to probe.
services=("nginx:80" "mysql:3306" "redis:6379")
#######################################
# Probe one local TCP port with nc and print the outcome.
# Arguments: $1 - target in "name:port" form
# Outputs:   a status line for the service on stdout
# Returns:   0 when nc reports success, 1 otherwise
#######################################
check_port() {
  local spec="$1"
  # Text before the first ':' is the name, text after the last ':' the port.
  local service="${spec%%:*}" port="${spec##*:}"

  # Guard clause: handle the failure first, fall through on success.
  if ! nc -z -w3 localhost "$port" 2>/dev/null; then
    echo "✗ ${service}:${port} 异常!"
    return 1
  fi
  echo "✓ ${service}:${port} 正常"
}
# Probe every configured service and count the failures.
failed=0
for svc in "${services[@]}"; do
  if ! check_port "$svc"; then
    # Plain arithmetic assignment on purpose: `((failed++))` evaluates to 0
    # on the first failure, returns status 1, and under `set -e` that would
    # end the script before the remaining services are checked.
    failed=$((failed + 1))
  fi
done

# A real if/else: with `A && B || C`, C would also run whenever B fails.
if ((failed == 0)); then
  echo "所有服务正常"
else
  echo "${failed} 个服务异常"
fi

# The exit status is the number of failed services; 0 means all healthy.
exit "$failed"
自动备份脚本
#!/usr/bin/env bash
# Dump one MySQL database into a gzip-compressed file and delete old backups.
# Usage: ./script.sh <database_name>
# Environment: MYSQL_PASSWORD - password used for the MySQL root account
set -euo pipefail

DB_NAME="${1:?请指定数据库名}"
# Fail up front with a clear message rather than a bare "unbound variable"
# in the middle of the dump pipeline.
: "${MYSQL_PASSWORD?请设置环境变量 MYSQL_PASSWORD}"
BACKUP_DIR="/backup/mysql"
RETENTION_DAYS=7
DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="${BACKUP_DIR}/${DB_NAME}_${DATE}.sql.gz"
# The dump is written here first and renamed only after it succeeded, so a
# failed run never leaves a truncated file that looks like a valid backup.
PARTIAL_FILE="${BACKUP_FILE}.partial"

# Remove a half-written dump on every exit path; after a successful rename
# the file no longer exists and this is a no-op.
cleanup() {
  rm -f -- "$PARTIAL_FILE"
}
trap cleanup EXIT

mkdir -p "$BACKUP_DIR"

# Backup.
# The password is handed over through the MYSQL_PWD environment variable
# instead of `-p<password>`, which would expose it in the process list.
# NOTE(review): MYSQL_PWD is deprecated in MySQL 8.0; a client option file
# (--defaults-extra-file) with restrictive permissions is the long-term fix.
MYSQL_PWD="$MYSQL_PASSWORD" mysqldump --single-transaction --quick \
  -u root \
  "$DB_NAME" | gzip >"$PARTIAL_FILE"
mv -- "$PARTIAL_FILE" "$BACKUP_FILE"
echo "备份完成: ${BACKUP_FILE} ($(du -sh "$BACKUP_FILE" | cut -f1))"

# Remove old backups. This matches every *.sql.gz file in BACKUP_DIR, not
# only those of DB_NAME; `-type f` keeps directories out of the match.
find "$BACKUP_DIR" -type f -name "*.sql.gz" \
  -mtime "+${RETENTION_DAYS}" -delete
echo "已清理 ${RETENTION_DAYS} 天前的备份"
定时任务管理
# Show the current crontab
crontab -l
# Edit the crontab
crontab -e
# Common cron expressions
# (fields: minute hour day-of-month month day-of-week command)
# Run the backup every day at 02:00; stdout and stderr are appended to a log
0 2 * * * /opt/scripts/backup.sh >> /var/log/backup.log 2>&1
# Check the services every 5 minutes
*/5 * * * * /opt/scripts/health-check.sh
# Send the report every Monday at 09:00
0 9 * * 1 /opt/scripts/weekly-report.sh
好的 Shell 脚本应该像好的代码一样:有注释、有错误处理、可复用、易维护。