gabriel / musehub public
cloudwatch-alerts.sh bash
228 lines 10.6 KB
Raw
sha256:0997d6250ae6476362f6fe2025af7789f46d03df3e9f34356d5e8ee79b201923 fix(issues): use issue number as pagination cursor, not cre… Sonnet 4.6 patch 8 days ago
1 #!/usr/bin/env bash
2 # MuseHub — CloudWatch log group, metric filters, alarms, and SNS alerting.
3 #
4 # Run once during initial provisioning; safe to re-run (put-metric-alarm is
5 # idempotent, log groups and subscriptions are created only if absent).
6 #
7 # Prerequisites:
8 # * AWS CLI configured with permissions for logs:*, cloudwatch:*, sns:*
9 # * ALERT_EMAIL and ALERT_PHONE set in the environment (or in .env)
10 # * The EC2 instance must have the CloudWatch Agent installed and configured
11 # to emit disk and memory metrics (see deploy/setup-ec2.sh)
12 #
13 # Usage:
14 # bash deploy/cloudwatch-alerts.sh [--region eu-west-1] [--instance-id i-xxx] [--db-instance musehub]
15
16 set -euo pipefail
17
18 # ── Config ────────────────────────────────────────────────────────────────────
19
# Environment-overridable settings: an unset or empty variable falls back to
# the default shown. AWS_REGION, INSTANCE_ID and DB_INSTANCE can additionally
# be overridden by command-line flags further down.
: "${AWS_REGION:=eu-west-1}"
: "${INSTANCE_ID:=}"          # EC2 instance ID for the disk alarm; empty = skip it
: "${DB_INSTANCE:=musehub}"   # RDS identifier (if using RDS)
: "${ALERT_EMAIL:=}"          # e.g. ops@example.com; empty = no email subscription
: "${ALERT_PHONE:=}"          # E.164 format, e.g. +15551234567; empty = no SMS

# Fixed resource names and retention.
LOG_GROUP="/musehub/app"
LOG_RETENTION_DAYS=30         # hot retention: 30 days in CloudWatch
SNS_TOPIC_NAME="musehub-alerts"

# Alarm thresholds — aligned with the pre-launch checklist.
THRESHOLD_5XX_RATE="1"           # percent: alarm when 5xx / total > 1%
THRESHOLD_P99_LATENCY_MS="2000"  # milliseconds
THRESHOLD_DISK_PCT="80"          # percent used
THRESHOLD_DB_CONN_PCT="90"       # percent of max_connections
34
35 # ── Helpers ───────────────────────────────────────────────────────────────────
36
# Print a progress message on stdout, tagged with the script's prefix.
# All arguments are joined into a single message.
log() {
  printf '[cloudwatch] %s\n' "$*"
}

# Print an error message on stderr and terminate the script with status 1.
die() {
  printf '[cloudwatch] ERROR: %s\n' "$*" >&2
  exit 1
}
39
# Run the AWS CLI pinned to the configured region.
# Arguments: the service, operation and options to hand to `aws`.
# Globals:   AWS_REGION (read at call time, so --region overrides apply).
# Returns:   the exit status of the aws invocation.
aws_cmd() {
  local -a cli_args=(--region "$AWS_REGION" "$@")
  aws "${cli_args[@]}"
}
43
44 # ── Parse args ────────────────────────────────────────────────────────────────
45
# Apply command-line overrides to the configuration globals.
#
# Recognised options (each takes exactly one value):
#   --region       sets AWS_REGION
#   --instance-id  sets INSTANCE_ID
#   --db-instance  sets DB_INSTANCE
#
# Exits through die() on an unknown argument or when an option is given
# without a value. Without the explicit check, `set -u` would abort with a
# bare "$2: unbound variable", which does not tell the operator what to fix.
# An explicitly empty value (e.g. --instance-id "") is still accepted.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --region | --instance-id | --db-instance)
        [[ $# -ge 2 ]] || die "$1 requires a value"
        case "$1" in
          --region) AWS_REGION="$2" ;;
          --instance-id) INSTANCE_ID="$2" ;;
          --db-instance) DB_INSTANCE="$2" ;;
        esac
        shift 2
        ;;
      *)
        die "Unknown argument: $1"
        ;;
    esac
  done
}

# The script's own positional parameters are left untouched; only the
# function's copy is consumed.
parse_args "$@"
54
55 # ── Step 1: Log group + 30-day hot retention ─────────────────────────────────
56
log "[1/6] Creating log group $LOG_GROUP (retention: ${LOG_RETENTION_DAYS}d hot)..."

# create-log-group fails when the group already exists. Only that specific
# failure is tolerated so the script stays safe to re-run; anything else
# (missing permissions, bad credentials, invalid name, ...) is reported
# instead of being discarded with the rest of stderr.
if ! create_log_group_output=$(aws_cmd logs create-log-group \
  --log-group-name "$LOG_GROUP" 2>&1); then
  case "$create_log_group_output" in
    *ResourceAlreadyExistsException*)
      log "Log group $LOG_GROUP already exists; reusing it."
      ;;
    *)
      die "create-log-group failed for $LOG_GROUP: $create_log_group_output"
      ;;
  esac
fi

# Applied on every run so a changed LOG_RETENTION_DAYS takes effect on an
# existing group as well.
aws_cmd logs put-retention-policy \
  --log-group-name "$LOG_GROUP" \
  --retention-in-days "$LOG_RETENTION_DAYS"

# Cold storage is not provisioned here; these lines are operator guidance only.
log "Log group ready. Cold storage (1-year): configure S3 export subscription"
log " aws logs put-subscription-filter → Kinesis Firehose → S3 bucket"
log " S3 lifecycle: transition to Glacier after 30d, expire after 365d"
69
70 # ── Step 2: SNS topic + subscriptions ────────────────────────────────────────
71
log "[2/6] Creating SNS topic $SNS_TOPIC_NAME..."

# The topic ARN is reused as the alarm/OK action of every alarm created later.
SNS_ARN=$(aws_cmd sns create-topic --name "$SNS_TOPIC_NAME" --query TopicArn --output text)

log "SNS topic ARN: $SNS_ARN"

# Subscribe one endpoint to the alert topic.
# Arguments: $1 - SNS protocol (email, sms), $2 - endpoint for that protocol.
# The subscription ARN printed by the CLI is discarded.
subscribe_alert_endpoint() {
  local protocol=$1
  local endpoint=$2
  aws_cmd sns subscribe \
    --topic-arn "$SNS_ARN" \
    --protocol "$protocol" \
    --notification-endpoint "$endpoint" \
    --query SubscriptionArn --output text > /dev/null
}

# Each channel is optional: it is only subscribed when its variable is set.
if [[ -n "$ALERT_EMAIL" ]]; then
  subscribe_alert_endpoint email "$ALERT_EMAIL"
  log " Email subscription created for $ALERT_EMAIL (confirm the email)"
fi

if [[ -n "$ALERT_PHONE" ]]; then
  subscribe_alert_endpoint sms "$ALERT_PHONE"
  log " SMS subscription created for $ALERT_PHONE"
fi
97
98 # ── Step 3: Metric filters (extract from structured JSON logs) ─────────────────
99
log "[3/6] Creating metric filters on $LOG_GROUP..."

# Create or overwrite one metric filter on LOG_GROUP that publishes into the
# MuseHub namespace with a default value of 0.
# Arguments:
#   $1 - filter name
#   $2 - filter pattern, matched against the JSON log records
#   $3 - name of the metric to publish
#   $4 - value published per matching record (a literal or a $.field selector)
put_musehub_metric_filter() {
  local filter_name=$1
  local pattern=$2
  local metric_name=$3
  local metric_value=$4
  aws_cmd logs put-metric-filter \
    --log-group-name "$LOG_GROUP" \
    --filter-name "$filter_name" \
    --filter-pattern "$pattern" \
    --metric-transformations \
    "metricName=${metric_name},metricNamespace=MuseHub,metricValue=${metric_value},defaultValue=0"
}

# 5xx response count — one per record whose $.status is >= 500.
put_musehub_metric_filter "musehub-5xx-count" '{ $.status >= 500 }' "5xxCount" "1"

# Total request count — one per record whose $.status is >= 100.
put_musehub_metric_filter "musehub-request-count" '{ $.status >= 100 }' "RequestCount" "1"

# Request duration — publishes $.duration_ms itself; feeds the p99 latency alarm.
put_musehub_metric_filter "musehub-duration-ms" '{ $.duration_ms > 0 }' "RequestDurationMs" '$.duration_ms'

log "Metric filters created: 5xxCount, RequestCount, RequestDurationMs"
127
128 # ── Step 4: CloudWatch Alarms ─────────────────────────────────────────────────
129
log "[4/6] Creating CloudWatch alarms (thresholds: 5xx>${THRESHOLD_5XX_RATE}%, p99>${THRESHOLD_P99_LATENCY_MS}ms, disk>${THRESHOLD_DISK_PCT}%, db-conn>${THRESHOLD_DB_CONN_PCT}%)..."

# ─ 4a: 5xx error rate > 1% ───────────────────────────────────────────────────
# Metric-math alarm: e1 = (5xxCount / RequestCount) * 100 over 60-second sums.
# Only e1 returns data; m1 and m2 are inputs to the expression.
five_xx_rate_query='[
  {"Id":"e1","Expression":"(m1/m2)*100","Label":"5xxRate","ReturnData":true},
  {"Id":"m1","MetricStat":{"Metric":{"Namespace":"MuseHub","MetricName":"5xxCount"},"Period":60,"Stat":"Sum"},"ReturnData":false},
  {"Id":"m2","MetricStat":{"Metric":{"Namespace":"MuseHub","MetricName":"RequestCount"},"Period":60,"Stat":"Sum"},"ReturnData":false}
]'

five_xx_alarm_args=(
  --alarm-name "musehub-5xx-rate-high"
  --alarm-description "5xx error rate exceeded ${THRESHOLD_5XX_RATE}% — investigate immediately"
  --alarm-actions "$SNS_ARN"
  --ok-actions "$SNS_ARN"
  --metrics "$five_xx_rate_query"
  --comparison-operator GreaterThanThreshold
  --threshold "$THRESHOLD_5XX_RATE"
  # Two consecutive breaching minutes are required before alarming.
  --evaluation-periods 2
  --datapoints-to-alarm 2
  # Periods without data are treated as healthy.
  --treat-missing-data notBreaching
)
aws_cmd cloudwatch put-metric-alarm "${five_xx_alarm_args[@]}"
148
# ─ 4b: p99 request latency > 2 s ─────────────────────────────────────────────
# Alarms when the p99 of MuseHub/RequestDurationMs exceeds the threshold in
# 2 of the last 3 one-minute periods.
#
# Percentiles must be requested through --extended-statistic only.
# --statistic accepts just SampleCount/Average/Sum/Minimum/Maximum, and
# PutMetricAlarm rejects a request carrying both options, so passing
# --statistic "p99" as well made this call fail and, under `set -e`, stopped
# the script before the remaining alarms were created.
aws_cmd cloudwatch put-metric-alarm \
  --alarm-name "musehub-p99-latency-high" \
  --alarm-description "p99 request latency exceeded ${THRESHOLD_P99_LATENCY_MS}ms" \
  --alarm-actions "$SNS_ARN" \
  --ok-actions "$SNS_ARN" \
  --namespace MuseHub \
  --metric-name RequestDurationMs \
  --extended-statistic "p99" \
  --period 60 \
  --evaluation-periods 3 \
  --datapoints-to-alarm 2 \
  --threshold "$THRESHOLD_P99_LATENCY_MS" \
  --comparison-operator GreaterThanThreshold \
  --treat-missing-data notBreaching
165
# ─ 4c: Disk usage > 80% (CloudWatch Agent metric) ────────────────────────────
# Needs an instance to attach to; without INSTANCE_ID the alarm is skipped
# rather than created against nothing.
if [[ -z "$INSTANCE_ID" ]]; then
  log " SKIPPING disk alarm: INSTANCE_ID not set (pass --instance-id i-xxx)"
else
  disk_alarm_args=(
    --alarm-name "musehub-disk-high"
    --alarm-description "Disk usage exceeded ${THRESHOLD_DISK_PCT}% on $INSTANCE_ID"
    --alarm-actions "$SNS_ARN"
    --ok-actions "$SNS_ARN"
    --namespace "CWAgent"
    --metric-name "disk_used_percent"
    # NOTE(review): the alarm only matches a metric published with exactly
    # these two dimensions — confirm the agent config in deploy/setup-ec2.sh
    # does not also emit device/fstype dimensions.
    --dimensions "Name=InstanceId,Value=$INSTANCE_ID" "Name=path,Value=/"
    --statistic Average
    --period 300
    # Two consecutive 5-minute periods at or above the threshold.
    --evaluation-periods 2
    --datapoints-to-alarm 2
    --threshold "$THRESHOLD_DISK_PCT"
    --comparison-operator GreaterThanOrEqualToThreshold
    --treat-missing-data missing
  )
  aws_cmd cloudwatch put-metric-alarm "${disk_alarm_args[@]}"
fi
186
# ─ 4d: DB connections > 90% of max ──────────────────────────────────────────
# For self-hosted PostgreSQL on EC2, the CloudWatch Agent publishes
# postgresql_numconnections via the procstat plugin.
# For RDS, use AWS/RDS DatabaseConnections / DBParameterGroup max_connections.
#
# This alarm uses the RDS metric; adjust the namespace/metric if using
# the CloudWatch Agent with the postgresql procstat plugin.
#
# DatabaseConnections is an absolute connection count, not a percentage, so
# the percentage threshold has to be converted using the server's
# max_connections. Set DB_MAX_CONNECTIONS to the value configured for the
# database; the default of 100 yields a threshold of 90 connections, which is
# what this alarm previously hard-coded.

# Convert "percent of max_connections" into a connection count (rounded down).
# Arguments: $1 - max_connections, $2 - percentage (0-100)
# Outputs:   the connection count on stdout
db_connection_threshold() {
  local max_connections=$1
  local pct=$2
  # 10# forces base 10 so a value with a leading zero is not read as octal.
  echo $(( 10#$max_connections * 10#$pct / 100 ))
}

DB_MAX_CONNECTIONS="${DB_MAX_CONNECTIONS:-100}"
[[ "$DB_MAX_CONNECTIONS" =~ ^[1-9][0-9]*$ ]] \
  || die "DB_MAX_CONNECTIONS must be a positive integer (got: $DB_MAX_CONNECTIONS)"

db_conn_threshold=$(db_connection_threshold "$DB_MAX_CONNECTIONS" "${THRESHOLD_DB_CONN_PCT:-90}")

aws_cmd cloudwatch put-metric-alarm \
  --alarm-name "musehub-db-connections-high" \
  --alarm-description "PostgreSQL connections exceeded ${THRESHOLD_DB_CONN_PCT:-90}% of max_connections" \
  --alarm-actions "$SNS_ARN" \
  --ok-actions "$SNS_ARN" \
  --namespace "AWS/RDS" \
  --metric-name "DatabaseConnections" \
  --dimensions "Name=DBInstanceIdentifier,Value=$DB_INSTANCE" \
  --statistic Average \
  --period 60 \
  --evaluation-periods 3 \
  --datapoints-to-alarm 2 \
  --threshold "$db_conn_threshold" \
  --comparison-operator GreaterThanOrEqualToThreshold \
  --treat-missing-data missing

# Report only the alarms that were actually created: the disk alarm is
# skipped when no instance ID was supplied.
created_alarms="musehub-5xx-rate-high, musehub-p99-latency-high"
if [[ -n "${INSTANCE_ID:-}" ]]; then
  created_alarms+=", musehub-disk-high"
fi
log "Alarms created: ${created_alarms}, musehub-db-connections-high"
211
212 # ── Step 5: EC2 CloudWatch Agent config hint ──────────────────────────────────
213
# Emit every line read from stdin through log(), one call per line.
# Leading whitespace and empty lines are kept as they are.
log_lines() {
  local line
  while IFS= read -r line; do
    log "$line"
  done
}

# Step 5 is guidance only: nothing is installed or configured on the instance.
log_lines <<EOF
[5/6] CloudWatch Agent note:
 Install: sudo yum install -y amazon-cloudwatch-agent
 Configure disk + memory metrics in /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json
 Use log group: $LOG_GROUP
 Start: sudo systemctl start amazon-cloudwatch-agent
EOF

# ── Step 6: Summary ───────────────────────────────────────────────────────────

# Recap of what was provisioned, plus a command for checking the result.
log_lines <<EOF
[6/6] Done.

 Log group : $LOG_GROUP (${LOG_RETENTION_DAYS}d hot; configure S3 export for 1-year cold)
 SNS topic : $SNS_ARN
 Alarms : 5xx rate > ${THRESHOLD_5XX_RATE}% | p99 > ${THRESHOLD_P99_LATENCY_MS}ms | disk > ${THRESHOLD_DISK_PCT}% | DB conns > ${THRESHOLD_DB_CONN_PCT}%

 Verify with: aws cloudwatch describe-alarms --alarm-names musehub-5xx-rate-high musehub-p99-latency-high --region $AWS_REGION
EOF
File History 1 commit
sha256:0997d6250ae6476362f6fe2025af7789f46d03df3e9f34356d5e8ee79b201923 fix(issues): use issue number as pagination cursor, not cre… Sonnet 4.6 patch 8 days ago