deploy/cloudwatch-alerts.sh · gabriel/musehub

log "[4/6] Creating CloudWatch alarms (thresholds: 5xx>${THRESHOLD_5XX_RATE}%, p99>${THRESHOLD_P99_LATENCY_MS}ms, disk>${THRESHOLD_DISK_PCT}%, db-conn>${THRESHOLD_DB_CONN_PCT}%)..."

# ─ 4a: 5xx error rate above THRESHOLD_5XX_RATE percent ───────────────────────
# Metric-math alarm. Only expression e1 returns data; m1 (5xxCount) and
# m2 (RequestCount) are 60-second sums that feed e1 = (m1 / m2) * 100.
rate_5xx_query='[{"Id":"e1","Expression":"(m1/m2)*100","Label":"5xxRate","ReturnData":true},
{"Id":"m1","MetricStat":{"Metric":{"Namespace":"MuseHub","MetricName":"5xxCount"},"Period":60,"Stat":"Sum"},"ReturnData":false},
{"Id":"m2","MetricStat":{"Metric":{"Namespace":"MuseHub","MetricName":"RequestCount"},"Period":60,"Stat":"Sum"},"ReturnData":false}]'

# Fires when 2 of 2 consecutive datapoints breach; periods with no data are
# treated as not breaching. The SNS topic is notified on ALARM and on OK.
rate_5xx_alarm=(
  --alarm-name "musehub-5xx-rate-high"
  --alarm-description "5xx error rate exceeded ${THRESHOLD_5XX_RATE}% — investigate immediately"
  --alarm-actions "$SNS_ARN"
  --ok-actions "$SNS_ARN"
  --metrics "$rate_5xx_query"
  --comparison-operator GreaterThanThreshold
  --threshold "$THRESHOLD_5XX_RATE"
  --evaluation-periods 2
  --datapoints-to-alarm 2
  --treat-missing-data notBreaching
)
aws_cmd cloudwatch put-metric-alarm "${rate_5xx_alarm[@]}"

# ─ 4b: p99 request latency above THRESHOLD_P99_LATENCY_MS ────────────────────
# Percentiles must be requested through --extended-statistic. --statistic only
# accepts SampleCount|Average|Sum|Minimum|Maximum, and PutMetricAlarm rejects a
# request that sets both options, so only the extended form is passed here.
#
# Fires when 2 of the last 3 one-minute datapoints breach; periods with no
# data are treated as not breaching. The SNS topic is notified on ALARM and OK.
aws_cmd cloudwatch put-metric-alarm \
  --alarm-name "musehub-p99-latency-high" \
  --alarm-description "p99 request latency exceeded ${THRESHOLD_P99_LATENCY_MS}ms" \
  --alarm-actions "$SNS_ARN" \
  --ok-actions "$SNS_ARN" \
  --namespace MuseHub \
  --metric-name RequestDurationMs \
  --extended-statistic "p99" \
  --period 60 \
  --evaluation-periods 3 \
  --datapoints-to-alarm 2 \
  --threshold "$THRESHOLD_P99_LATENCY_MS" \
  --comparison-operator GreaterThanThreshold \
  --treat-missing-data notBreaching

# ─ 4c: Disk usage at or above THRESHOLD_DISK_PCT (CloudWatch Agent metric) ───
# The alarm is scoped to one EC2 instance, so it is only created when
# INSTANCE_ID is non-empty; otherwise a skip notice is logged instead.
# NOTE(review): the CloudWatch Agent may publish disk_used_percent with extra
# dimensions (e.g. device, fstype); confirm the agent config emits exactly
# InstanceId + path, otherwise this alarm will never receive data.
if [[ -z "$INSTANCE_ID" ]]; then
  log " SKIPPING disk alarm: INSTANCE_ID not set (pass --instance-id i-xxx)"
else
  # 2 of 2 five-minute averages must breach; missing data is left as "missing".
  disk_alarm=(
    --alarm-name "musehub-disk-high"
    --alarm-description "Disk usage exceeded ${THRESHOLD_DISK_PCT}% on $INSTANCE_ID"
    --alarm-actions "$SNS_ARN"
    --ok-actions "$SNS_ARN"
    --namespace "CWAgent"
    --metric-name "disk_used_percent"
    --dimensions "Name=InstanceId,Value=$INSTANCE_ID" "Name=path,Value=/"
    --statistic Average
    --period 300
    --evaluation-periods 2
    --datapoints-to-alarm 2
    --threshold "$THRESHOLD_DISK_PCT"
    --comparison-operator GreaterThanOrEqualToThreshold
    --treat-missing-data missing
  )
  aws_cmd cloudwatch put-metric-alarm "${disk_alarm[@]}"
fi

# ─ 4d: DB connections at or above THRESHOLD_DB_CONN_PCT ──────────────────────
# For self-hosted PostgreSQL on EC2, the CloudWatch Agent publishes
# postgresql_numconnections via the procstat plugin.
# For RDS, use AWS/RDS DatabaseConnections / DBParameterGroup max_connections.
#
# This alarm uses the RDS metric; adjust the namespace/metric if using
# the CloudWatch Agent with the postgresql procstat plugin.
#
# The threshold comes from THRESHOLD_DB_CONN_PCT so the alarm agrees with its
# own description and with the thresholds this script logs (it used to be a
# hard-coded 90 that ignored the configured value).
# NOTE(review): AWS/RDS DatabaseConnections is an absolute connection count,
# so this value only equals a percentage when max_connections is 100 — confirm
# against the instance's parameter group.
#
# Fires when 2 of the last 3 one-minute averages breach; missing data is left
# as "missing". The SNS topic is notified on ALARM and on OK.
aws_cmd cloudwatch put-metric-alarm \
  --alarm-name "musehub-db-connections-high" \
  --alarm-description "PostgreSQL connections exceeded ${THRESHOLD_DB_CONN_PCT}% of max_connections" \
  --alarm-actions "$SNS_ARN" \
  --ok-actions "$SNS_ARN" \
  --namespace "AWS/RDS" \
  --metric-name "DatabaseConnections" \
  --dimensions "Name=DBInstanceIdentifier,Value=$DB_INSTANCE" \
  --statistic Average \
  --period 60 \
  --evaluation-periods 3 \
  --datapoints-to-alarm 2 \
  --threshold "$THRESHOLD_DB_CONN_PCT" \
  --comparison-operator GreaterThanOrEqualToThreshold \
  --treat-missing-data missing

# Report only the alarms that were actually requested: the disk alarm is
# created only when INSTANCE_ID is non-empty, so it is listed conditionally.
# With INSTANCE_ID set, the message is identical to the previous fixed text.
created_alarms="musehub-5xx-rate-high, musehub-p99-latency-high"
if [[ -n "$INSTANCE_ID" ]]; then
  created_alarms+=", musehub-disk-high"
fi
created_alarms+=", musehub-db-connections-high"
log "Alarms created: ${created_alarms}"

# ── Step 5: EC2 CloudWatch Agent config hint ──────────────────────────────────
# Purely informational: nothing is installed or configured here. Each entry
# below is emitted as one log line, in order.
agent_hint_lines=(
  "[5/6] CloudWatch Agent note:"
  " Install: sudo yum install -y amazon-cloudwatch-agent"
  " Configure disk + memory metrics in /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json"
  " Use log group: $LOG_GROUP"
  " Start: sudo systemctl start amazon-cloudwatch-agent"
)
for hint_line in "${agent_hint_lines[@]}"; do
  log "$hint_line"
done

# ── Step 6: Summary ───────────────────────────────────────────────────────────
# Recap of the log group, SNS topic and alarm thresholds configured above,
# followed by a command the operator can run to inspect the alarms.
log "[6/6] Done."
log ""
log " Log group : $LOG_GROUP (${LOG_RETENTION_DAYS}d hot; configure S3 export for 1-year cold)"
log " SNS topic : $SNS_ARN"
log " Alarms : 5xx rate > ${THRESHOLD_5XX_RATE}% | p99 > ${THRESHOLD_P99_LATENCY_MS}ms | disk > ${THRESHOLD_DISK_PCT}% | DB conns > ${THRESHOLD_DB_CONN_PCT}%"
log ""
# Every alarm created by this script is named "musehub-*", so a prefix query
# covers all of them (the previous hint listed only two of the four by name).
log " Verify with: aws cloudwatch describe-alarms --alarm-name-prefix musehub- --region $AWS_REGION"