#!/usr/bin/env bash
# MuseHub — CloudWatch log group, metric filters, alarms, and SNS alerting.
#
# Run once during initial provisioning; safe to re-run (put-metric-alarm is
# idempotent, log groups and subscriptions are created only if absent).
#
# Prerequisites:
#   * AWS CLI configured with permissions for logs:*, cloudwatch:*, sns:*
#   * ALERT_EMAIL and/or ALERT_PHONE exported in the environment. This script
#     does not load .env itself, so export the values first, e.g.
#       set -a; . ./.env; set +a
#   * The EC2 instance must have the CloudWatch Agent installed and configured
#     to emit disk and memory metrics (see deploy/setup-ec2.sh)
#
# Usage:
#   bash deploy/cloudwatch-alerts.sh [--region eu-west-1] [--instance-id i-xxx] \
#                                    [--db-instance musehub]

# Strict mode: abort on command failure, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# ── Config ────────────────────────────────────────────────────────────────────
# Values written as ${VAR:-default} can be overridden from the environment.

AWS_REGION="${AWS_REGION:-eu-west-1}"
INSTANCE_ID="${INSTANCE_ID:-}"           # EC2 instance ID for disk/memory alarms
DB_INSTANCE="${DB_INSTANCE:-musehub}"    # RDS identifier (if using RDS)
LOG_GROUP="/musehub/app"
LOG_RETENTION_DAYS=30                    # Hot retention: 30 days in CloudWatch
SNS_TOPIC_NAME="musehub-alerts"
ALERT_EMAIL="${ALERT_EMAIL:-}"           # e.g. gabriel@musehub.ai
ALERT_PHONE="${ALERT_PHONE:-}"           # E.164 format, e.g. +15551234567

# Alarm thresholds — aligned with the pre-launch checklist.
THRESHOLD_5XX_RATE="1"            # percent: alarm when 5xx / total > 1%
THRESHOLD_P99_LATENCY_MS="2000"   # milliseconds
THRESHOLD_DISK_PCT="80"           # percent used
THRESHOLD_DB_CONN_PCT="90"        # percent of max_connections

# ── Helpers ───────────────────────────────────────────────────────────────────

# Print a progress message on stdout, prefixed with the script tag.
log() { echo "[cloudwatch] $*"; }

# Print an error message on stderr and terminate with status 1.
die() { echo "[cloudwatch] ERROR: $*" >&2; exit 1; }

# Run the AWS CLI against the configured region. AWS_REGION is read at call
# time, so a --region override applies to every later call.
aws_cmd() {
  aws --region "$AWS_REGION" "$@"
}

# ── Parse args ────────────────────────────────────────────────────────────────

#######################################
# Apply command-line overrides to the configuration globals.
# Globals:   AWS_REGION, INSTANCE_ID, DB_INSTANCE (written)
# Arguments: the script's command line ("$@")
# Returns:   0 on success; exits 1 via die on an unknown option or when an
#            option is given without its value.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --region)
        # Check the value exists before touching $2: under `set -u` a bare
        # trailing option would otherwise abort with "unbound variable".
        [[ $# -ge 2 ]] || die "Missing value for $1"
        AWS_REGION="$2"
        shift 2
        ;;
      --instance-id)
        [[ $# -ge 2 ]] || die "Missing value for $1"
        INSTANCE_ID="$2"
        shift 2
        ;;
      --db-instance)
        [[ $# -ge 2 ]] || die "Missing value for $1"
        DB_INSTANCE="$2"
        shift 2
        ;;
      *)
        die "Unknown argument: $1"
        ;;
    esac
  done
}

parse_args "$@"

# ── Step 1: Log group + 30-day hot retention ─────────────────────────────────

log "[1/6] Creating log group $LOG_GROUP (retention: ${LOG_RETENTION_DAYS}d hot)..."

# Deliberately best-effort so re-runs do not fail when the group already
# exists. If creation failed for any other reason, the retention call below
# fails and stops the script.
aws_cmd logs create-log-group \
  --log-group-name "$LOG_GROUP" 2>/dev/null || true  # idempotent

aws_cmd logs put-retention-policy \
  --log-group-name "$LOG_GROUP" \
  --retention-in-days "$LOG_RETENTION_DAYS"

log "Log group ready. Cold storage (1-year): configure S3 export subscription"
log "  aws logs put-subscription-filter → Kinesis Firehose → S3 bucket"
log "  S3 lifecycle: transition to Glacier after 30d, expire after 365d"

# ── Step 2: SNS topic + subscriptions ────────────────────────────────────────

log "[2/6] Creating SNS topic $SNS_TOPIC_NAME..."
# create-topic prints the topic ARN; it is reused below for the subscriptions
# and later as the action target of every alarm.
SNS_ARN=$(aws_cmd sns create-topic \
  --name "$SNS_TOPIC_NAME" \
  --query TopicArn --output text)
log "SNS topic ARN: $SNS_ARN"

#######################################
# Subscribe one endpoint to the alert topic, discarding the printed ARN.
# Globals:   SNS_ARN (read)
# Arguments: $1 - SNS protocol (email, sms)
#            $2 - endpoint (address or phone number)
#######################################
subscribe_alert_endpoint() {
  local protocol=$1
  local endpoint=$2
  aws_cmd sns subscribe \
    --topic-arn "$SNS_ARN" \
    --protocol "$protocol" \
    --notification-endpoint "$endpoint" \
    --query SubscriptionArn --output text > /dev/null
}

# Both channels are optional: an empty variable skips that subscription.
if [[ -n "$ALERT_EMAIL" ]]; then
  subscribe_alert_endpoint email "$ALERT_EMAIL"
  log "  Email subscription created for $ALERT_EMAIL (confirm the email)"
fi

if [[ -n "$ALERT_PHONE" ]]; then
  subscribe_alert_endpoint sms "$ALERT_PHONE"
  log "  SMS subscription created for $ALERT_PHONE"
fi

# ── Step 3: Metric filters (extract from structured JSON logs) ─────────────────

log "[3/6] Creating metric filters on $LOG_GROUP..."

#######################################
# Create or update a metric filter that emits 1 per matching log event and
# 0 as the default value, in the MuseHub namespace.
# Globals:   LOG_GROUP (read)
# Arguments: $1 - filter name
#            $2 - CloudWatch Logs JSON filter pattern
#            $3 - metric name
#######################################
put_count_filter() {
  local filter_name=$1
  local pattern=$2
  local metric_name=$3
  aws_cmd logs put-metric-filter \
    --log-group-name "$LOG_GROUP" \
    --filter-name "$filter_name" \
    --filter-pattern "$pattern" \
    --metric-transformations \
      metricName="$metric_name",metricNamespace="MuseHub",metricValue="1",defaultValue="0"
}

# 5xx response count — filter on JSON field $.status >= 500.
# Patterns are single-quoted so the shell leaves `$.status` untouched.
put_count_filter "musehub-5xx-count" '{ $.status >= 500 }' "5xxCount"

# Total request count — every record with a $.status field.
put_count_filter "musehub-request-count" '{ $.status >= 100 }' "RequestCount"

# Request duration — extract duration_ms for p99 latency alarm.
# metricValue is single-quoted so the literal `$.duration_ms` selector reaches
# CloudWatch Logs instead of being expanded by the shell.
aws_cmd logs put-metric-filter \
  --log-group-name "$LOG_GROUP" \
  --filter-name "musehub-duration-ms" \
  --filter-pattern '{ $.duration_ms > 0 }' \
  --metric-transformations \
    metricName="RequestDurationMs",metricNamespace="MuseHub",metricValue='$.duration_ms',defaultValue="0"

log "Metric filters created: 5xxCount, RequestCount, RequestDurationMs"

# ── Step 4: CloudWatch Alarms ─────────────────────────────────────────────────

log "[4/6] Creating CloudWatch alarms (thresholds: 5xx>${THRESHOLD_5XX_RATE}%, p99>${THRESHOLD_P99_LATENCY_MS}ms, disk>${THRESHOLD_DISK_PCT}%, db-conn>${THRESHOLD_DB_CONN_PCT}%)..."

# ─ 4a: 5xx error rate > 1% ───────────────────────────────────────────────────
# Metric math: 5xxCount / RequestCount * 100 > 1
# Only the expression e1 returns data; m1 and m2 are its inputs (per-minute sums).
aws_cmd cloudwatch put-metric-alarm \
  --alarm-name "musehub-5xx-rate-high" \
  --alarm-description "5xx error rate exceeded ${THRESHOLD_5XX_RATE}% — investigate immediately" \
  --alarm-actions "$SNS_ARN" \
  --ok-actions "$SNS_ARN" \
  --metrics \
    '[{"Id":"e1","Expression":"(m1/m2)*100","Label":"5xxRate","ReturnData":true},
      {"Id":"m1","MetricStat":{"Metric":{"Namespace":"MuseHub","MetricName":"5xxCount"},"Period":60,"Stat":"Sum"},"ReturnData":false},
      {"Id":"m2","MetricStat":{"Metric":{"Namespace":"MuseHub","MetricName":"RequestCount"},"Period":60,"Stat":"Sum"},"ReturnData":false}]' \
  --comparison-operator GreaterThanThreshold \
  --threshold "$THRESHOLD_5XX_RATE" \
  --evaluation-periods 2 \
  --datapoints-to-alarm 2 \
  --treat-missing-data notBreaching

# ─ 4b: p99 request latency > 2 s ─────────────────────────────────────────────
# Percentiles must be passed via --extended-statistic only. --statistic accepts
# just SampleCount/Average/Sum/Minimum/Maximum, and PutMetricAlarm rejects a
# request that sets both, so --statistic is intentionally not given here.
aws_cmd cloudwatch put-metric-alarm \
  --alarm-name "musehub-p99-latency-high" \
  --alarm-description "p99 request latency exceeded ${THRESHOLD_P99_LATENCY_MS}ms" \
  --alarm-actions "$SNS_ARN" \
  --ok-actions "$SNS_ARN" \
  --namespace MuseHub \
  --metric-name RequestDurationMs \
  --extended-statistic "p99" \
  --period 60 \
  --evaluation-periods 3 \
  --datapoints-to-alarm 2 \
  --threshold "$THRESHOLD_P99_LATENCY_MS" \
  --comparison-operator GreaterThanThreshold \
  --treat-missing-data notBreaching

# ─ 4c: Disk usage > 80% (CloudWatch Agent metric) ────────────────────────────
# NOTE(review): an alarm only sees data when its dimensions match the published
# metric exactly. Confirm the agent publishes disk_used_percent with just
# InstanceId + path (no device/fstype dimensions); otherwise this alarm will
# stay in INSUFFICIENT_DATA.
if [[ -n "$INSTANCE_ID" ]]; then
  aws_cmd cloudwatch put-metric-alarm \
    --alarm-name "musehub-disk-high" \
    --alarm-description "Disk usage exceeded ${THRESHOLD_DISK_PCT}% on $INSTANCE_ID" \
    --alarm-actions "$SNS_ARN" \
    --ok-actions "$SNS_ARN" \
    --namespace "CWAgent" \
    --metric-name "disk_used_percent" \
    --dimensions "Name=InstanceId,Value=$INSTANCE_ID" "Name=path,Value=/" \
    --statistic Average \
    --period 300 \
    --evaluation-periods 2 \
    --datapoints-to-alarm 2 \
    --threshold "$THRESHOLD_DISK_PCT" \
    --comparison-operator GreaterThanOrEqualToThreshold \
    --treat-missing-data missing
else
  log "  SKIPPING disk alarm: INSTANCE_ID not set (pass --instance-id i-xxx)"
fi

# ─ 4d: DB connections > 90% of max ──────────────────────────────────────────
# For self-hosted PostgreSQL on EC2, the CloudWatch Agent publishes
# postgresql_numconnections via the procstat plugin.
# For RDS, use AWS/RDS DatabaseConnections / DBParameterGroup max_connections.
#
# This alarm uses the RDS metric; adjust the namespace/metric if using
# the CloudWatch Agent with the postgresql procstat plugin.
# AWS/RDS DatabaseConnections is an absolute connection count, not a
# percentage, so the percentage threshold is converted into a count using the
# server's max_connections. The default of 100 keeps the threshold at 90 with
# the stock 90% setting.
# NOTE(review): 100 is assumed to match this database's max_connections —
# export DB_MAX_CONNECTIONS with the real value for your instance.
DB_MAX_CONNECTIONS="${DB_MAX_CONNECTIONS:-100}"
[[ "$DB_MAX_CONNECTIONS" =~ ^[1-9][0-9]*$ ]] \
  || die "DB_MAX_CONNECTIONS must be a positive integer (got: $DB_MAX_CONNECTIONS)"

#######################################
# Convert a percentage of max_connections into a connection count.
# Arguments: $1 - max_connections (integer)
#            $2 - percentage (integer)
# Outputs:   the count, rounded down, on stdout
#######################################
db_connection_threshold() {
  local max_conns=$1
  local pct=$2
  printf '%s\n' "$(( max_conns * pct / 100 ))"
}

DB_CONN_THRESHOLD=$(db_connection_threshold "$DB_MAX_CONNECTIONS" "$THRESHOLD_DB_CONN_PCT")

aws_cmd cloudwatch put-metric-alarm \
  --alarm-name "musehub-db-connections-high" \
  --alarm-description "PostgreSQL connections exceeded ${THRESHOLD_DB_CONN_PCT}% of max_connections" \
  --alarm-actions "$SNS_ARN" \
  --ok-actions "$SNS_ARN" \
  --namespace "AWS/RDS" \
  --metric-name "DatabaseConnections" \
  --dimensions "Name=DBInstanceIdentifier,Value=$DB_INSTANCE" \
  --statistic Average \
  --period 60 \
  --evaluation-periods 3 \
  --datapoints-to-alarm 2 \
  --threshold "$DB_CONN_THRESHOLD" \
  --comparison-operator GreaterThanOrEqualToThreshold \
  --treat-missing-data missing

# Report only alarms that were actually created: the disk alarm is skipped
# when no instance ID was supplied.
created_alarms="musehub-5xx-rate-high, musehub-p99-latency-high"
if [[ -n "${INSTANCE_ID:-}" ]]; then
  created_alarms+=", musehub-disk-high"
fi
created_alarms+=", musehub-db-connections-high"
log "Alarms created: $created_alarms"

# ── Step 5: EC2 CloudWatch Agent config hint ──────────────────────────────────
# Informational only: this step prints instructions and changes nothing.

log "[5/6] CloudWatch Agent note:"
log "  Install: sudo yum install -y amazon-cloudwatch-agent"
log "  Configure disk + memory metrics in /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json"
log "  Use log group: $LOG_GROUP"
log "  Start: sudo systemctl start amazon-cloudwatch-agent"

# ── Step 6: Summary ───────────────────────────────────────────────────────────

log "[6/6] Done."
log ""
log "  Log group : $LOG_GROUP (${LOG_RETENTION_DAYS}d hot; configure S3 export for 1-year cold)"
log "  SNS topic : $SNS_ARN"
log "  Alarms    : 5xx rate > ${THRESHOLD_5XX_RATE}% | p99 > ${THRESHOLD_P99_LATENCY_MS}ms | disk > ${THRESHOLD_DISK_PCT}% | DB conns > ${THRESHOLD_DB_CONN_PCT}%"
log ""
log "  Verify with: aws cloudwatch describe-alarms --alarm-names musehub-5xx-rate-high musehub-p99-latency-high --region $AWS_REGION"