#!/usr/bin/env bash
# MuseHub — CloudWatch log group, metric filters, alarms, and SNS alerting.
#
# Run once during initial provisioning; safe to re-run (put-metric-alarm is
# idempotent, log groups and subscriptions are created only if absent).
#
# Prerequisites:
#   * AWS CLI configured with permissions for logs:*, cloudwatch:*, sns:*
#   * ALERT_EMAIL and/or ALERT_PHONE exported in the environment (this script
#     does not load .env itself; a subscription is skipped when its variable
#     is empty)
#   * The EC2 instance must have the CloudWatch Agent installed and configured
#     to emit disk and memory metrics (see deploy/setup-ec2.sh)
#
# Usage:
#   bash deploy/cloudwatch-alerts.sh [--region eu-west-1] [--instance-id i-xxx] [--db-instance musehub]

set -euo pipefail

# ── Config ────────────────────────────────────────────────────────────────────

# Environment-overridable settings. The ':=' expansion assigns the default
# when the variable is unset or empty, and leaves a non-empty value untouched.
: "${AWS_REGION:=eu-west-1}"
: "${INSTANCE_ID:=}"            # EC2 instance ID for disk/memory alarms
: "${DB_INSTANCE:=musehub}"     # RDS identifier (if using RDS)
: "${ALERT_EMAIL:=}"            # e.g. gabriel@musehub.ai
: "${ALERT_PHONE:=}"            # E.164 format, e.g. +15551234567

# Fixed resource names and retention.
LOG_GROUP="/musehub/app"
LOG_RETENTION_DAYS=30           # Hot retention: 30 days in CloudWatch
SNS_TOPIC_NAME="musehub-alerts"

# Alarm thresholds — aligned with the pre-launch checklist.
THRESHOLD_5XX_RATE="1"            # percent: alarm when 5xx / total > 1%
THRESHOLD_P99_LATENCY_MS="2000"   # milliseconds
THRESHOLD_DISK_PCT="80"           # percent used
THRESHOLD_DB_CONN_PCT="90"        # percent of max_connections

# ── Helpers ───────────────────────────────────────────────────────────────────

# Print a progress message on stdout, prefixed with the script tag.
# All arguments are joined with spaces into a single line.
log() {
    printf '%s\n' "[cloudwatch] $*"
}
# Report a fatal error on stderr (tagged like log output) and terminate the
# script with exit status 1.
die() {
    printf '%s\n' "[cloudwatch] ERROR: $*" 1>&2
    exit 1
}

# Run the AWS CLI with --region pinned to the current value of $AWS_REGION.
# Every argument is forwarded unchanged; the exit status is that of `aws`.
aws_cmd() {
    local -a cli_args=(--region "$AWS_REGION" "$@")
    aws "${cli_args[@]}"
}

# ── Parse args ────────────────────────────────────────────────────────────────

# Parse command-line options, overriding the environment-derived defaults.
#
# Recognised options (each takes exactly one value):
#   --region       sets AWS_REGION
#   --instance-id  sets INSTANCE_ID
#   --db-instance  sets DB_INSTANCE
#
# Any other argument, or an option given without a value, aborts via die.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --region|--instance-id|--db-instance)
                # Check arity explicitly: under `set -u` a trailing option with
                # no value would otherwise abort with "$2: unbound variable".
                [[ $# -ge 2 ]] || die "Option $1 requires a value"
                case "$1" in
                    --region)      AWS_REGION="$2" ;;
                    --instance-id) INSTANCE_ID="$2" ;;
                    --db-instance) DB_INSTANCE="$2" ;;
                esac
                shift 2
                ;;
            *)
                die "Unknown argument: $1"
                ;;
        esac
    done
}

parse_args "$@"

# ── Step 1: Log group + 30-day hot retention ─────────────────────────────────

log "[1/6] Creating log group $LOG_GROUP (retention: ${LOG_RETENTION_DAYS}d hot)..."

# On re-runs create-log-group fails with ResourceAlreadyExistsException; that
# is the only failure tolerated here. Anything else (bad credentials, missing
# permissions, invalid name) is reported with the CLI's own message instead of
# being hidden behind a blanket `2>/dev/null || true`.
if ! create_output=$(aws_cmd logs create-log-group \
        --log-group-name "$LOG_GROUP" 2>&1); then
    if [[ "$create_output" == *ResourceAlreadyExistsException* ]]; then
        log "  Log group already exists; keeping it."
    else
        die "Could not create log group $LOG_GROUP: $create_output"
    fi
fi

# Applied on every run so a changed LOG_RETENTION_DAYS takes effect on an
# existing group as well.
aws_cmd logs put-retention-policy \
    --log-group-name "$LOG_GROUP" \
    --retention-in-days "$LOG_RETENTION_DAYS"

# Cold storage is not provisioned by this script; the lines below only print
# a reminder of the intended setup.
log "Log group ready. Cold storage (1-year): configure S3 export subscription"
log "  aws logs put-subscription-filter → Kinesis Firehose → S3 bucket"
log "  S3 lifecycle: transition to Glacier after 30d, expire after 365d"

# ── Step 2: SNS topic + subscriptions ────────────────────────────────────────

log "[2/6] Creating SNS topic $SNS_TOPIC_NAME..."

# The ARN printed by create-topic is captured for use by the subscriptions
# and alarm actions below.
SNS_ARN=$(aws_cmd sns create-topic \
    --name "$SNS_TOPIC_NAME" \
    --query TopicArn --output text)

log "SNS topic ARN: $SNS_ARN"

# Subscribe one endpoint to the alert topic.
#   $1 - SNS protocol (email, sms)
#   $2 - endpoint (email address or E.164 phone number)
# Reads SNS_ARN. The subscription ARN printed by the CLI is discarded.
subscribe_alert_endpoint() {
    local protocol=$1 endpoint=$2
    aws_cmd sns subscribe \
        --topic-arn "$SNS_ARN" \
        --protocol "$protocol" \
        --notification-endpoint "$endpoint" \
        --query SubscriptionArn --output text > /dev/null
}

if [[ -n "$ALERT_EMAIL" ]]; then
    subscribe_alert_endpoint email "$ALERT_EMAIL"
    log "  Email subscription created for $ALERT_EMAIL (confirm the email)"
fi

if [[ -n "$ALERT_PHONE" ]]; then
    subscribe_alert_endpoint sms "$ALERT_PHONE"
    log "  SMS subscription created for $ALERT_PHONE"
fi

# With no endpoint configured this run adds no subscriber, so make that
# visible instead of finishing silently.
if [[ -z "$ALERT_EMAIL" && -z "$ALERT_PHONE" ]]; then
    log "  WARNING: neither ALERT_EMAIL nor ALERT_PHONE is set; no subscriptions were added to $SNS_TOPIC_NAME"
fi

# ── Step 3: Metric filters (extract from structured JSON logs) ─────────────────

log "[3/6] Creating metric filters on $LOG_GROUP..."

# Create or replace one metric filter on $LOG_GROUP.
#   $1 - filter name
#   $2 - CloudWatch Logs filter pattern
#   $3 - metric transformation in AWS CLI shorthand syntax
put_log_metric_filter() {
    local filter_name=$1 pattern=$2 transformation=$3
    aws_cmd logs put-metric-filter \
        --log-group-name "$LOG_GROUP" \
        --filter-name "$filter_name" \
        --filter-pattern "$pattern" \
        --metric-transformations "$transformation"
}

# 5xx response count — filter on JSON field $.status >= 500.
put_log_metric_filter "musehub-5xx-count" '{ $.status >= 500 }' \
    'metricName=5xxCount,metricNamespace=MuseHub,metricValue=1,defaultValue=0'

# Total request count — every record with a $.status field.
put_log_metric_filter "musehub-request-count" '{ $.status >= 100 }' \
    'metricName=RequestCount,metricNamespace=MuseHub,metricValue=1,defaultValue=0'

# Request duration — extract duration_ms for p99 latency alarm.
# No defaultValue here: a default of 0 would publish a made-up "0 ms" sample
# whenever logs arrive without a matching record, which distorts latency
# statistics. A default only makes sense for the count metrics above.
put_log_metric_filter "musehub-duration-ms" '{ $.duration_ms > 0 }' \
    'metricName=RequestDurationMs,metricNamespace=MuseHub,metricValue=$.duration_ms'

log "Metric filters created: 5xxCount, RequestCount, RequestDurationMs"

# ── Step 4: CloudWatch Alarms ─────────────────────────────────────────────────

log "[4/6] Creating CloudWatch alarms (thresholds: 5xx>${THRESHOLD_5XX_RATE}%, p99>${THRESHOLD_P99_LATENCY_MS}ms, disk>${THRESHOLD_DISK_PCT}%, db-conn>${THRESHOLD_DB_CONN_PCT}%)..."

# ─ 4a: 5xx error rate > 1% ───────────────────────────────────────────────────
# Metric-math alarm: e1 = (m1 / m2) * 100, where m1 and m2 are the per-minute
# sums of 5xxCount and RequestCount. Only e1 returns data. The alarm needs
# 2 breaching datapoints out of 2 evaluation periods.
rate_alarm_metrics='[{"Id":"e1","Expression":"(m1/m2)*100","Label":"5xxRate","ReturnData":true},
          {"Id":"m1","MetricStat":{"Metric":{"Namespace":"MuseHub","MetricName":"5xxCount"},"Period":60,"Stat":"Sum"},"ReturnData":false},
          {"Id":"m2","MetricStat":{"Metric":{"Namespace":"MuseHub","MetricName":"RequestCount"},"Period":60,"Stat":"Sum"},"ReturnData":false}]'

rate_alarm_args=(
    --alarm-name "musehub-5xx-rate-high"
    --alarm-description "5xx error rate exceeded ${THRESHOLD_5XX_RATE}% — investigate immediately"
    --alarm-actions "$SNS_ARN"
    --ok-actions "$SNS_ARN"
    --metrics "$rate_alarm_metrics"
    --comparison-operator GreaterThanThreshold
    --threshold "$THRESHOLD_5XX_RATE"
    --evaluation-periods 2
    --datapoints-to-alarm 2
    --treat-missing-data notBreaching
)
aws_cmd cloudwatch put-metric-alarm "${rate_alarm_args[@]}"

# ─ 4b: p99 request latency > 2 s ─────────────────────────────────────────────
# Alarms when the p99 of RequestDurationMs exceeds the threshold in at least
# 2 of 3 one-minute periods.
#
# The percentile is passed via --extended-statistic only. --statistic accepts
# just SampleCount/Average/Sum/Minimum/Maximum and may not be combined with
# --extended-statistic, so supplying both makes the call fail.
aws_cmd cloudwatch put-metric-alarm \
    --alarm-name "musehub-p99-latency-high" \
    --alarm-description "p99 request latency exceeded ${THRESHOLD_P99_LATENCY_MS}ms" \
    --alarm-actions "$SNS_ARN" \
    --ok-actions "$SNS_ARN" \
    --namespace MuseHub \
    --metric-name RequestDurationMs \
    --extended-statistic "p99" \
    --period 60 \
    --evaluation-periods 3 \
    --datapoints-to-alarm 2 \
    --threshold "$THRESHOLD_P99_LATENCY_MS" \
    --comparison-operator GreaterThanThreshold \
    --treat-missing-data notBreaching

# ─ 4c: Disk usage > 80% (CloudWatch Agent metric) ────────────────────────────
# Only created when an instance ID is known. Watches the CWAgent metric
# disk_used_percent for path "/" averaged over 5-minute periods.
# NOTE(review): the alarm matches only if the agent publishes this metric with
# exactly the InstanceId and path dimensions — confirm against the agent config.
if [[ -z "$INSTANCE_ID" ]]; then
    log "  SKIPPING disk alarm: INSTANCE_ID not set (pass --instance-id i-xxx)"
else
    disk_alarm_args=(
        --alarm-name "musehub-disk-high"
        --alarm-description "Disk usage exceeded ${THRESHOLD_DISK_PCT}% on $INSTANCE_ID"
        --alarm-actions "$SNS_ARN"
        --ok-actions "$SNS_ARN"
        --namespace "CWAgent"
        --metric-name "disk_used_percent"
        --dimensions "Name=InstanceId,Value=$INSTANCE_ID" "Name=path,Value=/"
        --statistic Average
        --period 300
        --evaluation-periods 2
        --datapoints-to-alarm 2
        --threshold "$THRESHOLD_DISK_PCT"
        --comparison-operator GreaterThanOrEqualToThreshold
        --treat-missing-data missing
    )
    aws_cmd cloudwatch put-metric-alarm "${disk_alarm_args[@]}"
fi

# ─ 4d: DB connections > 90% of max ──────────────────────────────────────────
# For self-hosted PostgreSQL on EC2, the CloudWatch Agent publishes
# postgresql_numconnections via the procstat plugin.
# For RDS, use AWS/RDS DatabaseConnections / DBParameterGroup max_connections.
#
# This alarm uses the RDS metric; adjust the namespace/metric if using
# the CloudWatch Agent with the postgresql procstat plugin.
#
# DatabaseConnections is an absolute connection count, not a percentage, so
# the alarm threshold is derived from THRESHOLD_DB_CONN_PCT and the server's
# max_connections. Set DB_MAX_CONNECTIONS to the real limit; the default of
# 100 yields a threshold of 90 with the stock 90% setting.
DB_MAX_CONNECTIONS="${DB_MAX_CONNECTIONS:-100}"

# Convert a percentage of max_connections into a connection count.
#   $1 - percentage (non-negative integer, no leading zeros)
#   $2 - max_connections (positive integer)
# Prints floor(pct * max / 100); aborts via die on malformed input.
db_conn_threshold_count() {
    local pct=$1 max_conns=$2
    [[ "$pct" =~ ^(0|[1-9][0-9]*)$ && "$max_conns" =~ ^[1-9][0-9]*$ ]] \
        || die "Invalid DB threshold inputs: THRESHOLD_DB_CONN_PCT='$pct', DB_MAX_CONNECTIONS='$max_conns'"
    printf '%s\n' "$(( pct * max_conns / 100 ))"
}

DB_CONN_THRESHOLD=$(db_conn_threshold_count "$THRESHOLD_DB_CONN_PCT" "$DB_MAX_CONNECTIONS")

aws_cmd cloudwatch put-metric-alarm \
    --alarm-name "musehub-db-connections-high" \
    --alarm-description "PostgreSQL connections exceeded ${THRESHOLD_DB_CONN_PCT}% of max_connections" \
    --alarm-actions "$SNS_ARN" \
    --ok-actions "$SNS_ARN" \
    --namespace "AWS/RDS" \
    --metric-name "DatabaseConnections" \
    --dimensions "Name=DBInstanceIdentifier,Value=$DB_INSTANCE" \
    --statistic Average \
    --period 60 \
    --evaluation-periods 3 \
    --datapoints-to-alarm 2 \
    --threshold "$DB_CONN_THRESHOLD" \
    --comparison-operator GreaterThanOrEqualToThreshold \
    --treat-missing-data missing

# Print the comma-separated names of the alarms this run created.
# musehub-disk-high is listed only when INSTANCE_ID is non-empty, because the
# disk alarm is skipped otherwise.
created_alarm_names() {
    local names="musehub-5xx-rate-high, musehub-p99-latency-high"
    if [[ -n "${INSTANCE_ID:-}" ]]; then
        names+=", musehub-disk-high"
    fi
    names+=", musehub-db-connections-high"
    printf '%s\n' "$names"
}

log "Alarms created: $(created_alarm_names)"

# ── Step 5: EC2 CloudWatch Agent config hint ──────────────────────────────────

# Informational only: prints how to install, configure, and start the
# CloudWatch Agent. Nothing is executed on the instance from here.
agent_notes=(
    "[5/6] CloudWatch Agent note:"
    "  Install: sudo yum install -y amazon-cloudwatch-agent"
    "  Configure disk + memory metrics in /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json"
    "  Use log group: $LOG_GROUP"
    "  Start: sudo systemctl start amazon-cloudwatch-agent"
)
for agent_note in "${agent_notes[@]}"; do
    log "$agent_note"
done

# ── Step 6: Summary ───────────────────────────────────────────────────────────

# Final recap: log group, topic ARN, configured thresholds, and a command for
# inspecting two of the alarms.
summary_lines=(
    "[6/6] Done."
    ""
    "  Log group  : $LOG_GROUP  (${LOG_RETENTION_DAYS}d hot; configure S3 export for 1-year cold)"
    "  SNS topic  : $SNS_ARN"
    "  Alarms     : 5xx rate > ${THRESHOLD_5XX_RATE}%  |  p99 > ${THRESHOLD_P99_LATENCY_MS}ms  |  disk > ${THRESHOLD_DISK_PCT}%  |  DB conns > ${THRESHOLD_DB_CONN_PCT}%"
    ""
    "  Verify with: aws cloudwatch describe-alarms --alarm-names musehub-5xx-rate-high musehub-p99-latency-high --region $AWS_REGION"
)
for summary_line in "${summary_lines[@]}"; do
    log "$summary_line"
done