#!/usr/bin/env bash
# Zero-downtime blue-green deploy for MuseHub.
#
# Strategy:
#   Two slots — blue (port 1337) and green (port 1338).
#   The active slot serves traffic via nginx. The inactive slot is stopped.
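#   Deploy:
#     0. Re-apply the nginx site config if deploy/nginx-cf.conf changed.
#     1. Pull the new image from ECR (old slot keeps serving).
#     2. Run migrations against the live DB (before swap — forward-compatible).
#     3. Start the inactive slot with the new image.
#     4. Health-check the new slot.
#     5. Flip nginx to the new slot (nginx -s reload — instant, zero downtime).
#     6. Stop the old slot.
#     7. Restart the background worker on the new image.
#     8. Prune old images (keep the last 3).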
#
# Called by deploy/push.sh via SSM — do not run directly in production.
# For manual use on the instance:
#   ECR_IMAGE=992382692655.dkr.ecr.us-east-1.amazonaws.com/musehub/musehub \
#   IMAGE_TAG=<tag> bash deploy/deploy.sh
#
# First-time setup:
#   bash deploy/deploy.sh --init
#   (Initialises .active-slot and /etc/nginx/musehub-active-port if missing)

set -euo pipefail

# ── Configuration ─────────────────────────────────────────────────────────────

APP_DIR="/opt/musehub"
DEPLOY_LOG="/tmp/musehub-deploy.log"
SLOT_FILE="${APP_DIR}/.active-slot"
NGINX_PORT_FILE="/etc/nginx/musehub-active-port"

REGION="us-east-1"
ECR_REGISTRY="992382692655.dkr.ecr.us-east-1.amazonaws.com"
# ECR_IMAGE and IMAGE_TAG may come from the environment; when unset or empty
# they fall back to the registry's musehub/musehub repository and "latest".
ECR_IMAGE="${ECR_IMAGE:-${ECR_REGISTRY}/musehub/musehub}"
IMAGE_TAG="${IMAGE_TAG:-latest}"
FULL_IMAGE="${ECR_IMAGE}:${IMAGE_TAG}"

HEALTH_URL_BLUE="http://127.0.0.1:1337/healthz"
HEALTH_URL_GREEN="http://127.0.0.1:1338/healthz"
# Number of health probes. health_check sleeps 2s after each failed probe and
# caps every probe at 3s, so the worst-case wait is longer than retries × 2s.
HEALTH_RETRIES=30

# ── Logging ───────────────────────────────────────────────────────────────────

# Duplicate everything written to stdout/stderr into the deploy log (appending)
# so push.sh can stream it live via a second SSM call.
exec > >(tee -a "$DEPLOY_LOG") 2>&1

# The run banner is written straight to the log file, bypassing tee.
{
    echo ""
    echo "=== deploy started at $(date -u '+%Y-%m-%dT%H:%M:%SZ') ==="
} >> "$DEPLOY_LOG"

cd "$APP_DIR"

# ── Helpers ───────────────────────────────────────────────────────────────────

# Print all arguments, joined by spaces, to stdout with the "[deploy] " prefix.
log() {
    printf '[deploy] %s\n' "$*"
}
# Print all arguments to stderr with the "[deploy] ERROR: " prefix, then
# terminate the current shell with exit status 1.
die() {
    printf '[deploy] ERROR: %s\n' "$*" >&2
    exit 1
}

# Poll a slot's health endpoint until it answers successfully.
#
# Arguments:
#   $1 - URL to probe
#   $2 - slot name (used only in log messages)
# Globals:
#   HEALTH_RETRIES (read) - maximum number of probes
# Returns:
#   0 as soon as one probe succeeds; calls die (exit 1) once every probe
#   has failed.
health_check() {
    local url="$1"
    local slot="$2"
    # Declared local so the loop counter does not leak into (or clobber) a
    # global variable named "i".
    local i
    log "Health-checking $slot at $url ..."
    for ((i = 1; i <= HEALTH_RETRIES; i++)); do
        # -s: silent, -f: fail on HTTP errors; each probe is capped at 3s.
        if curl -sf --max-time 3 "$url" > /dev/null 2>&1; then
            log "$slot is healthy (attempt $i)"
            return 0
        fi
        sleep 2
    done
    die "$slot failed health check after $((HEALTH_RETRIES * 2))s"
}

# Hand the given slot name to the musehub-set-slot helper (run through sudo)
# and log that nginx now points at that slot.
#
# Arguments:
#   $1 - slot name, passed through unchanged to musehub-set-slot
nginx_point_to() {
    local target_slot="$1"

    sudo musehub-set-slot "$target_slot"
    log "nginx now pointing to $target_slot"
}

# Self-heal the nginx active-port file.
#
# The file is considered well-formed when at least one of its lines is exactly
# "server 127.0.0.1:<port>;". Anything else (for example a bare port number
# left behind by a manual edit) is repaired by re-running musehub-set-slot
# for the slot recorded in .active-slot, so a botched manual intervention
# cannot be what makes a new deploy fail.
#
# Globals:
#   NGINX_PORT_FILE (read) - file to validate; a missing file is left alone
#   SLOT_FILE       (read) - source of the slot to restore; "blue" is used
#                            when it is unreadable or holds any other value
# Returns:
#   0 when the file is absent or already well-formed; otherwise the status
#   of the final log call after the repair.
sanitize_nginx_port_file() {
    if [ ! -f "$NGINX_PORT_FILE" ]; then
        return 0
    fi

    local current
    current=$(cat "$NGINX_PORT_FILE")
    # Well-formed already: leave it untouched.
    if grep -qE '^server 127\.0\.0\.1:[0-9]+;$' <<< "$current"; then
        return 0
    fi

    # Pick the slot to restore; only the two known names are accepted.
    local restore_slot
    restore_slot=$(cat "$SLOT_FILE" 2>/dev/null || echo "blue")
    case "$restore_slot" in
        blue | green) ;;
        *) restore_slot="blue" ;;
    esac

    log "WARNING: $NGINX_PORT_FILE has unexpected content — correcting via musehub-set-slot $restore_slot"
    sudo musehub-set-slot "$restore_slot"
    log "Sanitized active-port file; nginx reloaded."
}

# ── Init mode ─────────────────────────────────────────────────────────────────

# One-time bootstrap, selected by passing --init as the first argument:
# install deploy/set-active-slot.sh as /usr/local/bin/musehub-set-slot, make
# it executable, run it for "blue", then exit without deploying anything.
case "${1:-}" in
    --init)
        log "Init: installing musehub-set-slot and pointing nginx to blue"
        sudo cp "$APP_DIR/deploy/set-active-slot.sh" /usr/local/bin/musehub-set-slot
        sudo chmod +x /usr/local/bin/musehub-set-slot
        sudo musehub-set-slot blue
        log "Done. Run 'bash deploy/deploy.sh' (with ECR_IMAGE and IMAGE_TAG set) to deploy."
        exit 0
        ;;
esac

# ── Validate required env vars ────────────────────────────────────────────────

# Abort when either image coordinate is empty.
# NOTE(review): both variables receive `:-` defaults near the top of this
# file, so as written these guards can only fire if those defaults are removed.
if [ -z "${ECR_IMAGE:-}" ]; then
    die "ECR_IMAGE is not set."
fi
if [ -z "${IMAGE_TAG:-}" ]; then
    die "IMAGE_TAG is not set."
fi

# ── Read active slot ──────────────────────────────────────────────────────────

# Derive the deploy target from the currently active slot.
#
# Arguments:
#   $1 - active slot name; must be exactly "blue" or "green"
# Globals written:
#   NEW_SLOT, NEW_PORT, OLD_CONTAINER, NEW_CONTAINER, HEALTH_URL
# Globals read:
#   HEALTH_URL_BLUE, HEALTH_URL_GREEN
#
# Any other value is fatal. Guessing would be dangerous: treating an empty or
# mistyped slot file as "green" makes blue the deploy target, and the
# stale-container cleanup before the new slot starts would then force-remove
# musehub-blue even if it is the container serving live traffic.
select_target_slot() {
    case "$1" in
        blue)
            NEW_SLOT="green"
            NEW_PORT=1338
            OLD_CONTAINER="musehub-blue"
            NEW_CONTAINER="musehub-green"
            HEALTH_URL="$HEALTH_URL_GREEN"
            ;;
        green)
            NEW_SLOT="blue"
            NEW_PORT=1337
            OLD_CONTAINER="musehub-green"
            NEW_CONTAINER="musehub-blue"
            HEALTH_URL="$HEALTH_URL_BLUE"
            ;;
        *)
            die ".active-slot contains unexpected value '$1' (expected 'blue' or 'green')."
            ;;
    esac
}

if [ ! -f "$SLOT_FILE" ]; then
    die ".active-slot not found. Run: bash deploy/deploy.sh --init"
fi

ACTIVE_SLOT=$(cat "$SLOT_FILE")
select_target_slot "$ACTIVE_SLOT"

log "Image:       $FULL_IMAGE"
log "Active slot: $ACTIVE_SLOT → deploying to: $NEW_SLOT (port $NEW_PORT)"

# Guard: ensure the nginx upstream file is well-formed before we touch anything.
sanitize_nginx_port_file

# ── Step 0: Apply nginx config if updated ────────────────────────────────────
# Determine the domain from the current installed config, re-substitute, and
# reload nginx if the content changed. Safe to run on every deploy.

NGINX_CONF_SRC="$APP_DIR/deploy/nginx-cf.conf"
NGINX_CONF_DEST="/etc/nginx/sites-available/musehub-staging"
NGINX_CONF_DEST_PROD="/etc/nginx/sites-available/musehub"

# Print the first name of the first `server_name` directive in an nginx
# config file, with any ';' removed.
#
# Arguments:
#   $1 - path to the nginx config file
# Outputs:
#   the domain on stdout, or nothing when the file has no such directive
#
# Only lines whose first word is exactly `server_name` count, so comments that
# merely mention the directive are ignored. A file without the directive still
# yields exit status 0, so under `set -euo pipefail` the caller's emptiness
# check is reached instead of the whole deploy aborting.
nginx_installed_domain() {
    awk '$1 == "server_name" { gsub(/;/, "", $2); print $2; exit }' "$1"
}

if [ -f "$NGINX_CONF_SRC" ]; then
    # Detect which installed config exists (staging is checked before prod).
    NGINX_CONF_INSTALLED=""
    if [ -f "$NGINX_CONF_DEST" ]; then
        NGINX_CONF_INSTALLED="$NGINX_CONF_DEST"
    elif [ -f "$NGINX_CONF_DEST_PROD" ]; then
        NGINX_CONF_INSTALLED="$NGINX_CONF_DEST_PROD"
    fi

    if [ -n "$NGINX_CONF_INSTALLED" ]; then
        # Reuse the domain already configured on this host.
        DOMAIN=$(nginx_installed_domain "$NGINX_CONF_INSTALLED")
        if [ -n "$DOMAIN" ]; then
            # Render the template with this host's domain and compare it with
            # what is installed; only rewrite and reload when they differ.
            NEW_CONF=$(sed "s/DOMAIN_PLACEHOLDER/$DOMAIN/g" "$NGINX_CONF_SRC")
            CURRENT_CONF=$(cat "$NGINX_CONF_INSTALLED")
            if [ "$NEW_CONF" != "$CURRENT_CONF" ]; then
                log "[0/6] nginx config changed — applying update for $DOMAIN..."
                printf '%s\n' "$NEW_CONF" | sudo tee "$NGINX_CONF_INSTALLED" > /dev/null
                # Validate before reloading; on failure restore the previous
                # content and carry on with the deploy.
                if sudo nginx -t 2>&1; then
                    sudo nginx -s reload
                    log "nginx config updated and reloaded."
                else
                    log "WARNING: new nginx config failed validation — reverting."
                    printf '%s\n' "$CURRENT_CONF" | sudo tee "$NGINX_CONF_INSTALLED" > /dev/null
                fi
            else
                log "[0/6] nginx config unchanged — skipping reload."
            fi
        else
            log "[0/6] WARNING: no server_name found in $NGINX_CONF_INSTALLED — skipping nginx config update."
        fi
    fi
fi

# ── Step 1: Login to ECR and pull new image ───────────────────────────────────

log "[1/6] Pulling image from ECR..."

# Authenticate Docker against the registry. The password travels over stdin
# (--password-stdin), so it never appears in the docker command line.
aws ecr get-login-password --region "$REGION" \
    | sudo docker login --username AWS --password-stdin "$ECR_REGISTRY"

# Download the image that will be deployed.
sudo docker pull "$FULL_IMAGE"

log "Pull complete."

# ── Step 2: Run migrations against the live DB ────────────────────────────────

log "[2/6] Running migrations..."

# Run a one-off command inside a throwaway (--rm) container of the new image.
# The container joins the musehub_musehub-internal network, loads the app's
# .env file and has SKIP_MIGRATIONS=0 set.
#
# Arguments:
#   $@ - command (and its arguments) to execute inside the container
# Returns:
#   the exit status of `docker run`
_alembic() {
    local -a run_opts=(
        --rm
        --network musehub_musehub-internal
        --env-file "$APP_DIR/.env"
        -e SKIP_MIGRATIONS=0
    )

    sudo docker run "${run_opts[@]}" "$FULL_IMAGE" "$@"
}

# Bring the schema to the newest revision. When the first attempt fails (e.g.
# a stale revision ID after a migration-history reset), re-anchor Alembic's
# bookkeeping by stamping head, then upgrade again; that second run is a no-op
# when the schema already matches head. A failure of the stamp or of the
# second upgrade aborts the deploy via `set -e`.
# NOTE(review): stamping after a genuine migration error marks unapplied
# revisions as applied; the schema gate below is what catches that case.
if ! _alembic alembic upgrade head; then
    log "upgrade head failed — re-anchoring Alembic revision to head and retrying..."
    _alembic alembic stamp --purge head
    _alembic alembic upgrade head
fi
log "Migrations complete."

# Schema parity gate — a hard failure.  It uses the same benign-diff filter as
# the S2 test (alembic_version table, semantically-equivalent server_default
# variants, column comments), so spurious false positives never block a deploy.
if ! _alembic python -m musehub.db.schema_gate; then
    die "Schema gate failed — ORM drift detected.  Write a migration (alembic revision --autogenerate) before deploying."
fi

# ── Step 3: Start the new slot ────────────────────────────────────────────────

log "[3/6] Starting $NEW_SLOT on port $NEW_PORT..."

# A container with this name may linger from an earlier aborted deploy; clear
# it first. Errors are ignored because usually there is nothing to remove.
sudo docker rm -f "$NEW_CONTAINER" 2>/dev/null || true

# Options for the new slot's container:
#   - published on loopback only (host $NEW_PORT → container port 1337)
#   - migrations skipped inside the container (SKIP_MIGRATIONS=1)
#   - logs shipped through the awslogs driver, one stream per container name
slot_run_opts=(
    --name "$NEW_CONTAINER"
    --network musehub_musehub-internal
    --env-file "$APP_DIR/.env"
    -e SKIP_MIGRATIONS=1
    -v musehub_data:/data
    -p "127.0.0.1:${NEW_PORT}:1337"
    --restart unless-stopped
    --log-driver awslogs
    --log-opt awslogs-region=us-east-1
    --log-opt awslogs-group=/musehub/staging
    --log-opt awslogs-stream="$NEW_CONTAINER"
    --log-opt awslogs-create-group=true
)
sudo docker run -d "${slot_run_opts[@]}" "$FULL_IMAGE"

# ── Step 4: Health-check the new slot ────────────────────────────────────────

# health_check terminates the shell via die when the slot never turns healthy,
# so it runs in a subshell: that hands control back here and lets the failed
# container be stopped before aborting. Otherwise an unhealthy container would
# stay up under its `--restart unless-stopped` policy even though nginx never
# switched to it.
if ! (health_check "$HEALTH_URL" "$NEW_SLOT"); then
    log "Stopping unhealthy $NEW_SLOT container ($NEW_CONTAINER); traffic stays on $ACTIVE_SLOT."
    # Stop rather than remove, so the container remains available for inspection.
    sudo docker stop "$NEW_CONTAINER" > /dev/null 2>&1 || true
    die "Deploy aborted: $NEW_SLOT did not become healthy."
fi

# ── Step 5: Flip nginx to the new slot (instant, zero downtime) ───────────────

log "[5/6] Switching nginx to $NEW_SLOT (port $NEW_PORT)..."
nginx_point_to "$NEW_SLOT"

# ── Step 6: Stop the old slot ────────────────────────────────────────────────

log "[6/6] Stopping old slot ($ACTIVE_SLOT)..."
# `docker rm -f` alone kills the container with SIGKILL, which would cut off
# requests still in flight on the old slot right after the nginx switch.
# `docker stop` sends SIGTERM first and only kills after its grace period.
# Both commands are best-effort: the old container may not exist.
sudo docker stop "$OLD_CONTAINER" > /dev/null 2>&1 || true
sudo docker rm -f "$OLD_CONTAINER" 2>/dev/null || true

# ── Step 7: Restart the background worker ────────────────────────────────────

log "[7/7] Restarting background worker..."

# Drop the previous worker container, if any (errors ignored).
# NOTE(review): `rm -f` force-kills the running worker — confirm that jobs
# interrupted mid-flight are safe to lose or get retried.
sudo docker rm -f musehub-worker 2>/dev/null || true

# The worker uses the same image, network, env file and data volume as the web
# slot, but publishes no port, has its healthcheck disabled, and runs the
# musehub.worker module as its command.
worker_run_opts=(
    --name musehub-worker
    --network musehub_musehub-internal
    --env-file "$APP_DIR/.env"
    -e SKIP_MIGRATIONS=1
    -v musehub_data:/data
    --restart unless-stopped
    --no-healthcheck
    --log-driver awslogs
    --log-opt awslogs-region=us-east-1
    --log-opt awslogs-group=/musehub/staging
    --log-opt awslogs-stream=musehub-worker
    --log-opt awslogs-create-group=true
)
sudo docker run -d "${worker_run_opts[@]}" "$FULL_IMAGE" python -m musehub.worker
log "Worker started."

# ── Step 8: Prune old images (keep last 3) ───────────────────────────────────

log "[8/8] Pruning old images (keeping last 3)..."
KEEP_IMAGES=3

# Read image IDs from stdin (one per line) and print, in input order, every
# distinct ID after the first $1 distinct ones. Blank lines are ignored.
#
# Arguments:
#   $1 - number of distinct IDs to keep
prunable_image_ids() {
    awk -v keep="$1" 'NF && !seen[$0]++ && ++kept > keep'
}

# Pruning is housekeeping: the new slot is already live at this point, so a
# failure to list images must not turn a successful deploy into a failed one.
# This matches the best-effort `|| true` on the rmi call below.
# NOTE(review): "keep last 3" relies on `docker images` listing newest first.
if ! ALL_IDS=$(sudo docker images "$ECR_IMAGE" --format "{{.ID}}"); then
    log "WARNING: could not list images for $ECR_IMAGE — skipping prune."
    ALL_IDS=""
fi
OLD_IDS=$(printf '%s\n' "$ALL_IDS" | prunable_image_ids "$KEEP_IMAGES")
if [ -n "$OLD_IDS" ]; then
    # Best-effort removal; errors from docker rmi are deliberately ignored.
    echo "$OLD_IDS" | xargs sudo docker rmi -f 2>/dev/null || true
    log "Image prune complete."
else
    log "No old images to prune."
fi

log ""
log "Deploy complete. Active slot: $NEW_SLOT (port $NEW_PORT)"
log "Image: $FULL_IMAGE"