#!/usr/bin/env bash
# Zero-downtime blue-green deploy for MuseHub.
#
# Strategy:
#   Two slots — blue (port 1337) and green (port 1338).
#   The active slot serves traffic via nginx. The inactive slot is stopped.
#   Deploy:
#     0. Re-apply the nginx site config if deploy/nginx-cf.conf changed.
#     1. Pull the new image from ECR (old slot keeps serving).
#     2. Run migrations against the live DB (before swap — forward-compatible).
#     3. Start the inactive slot with the new image.
#     4. Health-check the new slot.
#     5. Flip nginx to the new slot (nginx -s reload — instant, zero downtime).
#     6. Stop the old slot.
#     7. Restart the background worker on the new image.
#     8. Prune old images (keep the most recent 3).
#
# Called by deploy/push.sh via SSM — do not run directly in production.
# For manual use on the instance:
#   ECR_IMAGE=992382692655.dkr.ecr.us-east-1.amazonaws.com/musehub/musehub \
#   IMAGE_TAG=<tag> bash deploy/deploy.sh
# (ECR_IMAGE defaults to the registry path below; IMAGE_TAG defaults to
# "latest" when unset or empty.)
#
# First-time setup:
#   bash deploy/deploy.sh --init
#   (Initialises .active-slot and /etc/nginx/musehub-active-port if missing)

set -euo pipefail

APP_DIR="/opt/musehub"
DEPLOY_LOG="/tmp/musehub-deploy.log"

# Tee all output to a log file so push.sh can stream it live via a second SSM call.
exec > >(tee -a "$DEPLOY_LOG") 2>&1
echo "" >> "$DEPLOY_LOG"
echo "=== deploy started at $(date -u '+%Y-%m-%dT%H:%M:%SZ') ===" >> "$DEPLOY_LOG"

SLOT_FILE="$APP_DIR/.active-slot"
NGINX_PORT_FILE="/etc/nginx/musehub-active-port"
ECR_REGISTRY="992382692655.dkr.ecr.us-east-1.amazonaws.com"
ECR_IMAGE="${ECR_IMAGE:-${ECR_REGISTRY}/musehub/musehub}"
IMAGE_TAG="${IMAGE_TAG:-latest}"
FULL_IMAGE="${ECR_IMAGE}:${IMAGE_TAG}"
REGION="us-east-1"

HEALTH_URL_BLUE="http://127.0.0.1:1337/healthz"
HEALTH_URL_GREEN="http://127.0.0.1:1338/healthz"
HEALTH_RETRIES=30   # × 2s = 60s max wait

cd "$APP_DIR"

# ── Helpers ───────────────────────────────────────────────────────────────────

# Print a message prefixed with "[deploy]" on stdout.
log() { echo "[deploy] $*"; }

# Print an error prefixed with "[deploy] ERROR:" on stderr and exit 1.
die() { echo "[deploy] ERROR: $*" >&2; exit 1; }

# Poll a health endpoint until it answers successfully.
# Arguments: $1 - URL to probe, $2 - slot name (used in log messages only)
# Returns:   0 as soon as one probe succeeds; exits the script via die after
#            HEALTH_RETRIES failed probes spaced 2 seconds apart.
health_check() {
  local url="$1"
  local slot="$2"
  local attempt   # kept local so the counter does not leak into global scope
  log "Health-checking $slot at $url ..."
  for ((attempt = 1; attempt <= HEALTH_RETRIES; attempt++)); do
    if curl -sf --max-time 3 "$url" > /dev/null 2>&1; then
      log "$slot is healthy (attempt $attempt)"
      return 0
    fi
    sleep 2
  done
  die "$slot failed health check after $((HEALTH_RETRIES * 2))s"
}

# Switch nginx to the given slot by delegating to the musehub-set-slot helper
# (installed by --init from deploy/set-active-slot.sh).
# Arguments: $1 - slot name ("blue" or "green")
nginx_point_to() {
  local slot="$1"
  sudo musehub-set-slot "$slot"
  log "nginx now pointing to $slot"
}

# Repair the active-port file if it contains a bare port number instead of
# a full nginx upstream directive. Called once at startup so a botched
# manual intervention cannot be the root cause of a new deploy failing.
sanitize_nginx_port_file() {
  [[ -f "$NGINX_PORT_FILE" ]] || return 0

  local content
  content=$(cat "$NGINX_PORT_FILE")

  # Already correct — nothing to do
  if grep -qE '^server 127\.0\.0\.1:[0-9]+;$' <<<"$content"; then
    return 0
  fi

  # Derive correct slot from .active-slot file, or fall back to blue
  local slot
  slot=$(cat "$SLOT_FILE" 2>/dev/null || echo "blue")
  if [[ "$slot" != "blue" && "$slot" != "green" ]]; then
    slot="blue"
  fi

  log "WARNING: $NGINX_PORT_FILE has unexpected content — correcting via musehub-set-slot $slot"
  sudo musehub-set-slot "$slot"
  log "Sanitized active-port file; nginx reloaded."
}

# ── Init mode ─────────────────────────────────────────────────────────────────
if [[ "${1:-}" == "--init" ]]; then
  log "Init: installing musehub-set-slot and pointing nginx to blue"
  sudo cp "$APP_DIR/deploy/set-active-slot.sh" /usr/local/bin/musehub-set-slot
  sudo chmod +x /usr/local/bin/musehub-set-slot
  sudo musehub-set-slot blue
  log "Done. Run 'bash deploy/deploy.sh' (with ECR_IMAGE and IMAGE_TAG set) to deploy."
  exit 0
fi

# ── Validate required env vars ────────────────────────────────────────────────
# Both variables were given defaults above, so these checks only fire if those
# defaults are ever removed.
[[ -n "${ECR_IMAGE:-}" ]] || die "ECR_IMAGE is not set."
[[ -n "${IMAGE_TAG:-}" ]] || die "IMAGE_TAG is not set."

# ── Read active slot ──────────────────────────────────────────────────────────
if [[ ! -f "$SLOT_FILE" ]]; then
  die ".active-slot not found. Run: bash deploy/deploy.sh --init"
fi

ACTIVE_SLOT=$(cat "$SLOT_FILE")

# Accept only the two known slot names. Treating anything that is not "blue"
# as green would, with a corrupted file, make Step 3 remove the container that
# is actually serving traffic.
case "$ACTIVE_SLOT" in
  blue)
    NEW_SLOT="green"
    NEW_PORT=1338
    OLD_CONTAINER="musehub-blue"
    NEW_CONTAINER="musehub-green"
    HEALTH_URL="$HEALTH_URL_GREEN"
    ;;
  green)
    NEW_SLOT="blue"
    NEW_PORT=1337
    OLD_CONTAINER="musehub-green"
    NEW_CONTAINER="musehub-blue"
    HEALTH_URL="$HEALTH_URL_BLUE"
    ;;
  *)
    die "$SLOT_FILE contains unexpected value '$ACTIVE_SLOT' (expected 'blue' or 'green') — refusing to guess which slot is live."
    ;;
esac

log "Image: $FULL_IMAGE"
log "Active slot: $ACTIVE_SLOT → deploying to: $NEW_SLOT (port $NEW_PORT)"

# Guard: ensure the nginx upstream file is well-formed before we touch anything.
sanitize_nginx_port_file

# ── Step 0: Apply nginx config if updated ────────────────────────────────────
# Determine the domain from the current installed config, re-substitute, and
# reload nginx if the content changed. Safe to run on every deploy.
NGINX_CONF_SRC="$APP_DIR/deploy/nginx-cf.conf"
NGINX_CONF_DEST="/etc/nginx/sites-available/musehub-staging"
NGINX_CONF_DEST_PROD="/etc/nginx/sites-available/musehub"

if [[ -f "$NGINX_CONF_SRC" ]]; then
  # Detect which installed config exists (staging vs prod)
  if [[ -f "$NGINX_CONF_DEST" ]]; then
    NGINX_CONF_INSTALLED="$NGINX_CONF_DEST"
  elif [[ -f "$NGINX_CONF_DEST_PROD" ]]; then
    NGINX_CONF_INSTALLED="$NGINX_CONF_DEST_PROD"
  else
    NGINX_CONF_INSTALLED=""
  fi

  if [[ -n "$NGINX_CONF_INSTALLED" ]]; then
    # Extract domain from the installed config (first server_name line).
    # "|| true": grep exits 1 when nothing matches, which under
    # `set -e -o pipefail` would abort the whole deploy with no message
    # before the emptiness check below could run.
    DOMAIN=$(grep -m1 'server_name' "$NGINX_CONF_INSTALLED" \
      | awk '{print $2}' \
      | tr -d ';' || true)

    if [[ -n "$DOMAIN" ]]; then
      NEW_CONF=$(sed "s/DOMAIN_PLACEHOLDER/$DOMAIN/g" "$NGINX_CONF_SRC")
      CURRENT_CONF=$(cat "$NGINX_CONF_INSTALLED")

      if [[ "$NEW_CONF" != "$CURRENT_CONF" ]]; then
        log "[0/6] nginx config changed — applying update for $DOMAIN..."
        printf '%s\n' "$NEW_CONF" | sudo tee "$NGINX_CONF_INSTALLED" > /dev/null
        if sudo nginx -t 2>&1; then
          sudo nginx -s reload
          log "nginx config updated and reloaded."
        else
          # Put the previous content back so a later reload (Step 5) does not
          # pick up a config that nginx just rejected.
          log "WARNING: new nginx config failed validation — reverting."
          printf '%s\n' "$CURRENT_CONF" | sudo tee "$NGINX_CONF_INSTALLED" > /dev/null
        fi
      else
        log "[0/6] nginx config unchanged — skipping reload."
      fi
    else
      log "[0/6] WARNING: no server_name found in $NGINX_CONF_INSTALLED — skipping nginx config update."
    fi
  fi
fi

# ── Step 1: Login to ECR and pull new image ───────────────────────────────────
log "[1/6] Pulling image from ECR..."
# The password travels over stdin so it never appears in argv.
aws ecr get-login-password --region "$REGION" \
  | sudo docker login --username AWS --password-stdin "$ECR_REGISTRY"
sudo docker pull "$FULL_IMAGE"
log "Pull complete."

# ── Step 2: Run migrations against the live DB ────────────────────────────────
log "[2/6] Running migrations..."

# Run a one-off command inside a throwaway container of the new image, attached
# to the app's docker network and env file.
# Arguments: the command (and its arguments) to execute in the container.
_alembic() {
  sudo docker run --rm \
    --network musehub_musehub-internal \
    --env-file "$APP_DIR/.env" \
    -e SKIP_MIGRATIONS=0 \
    "$FULL_IMAGE" "$@"
}

# If upgrade head fails (e.g. stale revision ID from a migration history reset),
# stamp to the current head to re-anchor Alembic's tracking, then retry.
# The retry is a no-op when the schema already matches head.
# NOTE(review): this fallback runs for ANY upgrade failure, and stamping does
# not itself apply migrations — the schema gate below appears to be the only
# thing that would catch a genuinely failed migration. Confirm that is intended.
if ! _alembic alembic upgrade head; then
  log "upgrade head failed — re-anchoring Alembic revision to head and retrying..."
  _alembic alembic stamp --purge head
  _alembic alembic upgrade head
fi
log "Migrations complete."

# Schema parity gate — hard fail. Uses the same benign-diff filter as the S2
# test (alembic_version table, semantically-equivalent server_default variants,
# column comments) so spurious false positives never block a deploy.
_alembic python -m musehub.db.schema_gate \
  || die "Schema gate failed — ORM drift detected. Write a migration (alembic revision --autogenerate) before deploying."

# ── Step 3: Start the new slot ────────────────────────────────────────────────
log "[3/6] Starting $NEW_SLOT on port $NEW_PORT..."

# Remove if a failed previous deploy left it around
# (best-effort: "no such container" is the normal case, hence "|| true").
sudo docker rm -f "$NEW_CONTAINER" 2>/dev/null || true

sudo docker run -d \
  --name "$NEW_CONTAINER" \
  --network musehub_musehub-internal \
  --env-file "$APP_DIR/.env" \
  -e SKIP_MIGRATIONS=1 \
  -v musehub_data:/data \
  -p "127.0.0.1:${NEW_PORT}:1337" \
  --restart unless-stopped \
  --log-driver awslogs \
  --log-opt "awslogs-region=${REGION}" \
  --log-opt awslogs-group=/musehub/staging \
  --log-opt awslogs-stream="$NEW_CONTAINER" \
  --log-opt awslogs-create-group=true \
  "$FULL_IMAGE"

# ── Step 4: Health-check the new slot ────────────────────────────────────────
# NOTE(review): if this fails the script exits here, nginx keeps pointing at the
# old slot, and the new container is left in place until the next deploy's
# cleanup in Step 3 — confirm that leaving it running is desired.
health_check "$HEALTH_URL" "$NEW_SLOT"

# ── Step 5: Flip nginx to the new slot (instant, zero downtime) ───────────────
log "[5/6] Switching nginx to $NEW_SLOT (port $NEW_PORT)..."
nginx_point_to "$NEW_SLOT"

# ── Step 6: Stop the old slot ────────────────────────────────────────────────
log "[6/6] Stopping old slot ($ACTIVE_SLOT)..."
sudo docker rm -f "$OLD_CONTAINER" 2>/dev/null || true

# ── Step 7: Restart the background worker ────────────────────────────────────
log "[7/7] Restarting background worker..."
sudo docker rm -f musehub-worker 2>/dev/null || true
sudo docker run -d \
  --name musehub-worker \
  --network musehub_musehub-internal \
  --env-file "$APP_DIR/.env" \
  -e SKIP_MIGRATIONS=1 \
  -v musehub_data:/data \
  --restart unless-stopped \
  --no-healthcheck \
  --log-driver awslogs \
  --log-opt "awslogs-region=${REGION}" \
  --log-opt awslogs-group=/musehub/staging \
  --log-opt awslogs-stream=musehub-worker \
  --log-opt awslogs-create-group=true \
  "$FULL_IMAGE" python -m musehub.worker
log "Worker started."

# ── Step 8: Prune old images (keep last 3) ───────────────────────────────────
log "[8/8] Pruning old images (keeping last 3)..."
KEEP_IMAGES=3
# List image IDs for this repository, drop duplicate IDs (one image can carry
# several tags) while preserving order, then skip the first KEEP_IMAGES entries.
OLD_IDS=$(sudo docker images "$ECR_IMAGE" --format "{{.ID}}" \
  | awk '!seen[$0]++' \
  | tail -n +$((KEEP_IMAGES + 1)))

if [[ -n "$OLD_IDS" ]]; then
  # Best-effort: a failed removal must not fail an otherwise successful deploy.
  echo "$OLD_IDS" | xargs sudo docker rmi -f 2>/dev/null || true
  log "Image prune complete."
else
  log "No old images to prune."
fi

log ""
log "Deploy complete. Active slot: $NEW_SLOT (port $NEW_PORT)"
log "Image: $FULL_IMAGE"