gabriel / musehub public
deploy.sh bash
285 lines 11.2 KB
Raw
sha256:7d6dd8f4a89e2d1fef2d84f6e65feaff51385d382f466766b7f690a22ec18e32 fix: fall back to DB ancestry check when mpack-only fast-fo… Sonnet 4.6 patch 6 days ago
1 #!/usr/bin/env bash
2 # Zero-downtime blue-green deploy for MuseHub.
3 #
4 # Strategy:
5 # Two slots — blue (port 1337) and green (port 1338).
6 # The active slot serves traffic via nginx. The inactive slot is stopped.
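# Deploy:
# 0. Re-render the nginx site config from deploy/nginx-cf.conf and reload nginx if it changed.
# 1. Pull the new image from ECR (old slot keeps serving).
# 2. Run migrations against the live DB (before swap — forward-compatible),
#    then run the schema parity gate (hard fail on ORM drift).
# 3. Start the inactive slot with the new image.
# 4. Health-check the new slot.
# 5. Flip nginx to the new slot (nginx -s reload — instant, zero downtime).
# 6. Stop the old slot.
# 7. Restart the background worker on the new image.
# 8. Prune old images (keep the last 3).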
14 #
15 # Called by deploy/push.sh via SSM — do not run directly in production.
16 # For manual use on the instance:
17 # ECR_IMAGE=992382692655.dkr.ecr.us-east-1.amazonaws.com/musehub/musehub \
18 # IMAGE_TAG=<tag> bash deploy/deploy.sh
19 #
20 # First-time setup:
21 # bash deploy/deploy.sh --init
22 # (Initialises .active-slot and /etc/nginx/musehub-active-port if missing)
23
# Strict mode: stop on command failure, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# ── Paths ─────────────────────────────────────────────────────────────────────
APP_DIR="/opt/musehub"
DEPLOY_LOG="/tmp/musehub-deploy.log"
SLOT_FILE="$APP_DIR/.active-slot"
NGINX_PORT_FILE="/etc/nginx/musehub-active-port"

# ── Image coordinates (ECR_IMAGE / IMAGE_TAG may be supplied by the caller) ───
ECR_REGISTRY="992382692655.dkr.ecr.us-east-1.amazonaws.com"
ECR_IMAGE="${ECR_IMAGE:-${ECR_REGISTRY}/musehub/musehub}"
IMAGE_TAG="${IMAGE_TAG:-latest}"
FULL_IMAGE="${ECR_IMAGE}:${IMAGE_TAG}"
REGION="us-east-1"

# ── Health probing ────────────────────────────────────────────────────────────
HEALTH_URL_BLUE="http://127.0.0.1:1337/healthz"
HEALTH_URL_GREEN="http://127.0.0.1:1338/healthz"
HEALTH_RETRIES=30  # × 2s = 60s max wait

# Mirror stdout and stderr into the deploy log so push.sh can stream it live
# via a second SSM call.
exec > >(tee -a "$DEPLOY_LOG") 2>&1

# The start banner is appended straight to the log file (it does not go
# through the tee, so it is not echoed on stdout).
printf '\n=== deploy started at %s ===\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" >> "$DEPLOY_LOG"

# Everything below runs from the application directory.
cd "$APP_DIR"
45
46 # ── Helpers ───────────────────────────────────────────────────────────────────
47
# Print a progress message on stdout, tagged with the "[deploy]" prefix.
# All arguments are joined into one line.
log() {
  printf '%s\n' "[deploy] $*"
}
# Report a fatal error on stderr (prefixed "[deploy] ERROR:") and terminate
# the script with exit status 1.
die() {
  printf '%s\n' "[deploy] ERROR: $*" >&2
  exit 1
}
50
# Poll a slot's health endpoint until it answers successfully.
# Globals:   HEALTH_RETRIES (read) — maximum number of probes.
# Arguments: $1 — health URL to probe
#            $2 — slot name (used in log messages only)
# Returns:   0 as soon as one probe succeeds. If every probe fails it does not
#            return: it calls die, which exits the script with status 1.
health_check() {
  local url="$1"
  local slot="$2"
  local attempt  # local so the counter cannot clobber a caller's variable
  log "Health-checking $slot at $url ..."
  for ((attempt = 1; attempt <= HEALTH_RETRIES; attempt++)); do
    # -f turns HTTP error statuses into a failing exit code; each probe is
    # capped at 3 seconds.
    if curl -sf --max-time 3 "$url" > /dev/null 2>&1; then
      log "$slot is healthy (attempt $attempt)"
      return 0
    fi
    # Pause between probes only — after the last one there is nothing left
    # to wait for.
    if ((attempt < HEALTH_RETRIES)); then
      sleep 2
    fi
  done
  die "$slot failed health check after $((HEALTH_RETRIES * 2))s"
}
64
# Switch nginx over to the given slot by delegating to the musehub-set-slot
# helper (run through sudo), then log the change.
# Arguments: $1 — slot name to activate
nginx_point_to() {
  local target_slot=$1
  sudo musehub-set-slot "$target_slot"
  log "nginx now pointing to $target_slot"
}
70
# Self-heal the nginx active-port file before a deploy touches anything.
# The file is expected to contain a line of the form
# "server 127.0.0.1:<port>;". If no line matches (for example a bare port
# number left behind by a manual edit), the file is regenerated by running
# musehub-set-slot for the slot recorded in .active-slot.
# Globals: NGINX_PORT_FILE (read), SLOT_FILE (read)
# Returns: 0 without doing anything when the file is absent or already valid.
sanitize_nginx_port_file() {
  local current slot

  # No file yet: nothing to repair.
  if [ ! -f "$NGINX_PORT_FILE" ]; then
    return 0
  fi

  current=$(cat "$NGINX_PORT_FILE")
  # Already a well-formed upstream directive — leave it alone.
  if grep -qE '^server 127\.0\.0\.1:[0-9]+;$' <<< "$current"; then
    return 0
  fi

  # Take the slot from .active-slot; an unreadable file or any value other
  # than blue/green falls back to blue.
  slot=$(cat "$SLOT_FILE" 2>/dev/null || echo "blue")
  case "$slot" in
    blue | green) ;;
    *) slot="blue" ;;
  esac

  log "WARNING: $NGINX_PORT_FILE has unexpected content — correcting via musehub-set-slot $slot"
  sudo musehub-set-slot "$slot"
  log "Sanitized active-port file; nginx reloaded."
}
92
93 # ── Init mode ─────────────────────────────────────────────────────────────────
94
# `deploy.sh --init`: install the musehub-set-slot helper and make sure nginx
# points at a slot, then exit without deploying.
#
# On a fresh host (no .active-slot, or one that does not say "green") the slot
# is blue, exactly as before. If .active-slot already records green, that slot
# is re-applied instead of forcing blue, so re-running --init on a live host
# does not swing traffic to a slot whose container is stopped.
# NOTE(review): assumes musehub-set-slot is what writes .active-slot — this
# script never writes it itself; confirm against deploy/set-active-slot.sh.
if [ "${1:-}" = "--init" ]; then
  INIT_SLOT="blue"
  if [ -f "$SLOT_FILE" ]; then
    RECORDED_SLOT=$(cat "$SLOT_FILE" 2>/dev/null || true)
    if [ "$RECORDED_SLOT" = "green" ]; then
      INIT_SLOT="green"
    fi
  fi

  log "Init: installing musehub-set-slot and pointing nginx to $INIT_SLOT"
  sudo cp "$APP_DIR/deploy/set-active-slot.sh" /usr/local/bin/musehub-set-slot
  sudo chmod +x /usr/local/bin/musehub-set-slot
  sudo musehub-set-slot "$INIT_SLOT"
  log "Done. Run 'bash deploy/deploy.sh' (with ECR_IMAGE and IMAGE_TAG set) to deploy."
  exit 0
fi
103
104 # ── Validate required env vars ────────────────────────────────────────────────
105
# Refuse to continue with empty image coordinates.
# NOTE(review): both variables are given non-empty defaults where they are
# assigned near the top of the script, so these guards are not expected to
# fire in practice.
if [ -z "${ECR_IMAGE:-}" ]; then
  die "ECR_IMAGE is not set."
fi
if [ -z "${IMAGE_TAG:-}" ]; then
  die "IMAGE_TAG is not set."
fi
108
109 # ── Read active slot ──────────────────────────────────────────────────────────
110
# Work out which slot is live and which one this deploy will bring up.
# Sets: ACTIVE_SLOT, NEW_SLOT, NEW_PORT, OLD_CONTAINER, NEW_CONTAINER, HEALTH_URL.
if [ ! -f "$SLOT_FILE" ]; then
  die ".active-slot not found. Run: bash deploy/deploy.sh --init"
fi

ACTIVE_SLOT=$(cat "$SLOT_FILE")
# Tolerate stray whitespace (e.g. a trailing space or CR from a manual edit).
ACTIVE_SLOT="${ACTIVE_SLOT//[[:space:]]/}"

# Match both slot names explicitly. Treating "anything that is not blue" as
# green would, on a corrupt or empty file, target the blue slot and remove the
# musehub-blue container in Step 3 even if it is the one serving traffic.
case "$ACTIVE_SLOT" in
  blue)
    NEW_SLOT="green"
    NEW_PORT=1338
    OLD_CONTAINER="musehub-blue"
    NEW_CONTAINER="musehub-green"
    HEALTH_URL="$HEALTH_URL_GREEN"
    ;;
  green)
    NEW_SLOT="blue"
    NEW_PORT=1337
    OLD_CONTAINER="musehub-green"
    NEW_CONTAINER="musehub-blue"
    HEALTH_URL="$HEALTH_URL_BLUE"
    ;;
  *)
    die "$SLOT_FILE contains unexpected value '$ACTIVE_SLOT' (expected 'blue' or 'green'). Fix it before deploying."
    ;;
esac

log "Image: $FULL_IMAGE"
log "Active slot: $ACTIVE_SLOT → deploying to: $NEW_SLOT (port $NEW_PORT)"

# Guard: ensure the nginx upstream file is well-formed before we touch anything.
sanitize_nginx_port_file
135
136 # ── Step 0: Apply nginx config if updated ────────────────────────────────────
137 # Determine the domain from the current installed config, re-substitute, and
138 # reload nginx if the content changed. Safe to run on every deploy.
139
NGINX_CONF_SRC="$APP_DIR/deploy/nginx-cf.conf"
NGINX_CONF_DEST="/etc/nginx/sites-available/musehub-staging"
NGINX_CONF_DEST_PROD="/etc/nginx/sites-available/musehub"

# Print the second field of the first line containing "server_name" in file
# $1, with any ';' removed. Prints nothing — and still succeeds — when no such
# line exists. A grep-based pipeline would exit non-zero on "no match" and,
# under `set -euo pipefail`, abort the whole deploy at the assignment.
_first_server_name() {
  awk '/server_name/ { gsub(/;/, "", $2); print $2; exit }' "$1"
}

# Print template file $2 with every DOMAIN_PLACEHOLDER replaced by domain $1.
# Backslash, '/' and '&' in the domain are escaped first so sed inserts them
# literally instead of treating them as replacement syntax.
_render_nginx_conf() {
  local domain_escaped
  domain_escaped=$(printf '%s' "$1" | sed 's|[\\/&]|\\&|g')
  sed "s/DOMAIN_PLACEHOLDER/${domain_escaped}/g" "$2"
}

if [ -f "$NGINX_CONF_SRC" ]; then
  # Detect which installed config exists (staging is checked before prod).
  if [ -f "$NGINX_CONF_DEST" ]; then
    NGINX_CONF_INSTALLED="$NGINX_CONF_DEST"
  elif [ -f "$NGINX_CONF_DEST_PROD" ]; then
    NGINX_CONF_INSTALLED="$NGINX_CONF_DEST_PROD"
  else
    NGINX_CONF_INSTALLED=""
  fi

  if [ -n "$NGINX_CONF_INSTALLED" ]; then
    # The domain is recovered from the installed config (first server_name line).
    DOMAIN=$(_first_server_name "$NGINX_CONF_INSTALLED")
    if [ -n "$DOMAIN" ]; then
      NEW_CONF=$(_render_nginx_conf "$DOMAIN" "$NGINX_CONF_SRC")
      CURRENT_CONF=$(cat "$NGINX_CONF_INSTALLED")
      if [ "$NEW_CONF" != "$CURRENT_CONF" ]; then
        log "[0/6] nginx config changed — applying update for $DOMAIN..."
        printf '%s\n' "$NEW_CONF" | sudo tee "$NGINX_CONF_INSTALLED" > /dev/null
        # Validate before reloading; on failure restore the previous content
        # and carry on with the deploy.
        if sudo nginx -t 2>&1; then
          sudo nginx -s reload
          log "nginx config updated and reloaded."
        else
          log "WARNING: new nginx config failed validation — reverting."
          printf '%s\n' "$CURRENT_CONF" | sudo tee "$NGINX_CONF_INSTALLED" > /dev/null
        fi
      else
        log "[0/6] nginx config unchanged — skipping reload."
      fi
    else
      log "[0/6] no server_name found in $NGINX_CONF_INSTALLED — skipping nginx config update."
    fi
  fi
fi
176
177 # ── Step 1: Login to ECR and pull new image ───────────────────────────────────
178
log "[1/6] Pulling image from ECR..."

# Authenticate docker against the registry. The token is piped on stdin so it
# never appears in a process argument list.
aws ecr get-login-password --region "$REGION" \
  | sudo docker login -u AWS --password-stdin "$ECR_REGISTRY"

# Fetch the image to deploy; nothing running is touched by this step.
sudo docker pull "$FULL_IMAGE"

log "Pull complete."
184
185 # ── Step 2: Run migrations against the live DB ────────────────────────────────
186
log "[2/6] Running migrations..."

# Run a one-off command inside a throwaway container of the new image.
# The container joins the musehub internal network, loads the app's .env file
# and has SKIP_MIGRATIONS=0 set; it is removed on exit (--rm).
# Globals:   APP_DIR (read), FULL_IMAGE (read)
# Arguments: the command line to run inside the container
# Returns:   the exit status of `sudo docker run`
_alembic() {
  local -a run_opts=(
    --rm
    --network musehub_musehub-internal
    --env-file "$APP_DIR/.env"
    -e SKIP_MIGRATIONS=0
  )
  sudo docker run "${run_opts[@]}" "$FULL_IMAGE" "$@"
}
196
# First attempt: a plain upgrade to head. If it fails (e.g. a stale revision
# ID left over from a migration history reset), Alembic's tracking is
# re-anchored by stamping head, and the upgrade is retried. The retry is a
# no-op when the schema already matches head.
# NOTE(review): the stamp also runs when the upgrade failed for some other
# reason; in that case the schema gate below is the check that has to catch
# the drift — confirm this is the intended safety net.
UPGRADE_STATUS=0
_alembic alembic upgrade head || UPGRADE_STATUS=$?
if [ "$UPGRADE_STATUS" -ne 0 ]; then
  log "upgrade head failed — re-anchoring Alembic revision to head and retrying..."
  _alembic alembic stamp --purge head
  _alembic alembic upgrade head
fi
log "Migrations complete."

# Schema parity gate — hard fail. Uses the same benign-diff filter as the S2
# test (alembic_version table, semantically-equivalent server_default variants,
# column comments) so spurious false positives never block a deploy.
if ! _alembic python -m musehub.db.schema_gate; then
  die "Schema gate failed — ORM drift detected. Write a migration (alembic revision --autogenerate) before deploying."
fi
212
213 # ── Step 3: Start the new slot ────────────────────────────────────────────────
214
log "[3/6] Starting $NEW_SLOT on port $NEW_PORT..."

# A failed earlier deploy may have left a container with this name behind;
# clear it out first (errors are ignored when there is nothing to remove).
sudo docker rm -f "$NEW_CONTAINER" 2>/dev/null || true

# Options for the new slot's container:
#  - SKIP_MIGRATIONS=1: migrations were already applied in Step 2.
#  - the container's port 1337 is published on the slot's loopback-only host port.
#  - logs are shipped through the awslogs driver.
# NOTE(review): the log group is fixed to /musehub/staging although Step 0
# also handles a prod nginx config — confirm this is intended for prod hosts.
SLOT_RUN_OPTS=(
  -d
  --name "$NEW_CONTAINER"
  --network musehub_musehub-internal
  --env-file "$APP_DIR/.env"
  -e SKIP_MIGRATIONS=1
  -v musehub_data:/data
  -p "127.0.0.1:${NEW_PORT}:1337"
  --restart unless-stopped
  --log-driver awslogs
  --log-opt awslogs-region=us-east-1
  --log-opt awslogs-group=/musehub/staging
  --log-opt awslogs-stream="$NEW_CONTAINER"
  --log-opt awslogs-create-group=true
)
sudo docker run "${SLOT_RUN_OPTS[@]}" "$FULL_IMAGE"
234
235 # ── Step 4: Health-check the new slot ────────────────────────────────────────
236
# health_check exits the script (via die) if the new slot never becomes
# healthy, so nginx is only flipped once the new slot is answering.
health_check "$HEALTH_URL" "$NEW_SLOT"

# ── Step 5: Flip nginx to the new slot (instant, zero downtime) ───────────────

log "[5/6] Switching nginx to $NEW_SLOT (port $NEW_PORT)..."
nginx_point_to "$NEW_SLOT"

# ── Step 6: Stop the old slot ────────────────────────────────────────────────

log "[6/6] Stopping old slot ($ACTIVE_SLOT)..."
# Stop gracefully before removing. `docker rm -f` alone kills the container
# immediately, which would cut off requests nginx is still proxying to the old
# slot right after the reload. `docker stop` sends the stop signal first and
# only kills after the grace period. Override with STOP_GRACE_SECONDS.
STOP_GRACE_SECONDS="${STOP_GRACE_SECONDS:-10}"
# Best-effort: the old container may not exist (e.g. on the first deploy).
sudo docker stop -t "$STOP_GRACE_SECONDS" "$OLD_CONTAINER" > /dev/null 2>&1 || true
sudo docker rm -f "$OLD_CONTAINER" 2>/dev/null || true
248
249 # ── Step 7: Restart the background worker ────────────────────────────────────
250
log "[7/7] Restarting background worker..."

# Drop the previous worker container, if any (errors ignored when absent).
sudo docker rm -f musehub-worker 2>/dev/null || true

# The worker runs the same image as the web slot, with `python -m
# musehub.worker` as its command. It publishes no port and has the image's
# healthcheck disabled.
WORKER_RUN_OPTS=(
  -d
  --name musehub-worker
  --network musehub_musehub-internal
  --env-file "$APP_DIR/.env"
  -e SKIP_MIGRATIONS=1
  -v musehub_data:/data
  --restart unless-stopped
  --no-healthcheck
  --log-driver awslogs
  --log-opt awslogs-region=us-east-1
  --log-opt awslogs-group=/musehub/staging
  --log-opt awslogs-stream=musehub-worker
  --log-opt awslogs-create-group=true
)
sudo docker run "${WORKER_RUN_OPTS[@]}" "$FULL_IMAGE" python -m musehub.worker

log "Worker started."
268
269 # ── Step 8: Prune old images (keep last 3) ───────────────────────────────────
270
log "[8/8] Pruning old images (keeping last 3)..."
KEEP_IMAGES=3

# Image IDs for this repository in the order docker lists them (presumably
# newest first — confirm), de-duplicated because one image can carry several
# tags, with the first KEEP_IMAGES entries skipped.
OLD_IDS=$(
  sudo docker images "$ECR_IMAGE" --format "{{.ID}}" \
    | awk '!seen[$0]++' \
    | tail -n "+$((KEEP_IMAGES + 1))"
)

if [ -z "$OLD_IDS" ]; then
  log "No old images to prune."
else
  # Best-effort removal: failures are ignored so a prune problem never fails
  # an otherwise successful deploy.
  xargs sudo docker rmi -f <<< "$OLD_IDS" 2>/dev/null || true
  log "Image prune complete."
fi

# Final summary.
log ""
log "Deploy complete. Active slot: $NEW_SLOT (port $NEW_PORT)"
log "Image: $FULL_IMAGE"
File History 1 commit
sha256:7d6dd8f4a89e2d1fef2d84f6e65feaff51385d382f466766b7f690a22ec18e32 fix: fall back to DB ancestry check when mpack-only fast-fo… Sonnet 4.6 patch 6 days ago