bot_throttle.py
python
sha256:7d6dd8f4a89e2d1fef2d84f6e65feaff51385d382f466766b7f690a22ec18e32
fix: fall back to DB ancestry check when mpack-only fast-fo…
Sonnet 4.6
patch
7 days ago
| 1 | """ASGI middleware: throttle unauthenticated scrapers and headless bots. |
| 2 | |
| 3 | Strategy |
| 4 | -------- |
| 5 | Agents and humans are equal first-class principals on MuseHub. A request |
| 6 | that carries a valid ``Authorization: MSign ...`` header is a cryptographically |
| 7 | authenticated principal — the UA string is irrelevant. Bot throttling only |
| 8 | applies to *unauthenticated* requests. |
| 9 | |
| 10 | 1. ``Authorization: MSign ...`` present → pass through immediately. |
| 11 | Identity is proven; UA-based heuristics do not apply. |
| 12 | 2. Exempt paths (``/healthz``, ``/static/*``, ``/mcp``, ``/api/health/*``, ``/install.sh``, ``/uninstall.sh``, ``/releases/*``) → pass through. |
| 13 | 3. GET / HEAD requests → pass through without UA checks. |
| 14 | Read operations on public data are idempotent and safe. Per-route |
| 15 | slowapi limits already cap request rates; UA-based blocking adds no |
| 16 | meaningful protection here and would block legitimate tooling (curl, |
| 17 | scripts, agent monitors) reading public pages. |
| 18 | 4. For all other methods: missing / blank User-Agent → 429 (except under ``/o/``, where CloudFront strips the UA). |
| 19 | 5. For all other methods: known-bad User-Agent patterns → 429. |
| 20 | Stops commodity scanners cheaply before they reach the app. |
| 21 | 6. All other traffic passes through unchanged. |
| 22 | |
| 23 | This is a defence-in-depth layer on top of per-route slowapi limits. A |
| 24 | well-resourced attacker can spoof User-Agents, so this is not the primary |
| 25 | rate-limiting mechanism — it catches cheap noise on write paths before it |
| 26 | reaches the app. |
| 27 | |
| 28 | Legitimate unauthenticated clients: |
| 29 | - Browsers (standard UA strings) |
| 30 | - curl / scripts reading public pages (GET/HEAD — always exempt from UA checks) |
| 31 | - muse CLI (sends ``muse/<version>``) — also authenticates via MSign |
| 32 | |
| 33 | Legitimate authenticated clients (always bypass UA checks): |
| 34 | - muse CLI push/pull (MSign-signed wire-protocol requests) |
| 35 | - agentception workers (MSign-signed API calls) |
| 36 | - Any agent with a registered Ed25519 keypair |
| 37 | """ |
| 38 | |
| 39 | import re |
| 40 | |
| 41 | from starlette.types import ASGIApp, Receive, Scope, Send |
| 42 | |
# User-Agent signatures of commodity scrapers, vulnerability scanners, and
# automation defaults that have no legitimate interactive use on MuseHub.
# Every source is compiled case-insensitively. Sources that begin with ``^``
# only match at the start of the UA string; the others match anywhere in it.
_BAD_UA_PATTERNS: list[re.Pattern[str]] = [
    re.compile(source, flags=re.IGNORECASE)
    for source in (
        r"^python-requests/",  # stock `requests` UA: almost always automation
        r"^Go-http-client/",  # Go stdlib HTTP client: automation, not browsing
        r"^curl/",  # stock curl UA: scripts and scanners
        r"^wget/",  # wget downloads
        r"scrapy",  # Scrapy crawling framework
        r"masscan",  # port / content scanner
        r"nikto",  # web vulnerability scanner
        r"sqlmap",  # SQL injection tool
        r"nmap",  # network mapper
        r"zgrab",  # internet-wide scanner
        r"nuclei",  # security scanner
        r"^(Java|Jakarta)/\d",  # Java HttpURLConnection / Jakarta HTTP
        r"libwww-perl",  # Perl LWP automation
        r"^WinHTTP",  # Windows scripting
    )
]
| 64 | |
# Path prefixes that are never subject to bot detection. The request path is
# compared against each entry with ``str.startswith``.
_EXEMPT_PREFIXES = (
    "/healthz",  # monitoring agents
    "/static/",  # CDN edge pre-fetch
    "/mcp",  # MCP clients may have minimal UAs
    "/api/health/",  # NOTE(review): presumably API health probes — confirm
    "/install.sh",  # fetched by curl before muse is installed
    "/uninstall.sh",  # same situation as /install.sh
    "/releases/",  # release binaries; install.sh fetches tarballs via curl
)
| 73 | |
# Canned ASGI messages for a blocked request: a 429 status line with a JSON
# content type and a 60-second Retry-After hint, followed by a single,
# final body chunk carrying the JSON error detail.
_429_START = {
    "type": "http.response.start",
    "status": 429,
    "headers": [
        (b"content-type", b"application/json"),
        (b"retry-after", b"60"),
    ],
}
_429_BODY = {
    "type": "http.response.body",
    # Adjacent bytes literals are concatenated at compile time into one value.
    "body": (
        b'{"detail":"Request blocked: unrecognized client. '
        b'Authenticate with MSign or use a registered client."}'
    ),
    "more_body": False,
}
| 87 | |
class BotThrottleMiddleware:
    """Pure-ASGI bot throttle — rejects obvious scrapers before they hit the app.

    Only ``http`` scopes are inspected; every other scope type is forwarded
    untouched. An HTTP request is forwarded to the wrapped app without any
    User-Agent check when any of the following holds:

    * its ``Authorization`` header starts with ``MSign ``,
    * its path starts with one of ``_EXEMPT_PREFIXES``,
    * its method is ``GET`` or ``HEAD``.

    Every other request is answered with the canned 429 response when its
    User-Agent is missing or blank (except under ``/o/``) or matches one of
    ``_BAD_UA_PATTERNS``; otherwise it is forwarded unchanged.
    """

    def __init__(self, app: ASGIApp) -> None:
        """Store *app*, the ASGI application that receives allowed requests."""
        self.app = app

    @staticmethod
    async def _reject(send: Send) -> None:
        """Send the canned 429 response using fresh message dicts.

        The module-level templates are shared by every request, and ``send``
        may be wrapped by other middleware that edits response messages in
        place. Each response therefore gets its own dicts and its own
        headers list so the templates are never modified.
        """
        await send({**_429_START, "headers": list(_429_START["headers"])})
        await send(dict(_429_BODY))

    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
        """Forward or reject a single ASGI connection.

        Args:
            scope: ASGI connection scope; only ``type``, ``path``, ``method``
                and ``headers`` are read.
            receive: ASGI receive callable, passed through to the app as-is.
            send: ASGI send callable, used directly only for 429 responses.
        """
        if scope["type"] != "http":
            await self.app(scope, receive, send)
            return

        path: str = scope.get("path", "")
        method: str = scope.get("method", "")

        ua = ""
        auth = ""
        for name, value in scope.get("headers", ()):
            # latin-1 maps every byte to a character, so decoding cannot fail.
            if name == b"user-agent":
                # Strip so a whitespace-only UA counts as blank and leading
                # whitespace cannot dodge the ``^``-anchored patterns.
                ua = value.decode("latin-1").strip()
            elif name == b"authorization":
                auth = value.decode("latin-1")

        passes_unchecked = (
            # MSign principals bypass UA heuristics. Only the scheme prefix is
            # checked here. NOTE(review): signature verification is presumably
            # done further down the stack — confirm.
            auth.startswith("MSign ")
            # Exempt paths: monitoring probes, static assets, MCP clients,
            # install scripts and release downloads.
            or path.startswith(_EXEMPT_PREFIXES)
            # GET and HEAD are safe read-only methods; per-route slowapi
            # limits already cap request rates on read paths.
            or method in ("GET", "HEAD")
        )
        if passes_unchecked:
            await self.app(scope, receive, send)
            return

        # For all other methods (POST, PUT, PATCH, DELETE, ...): require a UA.
        # CloudFront strips the UA on /o/ CDN paths — exempt that prefix.
        ua_missing = not ua and not path.startswith("/o/")
        if ua_missing or any(pattern.search(ua) for pattern in _BAD_UA_PATTERNS):
            await self._reject(send)
            return

        await self.app(scope, receive, send)
File History
1 commit
sha256:7d6dd8f4a89e2d1fef2d84f6e65feaff51385d382f466766b7f690a22ec18e32
fix: fall back to DB ancestry check when mpack-only fast-fo…
Sonnet 4.6
patch
7 days ago