gabriel / musehub public
bot_throttle.py python
142 lines 5.9 KB
Raw
sha256:7d6dd8f4a89e2d1fef2d84f6e65feaff51385d382f466766b7f690a22ec18e32 fix: fall back to DB ancestry check when mpack-only fast-fo… Sonnet 4.6 patch 7 days ago
1 """ASGI middleware: throttle unauthenticated scrapers and headless bots.
2
3 Strategy
4 --------
5 Agents and humans are equal first-class principals on MuseHub. A request
6 that carries a valid ``Authorization: MSign ...`` header is a cryptographically
7 authenticated principal — the UA string is irrelevant. Bot throttling only
8 applies to *unauthenticated* requests.
9
10 1. ``Authorization: MSign ...`` present → pass through immediately.
11 Identity is proven; UA-based heuristics do not apply.
12 2. Exempt paths (``/healthz``, ``/api/health/``, ``/static/*``, ``/mcp``, ``/install.sh``, ``/uninstall.sh``, ``/releases/*``) → pass through.
13 3. GET / HEAD requests → pass through without UA checks.
14 Read operations on public data are idempotent and safe. Per-route
15 slowapi limits already cap request rates; UA-based blocking adds no
16 meaningful protection here and would block legitimate tooling (curl,
17 scripts, agent monitors) reading public pages.
18 4. For all other methods: missing / blank User-Agent → 429 (except under ``/o/``, where CloudFront strips the UA).
19 5. For all other methods: known-bad User-Agent patterns → 429.
20 Stops commodity scanners cheaply before they reach the app.
21 6. All other traffic passes through unchanged.
22
23 This is a defence-in-depth layer on top of per-route slowapi limits. A
24 well-resourced attacker can spoof User-Agents, so this is not the primary
25 rate-limiting mechanism — it catches cheap noise on write paths before it
26 reaches the app.
27
28 Legitimate unauthenticated clients:
29 - Browsers (standard UA strings)
30 - curl / scripts reading public pages (GET/HEAD — always exempt from UA checks)
31 - muse CLI (sends ``muse/<version>``) — also authenticates via MSign
32
33 Legitimate authenticated clients (always bypass UA checks):
34 - muse CLI push/pull (MSign-signed wire-protocol requests)
35 - agentception workers (MSign-signed API calls)
36 - Any agent with a registered Ed25519 keypair
37 """
38
39 import re
40
41 from starlette.types import ASGIApp, Receive, Scope, Send
42
# User-Agent signatures of commodity scrapers, vulnerability scanners, and
# automated tools with no legitimate interactive use on MuseHub.  Entries
# starting with ``^`` only match at the beginning of the UA string; the rest
# match anywhere inside it.
_BAD_UA_SOURCES: tuple[str, ...] = (
    r"^python-requests/",     # default requests UA — almost always automation
    r"^Go-http-client/",      # Go stdlib http — automation, no human browsing
    r"^curl/",                # default curl UA — scripts / scanners
    r"^wget/",                # wget
    r"scrapy",                # Scrapy framework
    r"masscan",               # port/content scanner
    r"nikto",                 # web vulnerability scanner
    r"sqlmap",                # SQL injection tool
    r"nmap",                  # network mapper
    r"zgrab",                 # internet-wide scanner
    r"nuclei",                # security scanner
    r"^(Java|Jakarta)/\d",    # Java HttpURLConnection / Jakarta HTTP
    r"libwww-perl",           # Perl LWP automation
    r"^WinHTTP",              # Windows scripting
)

# Compiled once at import time, case-insensitively, in the order listed above.
_BAD_UA_PATTERNS: list[re.Pattern[str]] = [
    re.compile(source, re.IGNORECASE) for source in _BAD_UA_SOURCES
]
64
# Path prefixes that bot detection must never block.  Kept as a tuple and
# matched with ``str.startswith``, so each entry also covers everything
# beneath it.
_EXEMPT_PREFIXES = (
    "/healthz",        # monitoring agents
    "/static/",        # CDN edge pre-fetch
    "/mcp",            # MCP clients may have minimal UAs
    "/api/health/",    # NOTE(review): presumably the API health probe — confirm
    "/install.sh",     # fetched by curl before muse is installed
    "/uninstall.sh",   # same
    "/releases/",      # release binary downloads; install.sh fetches tarballs via curl
)
73
# Canned ASGI "response start" message for a throttled request: HTTP 429 with
# a JSON content type and a 60-second Retry-After hint.
_429_HEADERS = [
    (b"content-type", b"application/json"),
    (b"retry-after", b"60"),
]
_429_START = {"type": "http.response.start", "status": 429, "headers": _429_HEADERS}
# Canned ASGI "response body" message paired with the 429 start message: a
# single, final chunk carrying the JSON error payload.
_429_PAYLOAD = b'{"detail":"Request blocked: unrecognized client. Authenticate with MSign or use a registered client."}'
_429_BODY = {"type": "http.response.body", "body": _429_PAYLOAD, "more_body": False}
87
class BotThrottleMiddleware:
    """Pure-ASGI bot throttle — rejects obvious scrapers before they hit the app.

    Only ``http`` scopes are inspected; every other scope type is forwarded
    to the wrapped application untouched.  For HTTP requests the decision is
    delegated to :meth:`_should_block`: blocked requests are answered here
    with the canned 429 response, everything else is forwarded unchanged.

    Args:
        app: The downstream ASGI application to wrap.
    """

    def __init__(self, app: ASGIApp) -> None:
        self.app = app

    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
        """Handle one ASGI connection: reject it with 429 or forward it."""
        if scope["type"] == "http" and self._should_block(scope):
            # Short-circuit: the wrapped app never sees the request.
            await send(_429_START)
            await send(_429_BODY)
            return

        await self.app(scope, receive, send)

    @staticmethod
    def _should_block(scope: Scope) -> bool:
        """Return ``True`` when the HTTP request in *scope* must get a 429.

        Checks run in this order; the first one that applies decides:

        1. ``Authorization`` header starting with ``MSign `` → allow.
        2. Path starts with one of ``_EXEMPT_PREFIXES`` → allow.
        3. Method is ``GET`` or ``HEAD`` → allow.
        4. User-Agent missing or blank → block, unless the path is under
           ``/o/``.
        5. User-Agent matches one of ``_BAD_UA_PATTERNS`` → block.
        6. Otherwise → allow.
        """
        ua = ""
        auth = ""
        # ASGI delivers header names lower-cased as bytes.  If a header is
        # repeated, the last occurrence wins.
        for name, value in scope.get("headers", ()):
            if name == b"user-agent":
                ua = value.decode("latin-1", errors="replace")
            elif name == b"authorization":
                auth = value.decode("latin-1", errors="replace")

        # Authenticated principals bypass UA checks entirely.  Only the
        # scheme prefix is inspected here; this middleware does not verify
        # the signature.
        # NOTE(review): confirm the MSign signature is validated downstream.
        if auth.startswith("MSign "):
            return False

        # Exempt paths: monitoring probes, static assets, MCP clients,
        # installer scripts and release downloads.
        path: str = scope.get("path", "")
        if path.startswith(_EXEMPT_PREFIXES):
            return False

        # GET and HEAD are safe read-only methods — exempt from UA checks.
        # Per-route slowapi limits already cap request rates on read paths.
        method: str = scope.get("method", "")
        if method in ("GET", "HEAD"):
            return False

        # For all other methods (POST, PUT, PATCH, DELETE): require a UA.
        # Strip first so a whitespace-only value counts as blank and stray
        # leading whitespace cannot dodge the ``^``-anchored patterns.
        ua = ua.strip()
        if not ua:
            # CloudFront strips the UA on /o/ CDN paths — exempt that prefix.
            return not path.startswith("/o/")

        return any(pattern.search(ua) for pattern in _BAD_UA_PATTERNS)
File History 1 commit
sha256:7d6dd8f4a89e2d1fef2d84f6e65feaff51385d382f466766b7f690a22ec18e32 fix: fall back to DB ancestry check when mpack-only fast-fo… Sonnet 4.6 patch 7 days ago