gabriel / musehub public
test_musehub_sitemap.py python
310 lines 11.3 KB
Raw
sha256:3ff9c9863a9891bdcde71b4a43228f66d0493e38b7cc1d09fe9eb7de774046b2 feat: add repair-commit wire endpoint (API parity with repa… Opus 4.8 minor ⚠ breaking 1 day ago
1 """Tests for the MuseHub sitemap.xml and robots.txt endpoints.
2
3 Covers acceptance criteria:
4 - test_sitemap_returns_xml — GET /sitemap.xml returns 200 with XML content-type
5 - test_sitemap_contains_static_pages — static explore/trending/topics URLs are always present
6 - test_sitemap_contains_public_repo — a seeded public repo appears in the sitemap
7 - test_sitemap_excludes_private_repo — private repos do NOT appear in the sitemap
8 - test_sitemap_contains_user_profile — seeded user profile URL appears in sitemap
9 - test_sitemap_contains_topic_urls — repo tags generate /topics/{tag} entries
10 - test_sitemap_contains_release_url — a release URL appears for repos with releases
11 - test_sitemap_xml_well_formed — sitemap can be parsed as valid XML
12 - test_sitemap_loc_uses_request_host — loc entries use the base URL from the request
13 - test_robots_txt_returns_plain_text — GET /robots.txt returns 200 text/plain
14 - test_robots_txt_allows_musehub_ui — Allow: / is present
15 - test_robots_txt_disallows_settings — settings path is disallowed
16 - test_robots_txt_disallows_api — /api/ directory is disallowed
17 - test_robots_txt_contains_sitemap_url — Sitemap: directive points to /sitemap.xml
18 - test_robots_txt_names_known_agents — known AI bots appear with explicit Allow
19 - test_robots_txt_no_auth_required — endpoint is accessible without authentication
20 - test_sitemap_no_auth_required — sitemap is accessible without authentication
21 """
22 from __future__ import annotations
23
24 import pytest
25 from httpx import AsyncClient
26 from sqlalchemy.ext.asyncio import AsyncSession
27 from xml.etree import ElementTree as ET
28
29 from musehub.core.genesis import compute_identity_id, compute_release_id, compute_repo_id
30 from musehub.db.musehub_identity_models import MusehubIdentity
31 from musehub.db.musehub_release_models import MusehubRelease
32 from musehub.db.musehub_repo_models import MusehubRepo
33
34
35 # ---------------------------------------------------------------------------
36 # Helpers
37 # ---------------------------------------------------------------------------
38
39
async def _make_public_repo(
    db_session: AsyncSession,
    *,
    owner: str = "sitemap-user",
    slug: str = "sitemap-repo",
    tags: list[str] | None = None,
    visibility: str = "public",
) -> MusehubRepo:
    """Insert a ``MusehubRepo`` row, commit it, and return the refreshed object.

    Despite the helper's name, ``visibility`` may be overridden to seed a
    non-public repo. ``tags`` falls back to an empty list when it is omitted
    or empty. The repo id is computed from the owner's identity id, the slug,
    the literal string ``"code"`` and the creation timestamp in ISO format.
    """
    from datetime import datetime, timezone

    # One timestamp is shared by the id derivation and both date columns.
    now = datetime.now(tz=timezone.utc)
    owner_identity = compute_identity_id(owner.encode())
    derived_repo_id = compute_repo_id(owner_identity, slug, "code", now.isoformat())

    seeded = MusehubRepo(
        repo_id=derived_repo_id,
        name=slug,
        owner=owner,
        slug=slug,
        visibility=visibility,
        owner_user_id=owner_identity,
        description="test repo for sitemap",
        tags=tags if tags else [],
        created_at=now,
        updated_at=now,
    )
    db_session.add(seeded)
    await db_session.commit()
    await db_session.refresh(seeded)
    return seeded
68
69
async def _make_profile(
    db_session: AsyncSession,
    *,
    username: str = "sitemap-user",
    user_id: str = "sitemap-user-id",
) -> MusehubIdentity:
    """Insert a ``MusehubIdentity`` of type ``"human"``, commit it, and return it.

    ``username`` is stored as the identity's handle and ``user_id`` as its
    identity id; the returned object has been refreshed from the session.
    """
    profile_fields = {
        "identity_id": user_id,
        "handle": username,
        "identity_type": "human",
    }
    seeded = MusehubIdentity(**profile_fields)
    db_session.add(seeded)
    await db_session.commit()
    await db_session.refresh(seeded)
    return seeded
86
87
async def _make_release(
    db_session: AsyncSession,
    repo_id: str,
    *,
    tag: str = "v1.0",
) -> MusehubRelease:
    """Insert a ``MusehubRelease`` for ``repo_id``, commit it, and return it.

    The release id is computed from the repo id, the tag and the current UTC
    timestamp in ISO format. The title is ``"Release <tag>"``, the body is
    empty and the author is fixed to ``"sitemap-user"``.
    """
    from datetime import datetime, timezone

    # The timestamp only feeds the id derivation; it is not passed to the model.
    stamp = datetime.now(tz=timezone.utc).isoformat()
    derived_release_id = compute_release_id(repo_id, tag, stamp)

    seeded = MusehubRelease(
        release_id=derived_release_id,
        repo_id=repo_id,
        tag=tag,
        title=f"Release {tag}",
        body="",
        author="sitemap-user",
    )
    db_session.add(seeded)
    await db_session.commit()
    await db_session.refresh(seeded)
    return seeded
109
110
111 # ---------------------------------------------------------------------------
112 # Sitemap tests
113 # ---------------------------------------------------------------------------
114
115
async def test_sitemap_returns_xml(client: AsyncClient, db_session: AsyncSession) -> None:
    """Fetch /sitemap.xml and expect HTTP 200 with a content-type mentioning XML."""
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    content_type = resp.headers["content-type"]
    assert "xml" in content_type
121
122
async def test_sitemap_contains_static_pages(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """The sitemap mentions the /explore and /topics pages with no repos seeded by this test."""
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    sitemap = resp.text
    for static_path in ("/explore", "/topics"):
        assert static_path in sitemap
132
133
async def test_sitemap_contains_public_repo(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """Seed public repo ``artist/cool-track`` and expect its path in the sitemap body."""
    await _make_public_repo(db_session, owner="artist", slug="cool-track")
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    sitemap = resp.text
    assert "/artist/cool-track" in sitemap
143
144
async def test_sitemap_excludes_private_repo(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """Seed a private repo and expect neither its slug nor its owner in the sitemap body."""
    await _make_public_repo(
        db_session,
        owner="secretuser",
        slug="hidden-project",
        visibility="private",
    )
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    sitemap = resp.text
    # Slug first, then owner: neither may leak anywhere in the document.
    for private_fragment in ("hidden-project", "secretuser"):
        assert private_fragment not in sitemap
155
156
async def test_sitemap_contains_user_profile(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """Seed an identity with handle ``jazzmaster`` and expect ``/users/jazzmaster`` in the sitemap."""
    await _make_profile(db_session, username="jazzmaster", user_id="jazzmaster-uid")
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    sitemap = resp.text
    assert "/users/jazzmaster" in sitemap
165
166
async def test_sitemap_contains_topic_urls(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """Seed a public repo tagged ``lo-fi`` and ``jazz`` and expect a /topics/ path for each tag."""
    await _make_public_repo(
        db_session,
        owner="producer",
        slug="beats",
        tags=["lo-fi", "jazz"],
    )
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    sitemap = resp.text
    for topic_path in ("/topics/lo-fi", "/topics/jazz"):
        assert topic_path in sitemap
177
178
async def test_sitemap_contains_release_url(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """Seed a public repo with release ``v1.0`` and expect its release path in the sitemap."""
    seeded_repo = await _make_public_repo(db_session, owner="bandname", slug="debut-album")
    await _make_release(db_session, seeded_repo.repo_id, tag="v1.0")
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    sitemap = resp.text
    assert "/bandname/debut-album/releases/v1.0" in sitemap
188
189
async def test_sitemap_xml_well_formed(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """The sitemap body parses as XML and its root element's tag ends with ``urlset``."""
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    # ElementTree raises a parse error here when the payload is not well-formed.
    document_root = ET.fromstring(resp.content)
    root_tag = document_root.tag
    assert root_tag.endswith("urlset")
199
200
async def test_sitemap_loc_uses_request_host(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """Every ``<loc>`` entry in the sitemap uses the base URL of the incoming request.

    A public repo is seeded so the sitemap has at least one repo-derived URL.
    The document is parsed and each ``<loc>`` value is checked individually,
    so a single entry with a different origin fails the test.
    """
    await _make_public_repo(db_session, owner="testowner", slug="testrepo")
    response = await client.get("/sitemap.xml")
    assert response.status_code == 200

    root = ET.fromstring(response.content)
    # Match on the tag suffix so the check works whether or not the document
    # qualifies its elements with an XML namespace ("{ns}loc" vs "loc").
    locs = [
        (element.text or "").strip()
        for element in root.iter()
        if element.tag.endswith("loc")
    ]
    assert locs, "sitemap contains no <loc> entries"

    # The test client uses base_url="http://test" — every loc must start with http://test.
    foreign = [loc for loc in locs if not loc.startswith("http://test")]
    assert not foreign, f"loc entries not based on the request host: {foreign}"
211
212
async def test_sitemap_no_auth_required(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """A plain GET of /sitemap.xml, with no credentials added by this test, is not rejected with 401 and returns 200."""
    resp = await client.get("/sitemap.xml")
    status = resp.status_code
    # Checked separately so an auth rejection fails on the more specific assertion.
    assert status != 401
    assert status == 200
220
221
async def test_sitemap_repo_commits_page_included(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """Seed public repo ``composer/symphony-no1`` and expect its /commits path in the sitemap."""
    await _make_public_repo(db_session, owner="composer", slug="symphony-no1")
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    sitemap = resp.text
    assert "/composer/symphony-no1/commits" in sitemap
230
231
async def test_sitemap_repo_issues_page_included(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """Seed public repo ``composer/symphony-no2`` and expect its /issues path in the sitemap."""
    await _make_public_repo(db_session, owner="composer", slug="symphony-no2")
    resp = await client.get("/sitemap.xml")
    assert resp.status_code == 200
    sitemap = resp.text
    assert "/composer/symphony-no2/issues" in sitemap
240
241
242 # ---------------------------------------------------------------------------
243 # Robots.txt tests
244 # ---------------------------------------------------------------------------
245
246
async def test_robots_txt_returns_plain_text(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """Fetch /robots.txt and expect HTTP 200 with a ``text/plain`` content-type."""
    resp = await client.get("/robots.txt")
    assert resp.status_code == 200
    content_type = resp.headers["content-type"]
    assert "text/plain" in content_type
254
255
async def test_robots_txt_allows_musehub_ui(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """robots.txt contains an ``Allow: /`` directive (substring match on the body)."""
    resp = await client.get("/robots.txt")
    assert resp.status_code == 200
    robots = resp.text
    assert "Allow: /" in robots
263
264
async def test_robots_txt_disallows_settings(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """robots.txt contains ``Disallow: /*/settings`` so settings pages are kept out of indexes."""
    resp = await client.get("/robots.txt")
    assert resp.status_code == 200
    robots = resp.text
    assert "Disallow: /*/settings" in robots
272
273
async def test_robots_txt_disallows_api(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """robots.txt contains ``Disallow: /api/`` so crawlers stay off the REST API."""
    resp = await client.get("/robots.txt")
    assert resp.status_code == 200
    robots = resp.text
    assert "Disallow: /api/" in robots
281
282
async def test_robots_txt_contains_sitemap_url(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """A ``Sitemap:`` directive is present and its target ends with ``/sitemap.xml``.

    The directive line itself is inspected, so the test no longer passes when
    ``Sitemap:`` and ``sitemap.xml`` merely appear in unrelated places in the
    file (for example in a comment).
    """
    response = await client.get("/robots.txt")
    assert response.status_code == 200

    # Collect the value of every "Sitemap:" line; split once so the colon in
    # the URL scheme ("http://...") stays part of the value.
    sitemap_targets = [
        line.split(":", 1)[1].strip()
        for line in response.text.splitlines()
        if line.strip().startswith("Sitemap:")
    ]
    assert sitemap_targets, "robots.txt has no Sitemap: directive"
    assert any(target.endswith("/sitemap.xml") for target in sitemap_targets), (
        f"no Sitemap: directive points to /sitemap.xml: {sitemap_targets}"
    )
291
292
async def test_robots_txt_names_known_agents(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """GPTBot, ClaudeBot, Googlebot and CursorBot are each named in robots.txt.

    Only the presence of each name in the body is checked, not the directive
    attached to it.
    """
    resp = await client.get("/robots.txt")
    assert resp.status_code == 200
    robots = resp.text
    expected_agents = ("GPTBot", "ClaudeBot", "Googlebot", "CursorBot")
    for agent in expected_agents:
        assert agent in robots
302
303
async def test_robots_txt_no_auth_required(
    client: AsyncClient, db_session: AsyncSession
) -> None:
    """A plain GET of /robots.txt, with no credentials added by this test, is not rejected with 401 and returns 200."""
    resp = await client.get("/robots.txt")
    status = resp.status_code
    # Checked separately so an auth rejection fails on the more specific assertion.
    assert status != 401
    assert status == 200
File History 1 commit
sha256:3ff9c9863a9891bdcde71b4a43228f66d0493e38b7cc1d09fe9eb7de774046b2 feat: add repair-commit wire endpoint (API parity with repa… Opus 4.8 minor 1 day ago