{
"event": "PreToolUse",
"tool_name": "Write",
"tool_input": {
"file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/infrastructure\/fetchers\/sitemap_loader.py",
"content": "\"\"\"Discovers URLs from sitemap.xml \/ sitemap_index.xml via httpx.\"\"\"\nfrom __future__ import annotations\n\nimport re\nfrom typing import Self\nfrom urllib.parse import urljoin\n\nimport httpx\n\nfrom src.infrastructure.config.constants import HTTP_TIMEOUT_SEC, USER_AGENT\n\n_LOC_RE = re.compile(r\"<loc>\\s*([^<\\s]+)\\s*<\/loc>\", re.IGNORECASE)\n_CANDIDATE_PATHS = (\"sitemap_index.xml\", \"wp-sitemap.xml\", \"sitemap.xml\")\n\n\nclass SitemapLoader:\n def __init__(self, client: httpx.AsyncClient) -> None:\n self._client = client\n\n async def discover(self, base_url: str) -> tuple[str, ...]:\n visited: set[str] = set()\n page_urls: set[str] = set()\n queue: list[str] = [urljoin(base_url, p) for p in _CANDIDATE_PATHS]\n while queue:\n sm_url = queue.pop()\n if sm_url in visited:\n continue\n visited.add(sm_url)\n locs = await self._fetch_locs(sm_url)\n for loc in locs:\n if _looks_like_sitemap(loc):\n queue.append(loc)\n else:\n page_urls.add(loc)\n return tuple(sorted(page_urls))\n\n async def _fetch_locs(self, sitemap_url: str) -> list[str]:\n try:\n response = await self._client.get(sitemap_url)\n except httpx.HTTPError:\n return []\n if response.status_code != httpx.codes.OK:\n return []\n return _LOC_RE.findall(response.text)\n\n @classmethod\n def create(cls) -> Self:\n return cls(httpx.AsyncClient(\n timeout=HTTP_TIMEOUT_SEC,\n follow_redirects=True,\n headers={\"User-Agent\": USER_AGENT},\n ))\n\n async def close(self) -> None:\n await self._client.aclose()\n\n\ndef _looks_like_sitemap(url: str) -> bool:\n lowered = url.lower()\n return lowered.endswith(\".xml\") or \"\/sitemap\" in lowered\n"
}
}
{
"tool_response": {
"type": "create",
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/infrastructure\/fetchers\/sitemap_loader.py",
"content": "\"\"\"Discovers URLs from sitemap.xml \/ sitemap_index.xml via httpx.\"\"\"\nfrom __future__ import annotations\n\nimport re\nfrom typing import Self\nfrom urllib.parse import urljoin\n\nimport httpx\n\nfrom src.infrastructure.config.constants import HTTP_TIMEOUT_SEC, USER_AGENT\n\n_LOC_RE = re.compile(r\"<loc>\\s*([^<\\s]+)\\s*<\/loc>\", re.IGNORECASE)\n_CANDIDATE_PATHS = (\"sitemap_index.xml\", \"wp-sitemap.xml\", \"sitemap.xml\")\n\n\nclass SitemapLoader:\n def __init__(self, client: httpx.AsyncClient) -> None:\n self._client = client\n\n async def discover(self, base_url: str) -> tuple[str, ...]:\n visited: set[str] = set()\n page_urls: set[str] = set()\n queue: list[str] = [urljoin(base_url, p) for p in _CANDIDATE_PATHS]\n while queue:\n sm_url = queue.pop()\n if sm_url in visited:\n continue\n visited.add(sm_url)\n locs = await self._fetch_locs(sm_url)\n for loc in locs:\n if _looks_like_sitemap(loc):\n queue.append(loc)\n else:\n page_urls.add(loc)\n return tuple(sorted(page_urls))\n\n async def _fetch_locs(self, sitemap_url: str) -> list[str]:\n try:\n response = await self._client.get(sitemap_url)\n except httpx.HTTPError:\n return []\n if response.status_code != httpx.codes.OK:\n return []\n return _LOC_RE.findall(response.text)\n\n @classmethod\n def create(cls) -> Self:\n return cls(httpx.AsyncClient(\n timeout=HTTP_TIMEOUT_SEC,\n follow_redirects=True,\n headers={\"User-Agent\": USER_AGENT},\n ))\n\n async def close(self) -> None:\n await self._client.aclose()\n\n\ndef _looks_like_sitemap(url: str) -> bool:\n lowered = url.lower()\n return lowered.endswith(\".xml\") or \"\/sitemap\" in lowered\n",
"structuredPatch": [],
"originalFile": null,
"userModified": false
}
}