Log #33890

ID: 33890
Timestamp: 2026-04-21 03:42:54.088778
Client: root
IP: 62.246.34.195
Model: claude-sonnet-4-20250514
Status: completed
Tokens: 4,266 (Input: 1,144, Output: 3,122)
Duration: 62 ms
Request time: 2026-04-21 03:42:54.088778
Response time: 2026-04-21 03:42:54.150904

Request

{
    "event": "PreToolUse",
    "tool_name": "Write",
    "tool_input": {
        "file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/interfaces\/cli\/commands\/crawl.py",
        "content": "from __future__ import annotations\n\nimport asyncio\nfrom collections.abc import Awaitable, Callable\nfrom dataclasses import dataclass\nfrom datetime import datetime\nfrom urllib.parse import urlsplit\n\nfrom src.application.use_cases.start_crawl import StartCrawlInput\nfrom src.domain.entities.url import Url\nfrom src.domain.ports.http_fetcher_port import HttpFetcherPort\nfrom src.domain.value_objects.enums import CrawlMode, CrawlStatus, RenderMode, TriggerSource\nfrom src.infrastructure.config.constants import HTTP_CONCURRENCY, PLAYWRIGHT_CONCURRENCY\nfrom src.infrastructure.fetchers.httpx_fetcher import HttpxFetcher\nfrom src.infrastructure.fetchers.playwright_fetcher import PlaywrightFetcher\nfrom src.infrastructure.fetchers.playwright_http_adapter import PlaywrightHttpAdapter\nfrom src.infrastructure.fetchers.sitemap_loader import SitemapLoader\nfrom src.interfaces.cli.wiring import Container\nfrom src.interfaces.cli.worker import QueueWorker, WorkerConfig\n\n\n@dataclass(frozen=True, slots=True)\nclass CrawlCliInput:\n    base_url: str\n    mode: CrawlMode\n    trigger: TriggerSource\n    concurrency: int = HTTP_CONCURRENCY\n    max_urls: int | None = None\n    seed_from_sitemap: bool = True\n\n\n@dataclass(frozen=True, slots=True)\nclass _FetcherChoice:\n    fetcher: HttpFetcherPort\n    render_mode: RenderMode\n    concurrency: int\n    close: Callable[[], Awaitable[None]]\n\n\nasync def _build_fetcher(command: CrawlCliInput) -> _FetcherChoice:\n    if command.mode is CrawlMode.FAST:\n        http = HttpxFetcher.create()\n        return _FetcherChoice(http, RenderMode.HTTP, command.concurrency, http.close)\n    browser = await PlaywrightFetcher.create()\n    adapter = PlaywrightHttpAdapter(browser)\n    concurrency = min(command.concurrency, PLAYWRIGHT_CONCURRENCY)\n    return _FetcherChoice(adapter, RenderMode.PLAYWRIGHT, concurrency, browser.close)\n\n\nasync def _seed_from_sitemap(container: Container, *, crawl_id: int, base_url: str) -> int:\n    loader = SitemapLoader.create()\n    try:\n        discovered = await loader.discover(base_url)\n    finally:\n        await loader.close()\n    registered = urlsplit(base_url).hostname or \"\"\n    now = container.clock.now()\n    enqueued = 0\n    for raw in discovered:\n        normalized = container.normalizer.normalize(raw)\n        if not _is_internal(normalized.host, registered):\n            continue\n        container.repos.urls.upsert(_seed_url(normalized, now))\n        if container.queue.enqueue(crawl_id, normalized):\n            enqueued += 1\n    return enqueued\n\n\ndef _is_internal(host: str, registered: str) -> bool:\n    return host == registered or host.endswith(\".\" + registered)\n\n\ndef _seed_url(normalized: object, now: datetime) -> Url:\n    return Url(\n        id=None,\n        normalized=normalized,  # type: ignore[arg-type]\n        is_internal=True,\n        first_seen_at=now,\n        last_seen_at=now,\n    )\n\n\nasync def run_crawl(container: Container, command: CrawlCliInput) -> int:\n    crawl = container.start_crawl().execute(\n        StartCrawlInput(\n            base_url=command.base_url, mode=command.mode,\n            trigger_source=command.trigger, config={},\n        ),\n    )\n    if crawl.id is None:\n        raise RuntimeError(\"start_crawl did not assign id\")\n    if command.seed_from_sitemap:\n        await _seed_from_sitemap(container, crawl_id=crawl.id, base_url=command.base_url)\n    choice = await _build_fetcher(command)\n    registered_domain = 
urlsplit(command.base_url).hostname or \"\"\n    try:\n        process_uc = container.process_url(\n            choice.fetcher,\n            registered_domain=registered_domain,\n            render_mode=choice.render_mode,\n        )\n        worker = QueueWorker(\n            queue=container.queue, process_url=process_uc,\n            errors=container.repos.errors, clock=container.clock,\n        )\n        processed = await worker.run(WorkerConfig(\n            crawl_id=crawl.id, concurrency=choice.concurrency, max_urls=command.max_urls,\n        ))\n    finally:\n        await choice.close()\n    container.finalize().execute(\n        crawl_id=crawl.id, status=CrawlStatus.COMPLETED, total_urls=processed,\n    )\n    return crawl.id\n\n\ndef sync_run_crawl(container: Container, command: CrawlCliInput) -> int:\n    return asyncio.run(run_crawl(container, command))\n"
    }
}
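
For context, this PreToolUse payload is what a hook would receive before the Write executes. Below is a minimal sketch of such a hook, assuming the payload arrives as JSON on stdin and that a non-zero "blocking" exit code rejects the call; the allowed-root policy and the script itself are hypothetical illustrations, not part of the logged tooling.

#!/usr/bin/env python3
"""Guard Write calls to a project root (illustrative sketch)."""
import json
import sys

# Hypothetical policy: only allow writes under the crawler project.
ALLOWED_ROOT = "/var/www/dev.campus.systemische-tools.de/crawler/"

def main() -> int:
    event = json.load(sys.stdin)  # the PreToolUse payload shown above
    if event.get("tool_name") != "Write":
        return 0  # only Write calls are checked here
    path = event.get("tool_input", {}).get("file_path", "")
    if not path.startswith(ALLOWED_ROOT):
        print(f"refusing Write outside project root: {path}", file=sys.stderr)
        return 2  # blocking exit code: reject the tool call
    return 0  # allow: the Write logged above passes this check

if __name__ == "__main__":
    sys.exit(main())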

Response

{
    "tool_response": {
        "type": "update",
        "filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/interfaces\/cli\/commands\/crawl.py",
        "content": "from __future__ import annotations\n\nimport asyncio\nfrom collections.abc import Awaitable, Callable\nfrom dataclasses import dataclass\nfrom datetime import datetime\nfrom urllib.parse import urlsplit\n\nfrom src.application.use_cases.start_crawl import StartCrawlInput\nfrom src.domain.entities.url import Url\nfrom src.domain.ports.http_fetcher_port import HttpFetcherPort\nfrom src.domain.value_objects.enums import CrawlMode, CrawlStatus, RenderMode, TriggerSource\nfrom src.infrastructure.config.constants import HTTP_CONCURRENCY, PLAYWRIGHT_CONCURRENCY\nfrom src.infrastructure.fetchers.httpx_fetcher import HttpxFetcher\nfrom src.infrastructure.fetchers.playwright_fetcher import PlaywrightFetcher\nfrom src.infrastructure.fetchers.playwright_http_adapter import PlaywrightHttpAdapter\nfrom src.infrastructure.fetchers.sitemap_loader import SitemapLoader\nfrom src.interfaces.cli.wiring import Container\nfrom src.interfaces.cli.worker import QueueWorker, WorkerConfig\n\n\n@dataclass(frozen=True, slots=True)\nclass CrawlCliInput:\n    base_url: str\n    mode: CrawlMode\n    trigger: TriggerSource\n    concurrency: int = HTTP_CONCURRENCY\n    max_urls: int | None = None\n    seed_from_sitemap: bool = True\n\n\n@dataclass(frozen=True, slots=True)\nclass _FetcherChoice:\n    fetcher: HttpFetcherPort\n    render_mode: RenderMode\n    concurrency: int\n    close: Callable[[], Awaitable[None]]\n\n\nasync def _build_fetcher(command: CrawlCliInput) -> _FetcherChoice:\n    if command.mode is CrawlMode.FAST:\n        http = HttpxFetcher.create()\n        return _FetcherChoice(http, RenderMode.HTTP, command.concurrency, http.close)\n    browser = await PlaywrightFetcher.create()\n    adapter = PlaywrightHttpAdapter(browser)\n    concurrency = min(command.concurrency, PLAYWRIGHT_CONCURRENCY)\n    return _FetcherChoice(adapter, RenderMode.PLAYWRIGHT, concurrency, browser.close)\n\n\nasync def _seed_from_sitemap(container: Container, *, crawl_id: int, base_url: str) -> int:\n    loader = SitemapLoader.create()\n    try:\n        discovered = await loader.discover(base_url)\n    finally:\n        await loader.close()\n    registered = urlsplit(base_url).hostname or \"\"\n    now = container.clock.now()\n    enqueued = 0\n    for raw in discovered:\n        normalized = container.normalizer.normalize(raw)\n        if not _is_internal(normalized.host, registered):\n            continue\n        container.repos.urls.upsert(_seed_url(normalized, now))\n        if container.queue.enqueue(crawl_id, normalized):\n            enqueued += 1\n    return enqueued\n\n\ndef _is_internal(host: str, registered: str) -> bool:\n    return host == registered or host.endswith(\".\" + registered)\n\n\ndef _seed_url(normalized: object, now: datetime) -> Url:\n    return Url(\n        id=None,\n        normalized=normalized,  # type: ignore[arg-type]\n        is_internal=True,\n        first_seen_at=now,\n        last_seen_at=now,\n    )\n\n\nasync def run_crawl(container: Container, command: CrawlCliInput) -> int:\n    crawl = container.start_crawl().execute(\n        StartCrawlInput(\n            base_url=command.base_url, mode=command.mode,\n            trigger_source=command.trigger, config={},\n        ),\n    )\n    if crawl.id is None:\n        raise RuntimeError(\"start_crawl did not assign id\")\n    if command.seed_from_sitemap:\n        await _seed_from_sitemap(container, crawl_id=crawl.id, base_url=command.base_url)\n    choice = await _build_fetcher(command)\n    registered_domain = 
urlsplit(command.base_url).hostname or \"\"\n    try:\n        process_uc = container.process_url(\n            choice.fetcher,\n            registered_domain=registered_domain,\n            render_mode=choice.render_mode,\n        )\n        worker = QueueWorker(\n            queue=container.queue, process_url=process_uc,\n            errors=container.repos.errors, clock=container.clock,\n        )\n        processed = await worker.run(WorkerConfig(\n            crawl_id=crawl.id, concurrency=choice.concurrency, max_urls=command.max_urls,\n        ))\n    finally:\n        await choice.close()\n    container.finalize().execute(\n        crawl_id=crawl.id, status=CrawlStatus.COMPLETED, total_urls=processed,\n    )\n    return crawl.id\n\n\ndef sync_run_crawl(container: Container, command: CrawlCliInput) -> int:\n    return asyncio.run(run_crawl(container, command))\n",
        "structuredPatch": [
            {
                "oldStart": 3,
                "oldLines": 15,
                "newStart": 3,
                "newLines": 18,
                "lines": [
                    " import asyncio",
                    " from collections.abc import Awaitable, Callable",
                    " from dataclasses import dataclass",
                    "+from datetime import datetime",
                    " from urllib.parse import urlsplit",
                    " ",
                    " from src.application.use_cases.start_crawl import StartCrawlInput",
                    "+from src.domain.entities.url import Url",
                    " from src.domain.ports.http_fetcher_port import HttpFetcherPort",
                    " from src.domain.value_objects.enums import CrawlMode, CrawlStatus, RenderMode, TriggerSource",
                    " from src.infrastructure.config.constants import HTTP_CONCURRENCY, PLAYWRIGHT_CONCURRENCY",
                    " from src.infrastructure.fetchers.httpx_fetcher import HttpxFetcher",
                    " from src.infrastructure.fetchers.playwright_fetcher import PlaywrightFetcher",
                    " from src.infrastructure.fetchers.playwright_http_adapter import PlaywrightHttpAdapter",
                    "+from src.infrastructure.fetchers.sitemap_loader import SitemapLoader",
                    " from src.interfaces.cli.wiring import Container",
                    " from src.interfaces.cli.worker import QueueWorker, WorkerConfig",
                    " "
                ]
            },
            {
                "oldStart": 23,
                "oldLines": 6,
                "newStart": 26,
                "newLines": 7,
                "lines": [
                    "     trigger: TriggerSource",
                    "     concurrency: int = HTTP_CONCURRENCY",
                    "     max_urls: int | None = None",
                    "+    seed_from_sitemap: bool = True",
                    " ",
                    " ",
                    " @dataclass(frozen=True, slots=True)"
                ]
            },
            {
                "oldStart": 43,
                "oldLines": 17,
                "newStart": 47,
                "newLines": 50,
                "lines": [
                    "     return _FetcherChoice(adapter, RenderMode.PLAYWRIGHT, concurrency, browser.close)",
                    " ",
                    " ",
                    "+async def _seed_from_sitemap(container: Container, *, crawl_id: int, base_url: str) -> int:",
                    "+    loader = SitemapLoader.create()",
                    "+    try:",
                    "+        discovered = await loader.discover(base_url)",
                    "+    finally:",
                    "+        await loader.close()",
                    "+    registered = urlsplit(base_url).hostname or \"\"",
                    "+    now = container.clock.now()",
                    "+    enqueued = 0",
                    "+    for raw in discovered:",
                    "+        normalized = container.normalizer.normalize(raw)",
                    "+        if not _is_internal(normalized.host, registered):",
                    "+            continue",
                    "+        container.repos.urls.upsert(_seed_url(normalized, now))",
                    "+        if container.queue.enqueue(crawl_id, normalized):",
                    "+            enqueued += 1",
                    "+    return enqueued",
                    "+",
                    "+",
                    "+def _is_internal(host: str, registered: str) -> bool:",
                    "+    return host == registered or host.endswith(\".\" + registered)",
                    "+",
                    "+",
                    "+def _seed_url(normalized: object, now: datetime) -> Url:",
                    "+    return Url(",
                    "+        id=None,",
                    "+        normalized=normalized,  # type: ignore[arg-type]",
                    "+        is_internal=True,",
                    "+        first_seen_at=now,",
                    "+        last_seen_at=now,",
                    "+    )",
                    "+",
                    "+",
                    " async def run_crawl(container: Container, command: CrawlCliInput) -> int:",
                    "     crawl = container.start_crawl().execute(",
                    "         StartCrawlInput(",
                    "-            base_url=command.base_url,",
                    "-            mode=command.mode,",
                    "-            trigger_source=command.trigger,",
                    "-            config={},",
                    "+            base_url=command.base_url, mode=command.mode,",
                    "+            trigger_source=command.trigger, config={},",
                    "         ),",
                    "     )",
                    "     if crawl.id is None:",
                    "         raise RuntimeError(\"start_crawl did not assign id\")",
                    "+    if command.seed_from_sitemap:",
                    "+        await _seed_from_sitemap(container, crawl_id=crawl.id, base_url=command.base_url)",
                    "     choice = await _build_fetcher(command)",
                    "     registered_domain = urlsplit(command.base_url).hostname or \"\"",
                    "     try:"
                ]
            },
            {
                "oldStart": 63,
                "oldLines": 24,
                "newStart": 100,
                "newLines": 16,
                "lines": [
                    "             render_mode=choice.render_mode,",
                    "         )",
                    "         worker = QueueWorker(",
                    "-            queue=container.queue,",
                    "-            process_url=process_uc,",
                    "-            errors=container.repos.errors,",
                    "-            clock=container.clock,",
                    "+            queue=container.queue, process_url=process_uc,",
                    "+            errors=container.repos.errors, clock=container.clock,",
                    "         )",
                    "-        processed = await worker.run(",
                    "-            WorkerConfig(",
                    "-                crawl_id=crawl.id,",
                    "-                concurrency=choice.concurrency,",
                    "-                max_urls=command.max_urls,",
                    "-            )",
                    "-        )",
                    "+        processed = await worker.run(WorkerConfig(",
                    "+            crawl_id=crawl.id, concurrency=choice.concurrency, max_urls=command.max_urls,",
                    "+        ))",
                    "     finally:",
                    "         await choice.close()",
                    "     container.finalize().execute(",
                    "-        crawl_id=crawl.id,",
                    "-        status=CrawlStatus.COMPLETED,",
                    "-        total_urls=processed,",
                    "+        crawl_id=crawl.id, status=CrawlStatus.COMPLETED, total_urls=processed,",
                    "     )",
                    "     return crawl.id",
                    " "
                ]
            }
        ],
        "originalFile": "from __future__ import annotations\n\nimport asyncio\nfrom collections.abc import Awaitable, Callable\nfrom dataclasses import dataclass\nfrom urllib.parse import urlsplit\n\nfrom src.application.use_cases.start_crawl import StartCrawlInput\nfrom src.domain.ports.http_fetcher_port import HttpFetcherPort\nfrom src.domain.value_objects.enums import CrawlMode, CrawlStatus, RenderMode, TriggerSource\nfrom src.infrastructure.config.constants import HTTP_CONCURRENCY, PLAYWRIGHT_CONCURRENCY\nfrom src.infrastructure.fetchers.httpx_fetcher import HttpxFetcher\nfrom src.infrastructure.fetchers.playwright_fetcher import PlaywrightFetcher\nfrom src.infrastructure.fetchers.playwright_http_adapter import PlaywrightHttpAdapter\nfrom src.interfaces.cli.wiring import Container\nfrom src.interfaces.cli.worker import QueueWorker, WorkerConfig\n\n\n@dataclass(frozen=True, slots=True)\nclass CrawlCliInput:\n    base_url: str\n    mode: CrawlMode\n    trigger: TriggerSource\n    concurrency: int = HTTP_CONCURRENCY\n    max_urls: int | None = None\n\n\n@dataclass(frozen=True, slots=True)\nclass _FetcherChoice:\n    fetcher: HttpFetcherPort\n    render_mode: RenderMode\n    concurrency: int\n    close: Callable[[], Awaitable[None]]\n\n\nasync def _build_fetcher(command: CrawlCliInput) -> _FetcherChoice:\n    if command.mode is CrawlMode.FAST:\n        http = HttpxFetcher.create()\n        return _FetcherChoice(http, RenderMode.HTTP, command.concurrency, http.close)\n    browser = await PlaywrightFetcher.create()\n    adapter = PlaywrightHttpAdapter(browser)\n    concurrency = min(command.concurrency, PLAYWRIGHT_CONCURRENCY)\n    return _FetcherChoice(adapter, RenderMode.PLAYWRIGHT, concurrency, browser.close)\n\n\nasync def run_crawl(container: Container, command: CrawlCliInput) -> int:\n    crawl = container.start_crawl().execute(\n        StartCrawlInput(\n            base_url=command.base_url,\n            mode=command.mode,\n            trigger_source=command.trigger,\n            config={},\n        ),\n    )\n    if crawl.id is None:\n        raise RuntimeError(\"start_crawl did not assign id\")\n    choice = await _build_fetcher(command)\n    registered_domain = urlsplit(command.base_url).hostname or \"\"\n    try:\n        process_uc = container.process_url(\n            choice.fetcher,\n            registered_domain=registered_domain,\n            render_mode=choice.render_mode,\n        )\n        worker = QueueWorker(\n            queue=container.queue,\n            process_url=process_uc,\n            errors=container.repos.errors,\n            clock=container.clock,\n        )\n        processed = await worker.run(\n            WorkerConfig(\n                crawl_id=crawl.id,\n                concurrency=choice.concurrency,\n                max_urls=command.max_urls,\n            )\n        )\n    finally:\n        await choice.close()\n    container.finalize().execute(\n        crawl_id=crawl.id,\n        status=CrawlStatus.COMPLETED,\n        total_urls=processed,\n    )\n    return crawl.id\n\n\ndef sync_run_crawl(container: Container, command: CrawlCliInput) -> int:\n    return asyncio.run(run_crawl(container, command))\n",
        "userModified": false
    }
}
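
The structuredPatch above uses unified-diff-style hunks: oldStart/newStart are 1-based line numbers, and each entry in lines carries a " " (context), "+" (added), or "-" (removed) prefix. Here is a minimal sketch of replaying such hunks against originalFile, assuming exactly those conventions; apply_structured_patch is an illustrative helper, not part of the logged response.

def apply_structured_patch(original: str, hunks: list[dict]) -> str:
    """Rebuild the new file text from originalFile plus structuredPatch hunks."""
    src = original.split("\n")
    out: list[str] = []
    cursor = 0  # index of the next unconsumed original line
    for hunk in hunks:
        start = hunk["oldStart"] - 1  # hunk positions are 1-based
        out.extend(src[cursor:start])  # copy untouched lines before the hunk
        cursor = start
        for line in hunk["lines"]:
            tag, body = line[:1], line[1:]
            if tag == "+":
                out.append(body)  # added line: emit, consume nothing
            elif tag == "-":
                cursor += 1  # removed line: consume, emit nothing
            else:
                out.append(body)  # context line: emit and consume
                cursor += 1
    out.extend(src[cursor:])  # copy the tail after the last hunk
    return "\n".join(out)

Replaying the four hunks here against originalFile should reproduce the content field byte for byte (userModified is false, so no manual edits intervened).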