{
"event": "PreToolUse",
"tool_name": "Write",
"tool_input": {
"file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/interfaces\/cli\/commands\/crawl.py",
"content": "from __future__ import annotations\n\nimport asyncio\nfrom collections.abc import Awaitable, Callable\nfrom dataclasses import dataclass\nfrom datetime import datetime\nfrom urllib.parse import urlsplit\n\nfrom src.application.use_cases.start_crawl import StartCrawlInput\nfrom src.domain.entities.url import Url\nfrom src.domain.ports.http_fetcher_port import HttpFetcherPort\nfrom src.domain.value_objects.enums import CrawlMode, CrawlStatus, RenderMode, TriggerSource\nfrom src.infrastructure.config.constants import HTTP_CONCURRENCY, PLAYWRIGHT_CONCURRENCY\nfrom src.infrastructure.fetchers.httpx_fetcher import HttpxFetcher\nfrom src.infrastructure.fetchers.playwright_fetcher import PlaywrightFetcher\nfrom src.infrastructure.fetchers.playwright_http_adapter import PlaywrightHttpAdapter\nfrom src.infrastructure.fetchers.sitemap_loader import SitemapLoader\nfrom src.interfaces.cli.wiring import Container\nfrom src.interfaces.cli.worker import QueueWorker, WorkerConfig\n\n\n@dataclass(frozen=True, slots=True)\nclass CrawlCliInput:\n base_url: str\n mode: CrawlMode\n trigger: TriggerSource\n concurrency: int = HTTP_CONCURRENCY\n max_urls: int | None = None\n seed_from_sitemap: bool = True\n\n\n@dataclass(frozen=True, slots=True)\nclass _FetcherChoice:\n fetcher: HttpFetcherPort\n render_mode: RenderMode\n concurrency: int\n close: Callable[[], Awaitable[None]]\n\n\nasync def _build_fetcher(command: CrawlCliInput) -> _FetcherChoice:\n if command.mode is CrawlMode.FAST:\n http = HttpxFetcher.create()\n return _FetcherChoice(http, RenderMode.HTTP, command.concurrency, http.close)\n browser = await PlaywrightFetcher.create()\n adapter = PlaywrightHttpAdapter(browser)\n concurrency = min(command.concurrency, PLAYWRIGHT_CONCURRENCY)\n return _FetcherChoice(adapter, RenderMode.PLAYWRIGHT, concurrency, browser.close)\n\n\nasync def _seed_from_sitemap(container: Container, *, crawl_id: int, base_url: str) -> int:\n loader = SitemapLoader.create()\n try:\n discovered = await loader.discover(base_url)\n finally:\n await loader.close()\n registered = urlsplit(base_url).hostname or \"\"\n now = container.clock.now()\n enqueued = 0\n for raw in discovered:\n normalized = container.normalizer.normalize(raw)\n if not _is_internal(normalized.host, registered):\n continue\n container.repos.urls.upsert(_seed_url(normalized, now))\n if container.queue.enqueue(crawl_id, normalized):\n enqueued += 1\n return enqueued\n\n\ndef _is_internal(host: str, registered: str) -> bool:\n return host == registered or host.endswith(\".\" + registered)\n\n\ndef _seed_url(normalized: object, now: datetime) -> Url:\n return Url(\n id=None,\n normalized=normalized, # type: ignore[arg-type]\n is_internal=True,\n first_seen_at=now,\n last_seen_at=now,\n )\n\n\nasync def run_crawl(container: Container, command: CrawlCliInput) -> int:\n crawl = container.start_crawl().execute(\n StartCrawlInput(\n base_url=command.base_url, mode=command.mode,\n trigger_source=command.trigger, config={},\n ),\n )\n if crawl.id is None:\n raise RuntimeError(\"start_crawl did not assign id\")\n if command.seed_from_sitemap:\n await _seed_from_sitemap(container, crawl_id=crawl.id, base_url=command.base_url)\n choice = await _build_fetcher(command)\n registered_domain = urlsplit(command.base_url).hostname or \"\"\n try:\n process_uc = container.process_url(\n choice.fetcher,\n registered_domain=registered_domain,\n render_mode=choice.render_mode,\n )\n worker = QueueWorker(\n queue=container.queue, process_url=process_uc,\n errors=container.repos.errors, clock=container.clock,\n )\n processed = await worker.run(WorkerConfig(\n crawl_id=crawl.id, concurrency=choice.concurrency, max_urls=command.max_urls,\n ))\n finally:\n await choice.close()\n container.finalize().execute(\n crawl_id=crawl.id, status=CrawlStatus.COMPLETED, total_urls=processed,\n )\n return crawl.id\n\n\ndef sync_run_crawl(container: Container, command: CrawlCliInput) -> int:\n return asyncio.run(run_crawl(container, command))\n"
}
}
{
"tool_response": {
"type": "update",
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/interfaces\/cli\/commands\/crawl.py",
"content": "from __future__ import annotations\n\nimport asyncio\nfrom collections.abc import Awaitable, Callable\nfrom dataclasses import dataclass\nfrom datetime import datetime\nfrom urllib.parse import urlsplit\n\nfrom src.application.use_cases.start_crawl import StartCrawlInput\nfrom src.domain.entities.url import Url\nfrom src.domain.ports.http_fetcher_port import HttpFetcherPort\nfrom src.domain.value_objects.enums import CrawlMode, CrawlStatus, RenderMode, TriggerSource\nfrom src.infrastructure.config.constants import HTTP_CONCURRENCY, PLAYWRIGHT_CONCURRENCY\nfrom src.infrastructure.fetchers.httpx_fetcher import HttpxFetcher\nfrom src.infrastructure.fetchers.playwright_fetcher import PlaywrightFetcher\nfrom src.infrastructure.fetchers.playwright_http_adapter import PlaywrightHttpAdapter\nfrom src.infrastructure.fetchers.sitemap_loader import SitemapLoader\nfrom src.interfaces.cli.wiring import Container\nfrom src.interfaces.cli.worker import QueueWorker, WorkerConfig\n\n\n@dataclass(frozen=True, slots=True)\nclass CrawlCliInput:\n base_url: str\n mode: CrawlMode\n trigger: TriggerSource\n concurrency: int = HTTP_CONCURRENCY\n max_urls: int | None = None\n seed_from_sitemap: bool = True\n\n\n@dataclass(frozen=True, slots=True)\nclass _FetcherChoice:\n fetcher: HttpFetcherPort\n render_mode: RenderMode\n concurrency: int\n close: Callable[[], Awaitable[None]]\n\n\nasync def _build_fetcher(command: CrawlCliInput) -> _FetcherChoice:\n if command.mode is CrawlMode.FAST:\n http = HttpxFetcher.create()\n return _FetcherChoice(http, RenderMode.HTTP, command.concurrency, http.close)\n browser = await PlaywrightFetcher.create()\n adapter = PlaywrightHttpAdapter(browser)\n concurrency = min(command.concurrency, PLAYWRIGHT_CONCURRENCY)\n return _FetcherChoice(adapter, RenderMode.PLAYWRIGHT, concurrency, browser.close)\n\n\nasync def _seed_from_sitemap(container: Container, *, crawl_id: int, base_url: str) -> int:\n loader = SitemapLoader.create()\n try:\n discovered = await loader.discover(base_url)\n finally:\n await loader.close()\n registered = urlsplit(base_url).hostname or \"\"\n now = container.clock.now()\n enqueued = 0\n for raw in discovered:\n normalized = container.normalizer.normalize(raw)\n if not _is_internal(normalized.host, registered):\n continue\n container.repos.urls.upsert(_seed_url(normalized, now))\n if container.queue.enqueue(crawl_id, normalized):\n enqueued += 1\n return enqueued\n\n\ndef _is_internal(host: str, registered: str) -> bool:\n return host == registered or host.endswith(\".\" + registered)\n\n\ndef _seed_url(normalized: object, now: datetime) -> Url:\n return Url(\n id=None,\n normalized=normalized, # type: ignore[arg-type]\n is_internal=True,\n first_seen_at=now,\n last_seen_at=now,\n )\n\n\nasync def run_crawl(container: Container, command: CrawlCliInput) -> int:\n crawl = container.start_crawl().execute(\n StartCrawlInput(\n base_url=command.base_url, mode=command.mode,\n trigger_source=command.trigger, config={},\n ),\n )\n if crawl.id is None:\n raise RuntimeError(\"start_crawl did not assign id\")\n if command.seed_from_sitemap:\n await _seed_from_sitemap(container, crawl_id=crawl.id, base_url=command.base_url)\n choice = await _build_fetcher(command)\n registered_domain = urlsplit(command.base_url).hostname or \"\"\n try:\n process_uc = container.process_url(\n choice.fetcher,\n registered_domain=registered_domain,\n render_mode=choice.render_mode,\n )\n worker = QueueWorker(\n queue=container.queue, process_url=process_uc,\n errors=container.repos.errors, clock=container.clock,\n )\n processed = await worker.run(WorkerConfig(\n crawl_id=crawl.id, concurrency=choice.concurrency, max_urls=command.max_urls,\n ))\n finally:\n await choice.close()\n container.finalize().execute(\n crawl_id=crawl.id, status=CrawlStatus.COMPLETED, total_urls=processed,\n )\n return crawl.id\n\n\ndef sync_run_crawl(container: Container, command: CrawlCliInput) -> int:\n return asyncio.run(run_crawl(container, command))\n",
"structuredPatch": [
{
"oldStart": 3,
"oldLines": 15,
"newStart": 3,
"newLines": 18,
"lines": [
" import asyncio",
" from collections.abc import Awaitable, Callable",
" from dataclasses import dataclass",
"+from datetime import datetime",
" from urllib.parse import urlsplit",
" ",
" from src.application.use_cases.start_crawl import StartCrawlInput",
"+from src.domain.entities.url import Url",
" from src.domain.ports.http_fetcher_port import HttpFetcherPort",
" from src.domain.value_objects.enums import CrawlMode, CrawlStatus, RenderMode, TriggerSource",
" from src.infrastructure.config.constants import HTTP_CONCURRENCY, PLAYWRIGHT_CONCURRENCY",
" from src.infrastructure.fetchers.httpx_fetcher import HttpxFetcher",
" from src.infrastructure.fetchers.playwright_fetcher import PlaywrightFetcher",
" from src.infrastructure.fetchers.playwright_http_adapter import PlaywrightHttpAdapter",
"+from src.infrastructure.fetchers.sitemap_loader import SitemapLoader",
" from src.interfaces.cli.wiring import Container",
" from src.interfaces.cli.worker import QueueWorker, WorkerConfig",
" "
]
},
{
"oldStart": 23,
"oldLines": 6,
"newStart": 26,
"newLines": 7,
"lines": [
" trigger: TriggerSource",
" concurrency: int = HTTP_CONCURRENCY",
" max_urls: int | None = None",
"+ seed_from_sitemap: bool = True",
" ",
" ",
" @dataclass(frozen=True, slots=True)"
]
},
{
"oldStart": 43,
"oldLines": 17,
"newStart": 47,
"newLines": 50,
"lines": [
" return _FetcherChoice(adapter, RenderMode.PLAYWRIGHT, concurrency, browser.close)",
" ",
" ",
"+async def _seed_from_sitemap(container: Container, *, crawl_id: int, base_url: str) -> int:",
"+ loader = SitemapLoader.create()",
"+ try:",
"+ discovered = await loader.discover(base_url)",
"+ finally:",
"+ await loader.close()",
"+ registered = urlsplit(base_url).hostname or \"\"",
"+ now = container.clock.now()",
"+ enqueued = 0",
"+ for raw in discovered:",
"+ normalized = container.normalizer.normalize(raw)",
"+ if not _is_internal(normalized.host, registered):",
"+ continue",
"+ container.repos.urls.upsert(_seed_url(normalized, now))",
"+ if container.queue.enqueue(crawl_id, normalized):",
"+ enqueued += 1",
"+ return enqueued",
"+",
"+",
"+def _is_internal(host: str, registered: str) -> bool:",
"+ return host == registered or host.endswith(\".\" + registered)",
"+",
"+",
"+def _seed_url(normalized: object, now: datetime) -> Url:",
"+ return Url(",
"+ id=None,",
"+ normalized=normalized, # type: ignore[arg-type]",
"+ is_internal=True,",
"+ first_seen_at=now,",
"+ last_seen_at=now,",
"+ )",
"+",
"+",
" async def run_crawl(container: Container, command: CrawlCliInput) -> int:",
" crawl = container.start_crawl().execute(",
" StartCrawlInput(",
"- base_url=command.base_url,",
"- mode=command.mode,",
"- trigger_source=command.trigger,",
"- config={},",
"+ base_url=command.base_url, mode=command.mode,",
"+ trigger_source=command.trigger, config={},",
" ),",
" )",
" if crawl.id is None:",
" raise RuntimeError(\"start_crawl did not assign id\")",
"+ if command.seed_from_sitemap:",
"+ await _seed_from_sitemap(container, crawl_id=crawl.id, base_url=command.base_url)",
" choice = await _build_fetcher(command)",
" registered_domain = urlsplit(command.base_url).hostname or \"\"",
" try:"
]
},
{
"oldStart": 63,
"oldLines": 24,
"newStart": 100,
"newLines": 16,
"lines": [
" render_mode=choice.render_mode,",
" )",
" worker = QueueWorker(",
"- queue=container.queue,",
"- process_url=process_uc,",
"- errors=container.repos.errors,",
"- clock=container.clock,",
"+ queue=container.queue, process_url=process_uc,",
"+ errors=container.repos.errors, clock=container.clock,",
" )",
"- processed = await worker.run(",
"- WorkerConfig(",
"- crawl_id=crawl.id,",
"- concurrency=choice.concurrency,",
"- max_urls=command.max_urls,",
"- )",
"- )",
"+ processed = await worker.run(WorkerConfig(",
"+ crawl_id=crawl.id, concurrency=choice.concurrency, max_urls=command.max_urls,",
"+ ))",
" finally:",
" await choice.close()",
" container.finalize().execute(",
"- crawl_id=crawl.id,",
"- status=CrawlStatus.COMPLETED,",
"- total_urls=processed,",
"+ crawl_id=crawl.id, status=CrawlStatus.COMPLETED, total_urls=processed,",
" )",
" return crawl.id",
" "
]
}
],
"originalFile": "from __future__ import annotations\n\nimport asyncio\nfrom collections.abc import Awaitable, Callable\nfrom dataclasses import dataclass\nfrom urllib.parse import urlsplit\n\nfrom src.application.use_cases.start_crawl import StartCrawlInput\nfrom src.domain.ports.http_fetcher_port import HttpFetcherPort\nfrom src.domain.value_objects.enums import CrawlMode, CrawlStatus, RenderMode, TriggerSource\nfrom src.infrastructure.config.constants import HTTP_CONCURRENCY, PLAYWRIGHT_CONCURRENCY\nfrom src.infrastructure.fetchers.httpx_fetcher import HttpxFetcher\nfrom src.infrastructure.fetchers.playwright_fetcher import PlaywrightFetcher\nfrom src.infrastructure.fetchers.playwright_http_adapter import PlaywrightHttpAdapter\nfrom src.interfaces.cli.wiring import Container\nfrom src.interfaces.cli.worker import QueueWorker, WorkerConfig\n\n\n@dataclass(frozen=True, slots=True)\nclass CrawlCliInput:\n base_url: str\n mode: CrawlMode\n trigger: TriggerSource\n concurrency: int = HTTP_CONCURRENCY\n max_urls: int | None = None\n\n\n@dataclass(frozen=True, slots=True)\nclass _FetcherChoice:\n fetcher: HttpFetcherPort\n render_mode: RenderMode\n concurrency: int\n close: Callable[[], Awaitable[None]]\n\n\nasync def _build_fetcher(command: CrawlCliInput) -> _FetcherChoice:\n if command.mode is CrawlMode.FAST:\n http = HttpxFetcher.create()\n return _FetcherChoice(http, RenderMode.HTTP, command.concurrency, http.close)\n browser = await PlaywrightFetcher.create()\n adapter = PlaywrightHttpAdapter(browser)\n concurrency = min(command.concurrency, PLAYWRIGHT_CONCURRENCY)\n return _FetcherChoice(adapter, RenderMode.PLAYWRIGHT, concurrency, browser.close)\n\n\nasync def run_crawl(container: Container, command: CrawlCliInput) -> int:\n crawl = container.start_crawl().execute(\n StartCrawlInput(\n base_url=command.base_url,\n mode=command.mode,\n trigger_source=command.trigger,\n config={},\n ),\n )\n if crawl.id is None:\n raise RuntimeError(\"start_crawl did not assign id\")\n choice = await _build_fetcher(command)\n registered_domain = urlsplit(command.base_url).hostname or \"\"\n try:\n process_uc = container.process_url(\n choice.fetcher,\n registered_domain=registered_domain,\n render_mode=choice.render_mode,\n )\n worker = QueueWorker(\n queue=container.queue,\n process_url=process_uc,\n errors=container.repos.errors,\n clock=container.clock,\n )\n processed = await worker.run(\n WorkerConfig(\n crawl_id=crawl.id,\n concurrency=choice.concurrency,\n max_urls=command.max_urls,\n )\n )\n finally:\n await choice.close()\n container.finalize().execute(\n crawl_id=crawl.id,\n status=CrawlStatus.COMPLETED,\n total_urls=processed,\n )\n return crawl.id\n\n\ndef sync_run_crawl(container: Container, command: CrawlCliInput) -> int:\n return asyncio.run(run_crawl(container, command))\n",
"userModified": false
}
}