{
    "event": "PreToolUse",
    "tool_name": "Write",
    "tool_input": {
        "file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/interfaces\/cli\/commands\/crawl.py",
        "content": "from __future__ import annotations\n\nimport asyncio\nfrom urllib.parse import urlsplit\n\nfrom src.application.use_cases.start_crawl import StartCrawlInput\nfrom src.domain.value_objects.enums import CrawlMode, CrawlStatus, TriggerSource\nfrom src.infrastructure.config.constants import HTTP_CONCURRENCY\nfrom src.infrastructure.fetchers.httpx_fetcher import HttpxFetcher\nfrom src.interfaces.cli.wiring import Container\nfrom src.interfaces.cli.worker import QueueWorker, WorkerConfig\n\n\nasync def run_crawl(\n    container: Container,\n    *,\n    base_url: str,\n    mode: CrawlMode,\n    trigger: TriggerSource,\n    concurrency: int = HTTP_CONCURRENCY,\n    max_urls: int | None = None,\n) -> int:\n    start_uc = container.start_crawl()\n    crawl = start_uc.execute(\n        StartCrawlInput(\n            base_url=base_url, mode=mode, trigger_source=trigger, config={},\n        ),\n    )\n    assert crawl.id is not None, \"start_crawl must assign id\"\n    fetcher = HttpxFetcher.create()\n    registered_domain = urlsplit(base_url).hostname or \"\"\n    try:\n        process_uc = container.process_url(fetcher, registered_domain=registered_domain)\n        worker = QueueWorker(\n            queue=container.queue, process_url=process_uc,\n            errors=container.repos.errors, clock=container.clock,\n        )\n        processed = await worker.run(WorkerConfig(\n            crawl_id=crawl.id, concurrency=concurrency, max_urls=max_urls,\n        ))\n    finally:\n        await fetcher.close()\n    container.finalize().execute(\n        crawl_id=crawl.id, status=CrawlStatus.COMPLETED, total_urls=processed,\n    )\n    return crawl.id\n\n\ndef sync_run_crawl(\n    container: Container,\n    *,\n    base_url: str,\n    mode: CrawlMode,\n    trigger: TriggerSource,\n    concurrency: int = HTTP_CONCURRENCY,\n    max_urls: int | None = None,\n) -> int:\n    return asyncio.run(run_crawl(\n        container, base_url=base_url, mode=mode, trigger=trigger,\n        concurrency=concurrency, max_urls=max_urls,\n    ))\n"
    }
}
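The object above is a PreToolUse event carrying the input of a Write tool call: the target path and the full file content to be written. For orientation, here is a minimal sketch of a guard script that could consume such an event; it assumes the event JSON arrives on the script's stdin and that a non-zero exit is treated as a rejection, neither of which is stated in this log.

#!/usr/bin/env python3
"""Illustrative PreToolUse guard; stdin delivery and the allowed root are assumptions."""
import json
import sys

ALLOWED_ROOT = "/var/www/dev.campus.systemische-tools.de/crawler/"  # assumed project root


def main() -> int:
    event = json.load(sys.stdin)  # assumption: the event object is delivered as JSON on stdin
    if event.get("event") != "PreToolUse" or event.get("tool_name") != "Write":
        return 0  # only Write events are checked here
    path = event.get("tool_input", {}).get("file_path", "")
    if not path.startswith(ALLOWED_ROOT):
        # flag writes that would land outside the assumed project root
        print(f"refusing write outside project root: {path}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())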
{
    "tool_response": {
        "type": "create",
        "filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/interfaces\/cli\/commands\/crawl.py",
        "content": "from __future__ import annotations\n\nimport asyncio\nfrom urllib.parse import urlsplit\n\nfrom src.application.use_cases.start_crawl import StartCrawlInput\nfrom src.domain.value_objects.enums import CrawlMode, CrawlStatus, TriggerSource\nfrom src.infrastructure.config.constants import HTTP_CONCURRENCY\nfrom src.infrastructure.fetchers.httpx_fetcher import HttpxFetcher\nfrom src.interfaces.cli.wiring import Container\nfrom src.interfaces.cli.worker import QueueWorker, WorkerConfig\n\n\nasync def run_crawl(\n    container: Container,\n    *,\n    base_url: str,\n    mode: CrawlMode,\n    trigger: TriggerSource,\n    concurrency: int = HTTP_CONCURRENCY,\n    max_urls: int | None = None,\n) -> int:\n    start_uc = container.start_crawl()\n    crawl = start_uc.execute(\n        StartCrawlInput(\n            base_url=base_url, mode=mode, trigger_source=trigger, config={},\n        ),\n    )\n    assert crawl.id is not None, \"start_crawl must assign id\"\n    fetcher = HttpxFetcher.create()\n    registered_domain = urlsplit(base_url).hostname or \"\"\n    try:\n        process_uc = container.process_url(fetcher, registered_domain=registered_domain)\n        worker = QueueWorker(\n            queue=container.queue, process_url=process_uc,\n            errors=container.repos.errors, clock=container.clock,\n        )\n        processed = await worker.run(WorkerConfig(\n            crawl_id=crawl.id, concurrency=concurrency, max_urls=max_urls,\n        ))\n    finally:\n        await fetcher.close()\n    container.finalize().execute(\n        crawl_id=crawl.id, status=CrawlStatus.COMPLETED, total_urls=processed,\n    )\n    return crawl.id\n\n\ndef sync_run_crawl(\n    container: Container,\n    *,\n    base_url: str,\n    mode: CrawlMode,\n    trigger: TriggerSource,\n    concurrency: int = HTTP_CONCURRENCY,\n    max_urls: int | None = None,\n) -> int:\n    return asyncio.run(run_crawl(\n        container, base_url=base_url, mode=mode, trigger=trigger,\n        concurrency=concurrency, max_urls=max_urls,\n    ))\n",
        "structuredPatch": [],
        "originalFile": null,
        "userModified": false
    }
}
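The response above confirms that crawl.py was created at the requested path with the written content. As a usage illustration only, this is how the new module might be driven from a command line; the Container() construction and the CrawlMode.FULL / TriggerSource.MANUAL member names are assumptions made for the sketch and do not appear in this log. Because sync_run_crawl wraps the async run_crawl with asyncio.run, a plain synchronous entry point like this is enough.

import argparse

from src.domain.value_objects.enums import CrawlMode, TriggerSource
from src.interfaces.cli.commands.crawl import sync_run_crawl
from src.interfaces.cli.wiring import Container


def main() -> None:
    parser = argparse.ArgumentParser(description="Run a crawl against a base URL.")
    parser.add_argument("base_url")
    parser.add_argument("--max-urls", type=int, default=None)
    args = parser.parse_args()

    container = Container()  # assumption: the wiring exposes a no-argument constructor
    crawl_id = sync_run_crawl(
        container,
        base_url=args.base_url,
        mode=CrawlMode.FULL,  # assumed enum member
        trigger=TriggerSource.MANUAL,  # assumed enum member
        max_urls=args.max_urls,
    )
    print(f"crawl {crawl_id} finished")


if __name__ == "__main__":
    main()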