{
"tool_response": {
"type": "update",
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/interfaces\/cli\/commands\/crawl.py",
"content": "from __future__ import annotations\n\nimport asyncio\nfrom dataclasses import dataclass\nfrom urllib.parse import urlsplit\n\nfrom src.application.use_cases.start_crawl import StartCrawlInput\nfrom src.domain.value_objects.enums import CrawlMode, CrawlStatus, TriggerSource\nfrom src.infrastructure.config.constants import HTTP_CONCURRENCY\nfrom src.infrastructure.fetchers.httpx_fetcher import HttpxFetcher\nfrom src.interfaces.cli.wiring import Container\nfrom src.interfaces.cli.worker import QueueWorker, WorkerConfig\n\n\n@dataclass(frozen=True, slots=True)\nclass CrawlCliInput:\n base_url: str\n mode: CrawlMode\n trigger: TriggerSource\n concurrency: int = HTTP_CONCURRENCY\n max_urls: int | None = None\n\n\nasync def run_crawl(container: Container, command: CrawlCliInput) -> int:\n crawl = container.start_crawl().execute(\n StartCrawlInput(\n base_url=command.base_url, mode=command.mode,\n trigger_source=command.trigger, config={},\n ),\n )\n if crawl.id is None:\n raise RuntimeError(\"start_crawl did not assign id\")\n fetcher = HttpxFetcher.create()\n registered_domain = urlsplit(command.base_url).hostname or \"\"\n try:\n process_uc = container.process_url(fetcher, registered_domain=registered_domain)\n worker = QueueWorker(\n queue=container.queue, process_url=process_uc,\n errors=container.repos.errors, clock=container.clock,\n )\n processed = await worker.run(WorkerConfig(\n crawl_id=crawl.id, concurrency=command.concurrency, max_urls=command.max_urls,\n ))\n finally:\n await fetcher.close()\n container.finalize().execute(\n crawl_id=crawl.id, status=CrawlStatus.COMPLETED, total_urls=processed,\n )\n return crawl.id\n\n\ndef sync_run_crawl(container: Container, command: CrawlCliInput) -> int:\n return asyncio.run(run_crawl(container, command))\n",
"structuredPatch": [
{
"oldStart": 1,
"oldLines": 6,
"newStart": 1,
"newLines": 7,
"lines": [
" from __future__ import annotations",
" ",
" import asyncio",
"+from dataclasses import dataclass",
" from urllib.parse import urlsplit",
" ",
" from src.application.use_cases.start_crawl import StartCrawlInput"
]
},
{
"oldStart": 11,
"oldLines": 24,
"newStart": 12,
"newLines": 26,
"lines": [
" from src.interfaces.cli.worker import QueueWorker, WorkerConfig",
" ",
" ",
"-async def run_crawl(",
"- container: Container,",
"- *,",
"- base_url: str,",
"- mode: CrawlMode,",
"- trigger: TriggerSource,",
"- concurrency: int = HTTP_CONCURRENCY,",
"- max_urls: int | None = None,",
"-) -> int:",
"- start_uc = container.start_crawl()",
"- crawl = start_uc.execute(",
"+@dataclass(frozen=True, slots=True)",
"+class CrawlCliInput:",
"+ base_url: str",
"+ mode: CrawlMode",
"+ trigger: TriggerSource",
"+ concurrency: int = HTTP_CONCURRENCY",
"+ max_urls: int | None = None",
"+",
"+",
"+async def run_crawl(container: Container, command: CrawlCliInput) -> int:",
"+ crawl = container.start_crawl().execute(",
" StartCrawlInput(",
"- base_url=base_url, mode=mode, trigger_source=trigger, config={},",
"+ base_url=command.base_url, mode=command.mode,",
"+ trigger_source=command.trigger, config={},",
" ),",
" )",
"- assert crawl.id is not None, \"start_crawl must assign id\"",
"+ if crawl.id is None:",
"+ raise RuntimeError(\"start_crawl did not assign id\")",
" fetcher = HttpxFetcher.create()",
"- registered_domain = urlsplit(base_url).hostname or \"\"",
"+ registered_domain = urlsplit(command.base_url).hostname or \"\"",
" try:",
" process_uc = container.process_url(fetcher, registered_domain=registered_domain)",
" worker = QueueWorker("
]
},
{
"oldStart": 36,
"oldLines": 7,
"newStart": 39,
"newLines": 7,
"lines": [
" errors=container.repos.errors, clock=container.clock,",
" )",
" processed = await worker.run(WorkerConfig(",
"- crawl_id=crawl.id, concurrency=concurrency, max_urls=max_urls,",
"+ crawl_id=crawl.id, concurrency=command.concurrency, max_urls=command.max_urls,",
" ))",
" finally:",
" await fetcher.close()"
]
},
{
"oldStart": 46,
"oldLines": 16,
"newStart": 49,
"newLines": 5,
"lines": [
" return crawl.id",
" ",
" ",
"-def sync_run_crawl(",
"- container: Container,",
"- *,",
"- base_url: str,",
"- mode: CrawlMode,",
"- trigger: TriggerSource,",
"- concurrency: int = HTTP_CONCURRENCY,",
"- max_urls: int | None = None,",
"-) -> int:",
"- return asyncio.run(run_crawl(",
"- container, base_url=base_url, mode=mode, trigger=trigger,",
"- concurrency=concurrency, max_urls=max_urls,",
"- ))",
"+def sync_run_crawl(container: Container, command: CrawlCliInput) -> int:",
"+ return asyncio.run(run_crawl(container, command))"
]
}
],
"originalFile": "from __future__ import annotations\n\nimport asyncio\nfrom urllib.parse import urlsplit\n\nfrom src.application.use_cases.start_crawl import StartCrawlInput\nfrom src.domain.value_objects.enums import CrawlMode, CrawlStatus, TriggerSource\nfrom src.infrastructure.config.constants import HTTP_CONCURRENCY\nfrom src.infrastructure.fetchers.httpx_fetcher import HttpxFetcher\nfrom src.interfaces.cli.wiring import Container\nfrom src.interfaces.cli.worker import QueueWorker, WorkerConfig\n\n\nasync def run_crawl(\n container: Container,\n *,\n base_url: str,\n mode: CrawlMode,\n trigger: TriggerSource,\n concurrency: int = HTTP_CONCURRENCY,\n max_urls: int | None = None,\n) -> int:\n start_uc = container.start_crawl()\n crawl = start_uc.execute(\n StartCrawlInput(\n base_url=base_url, mode=mode, trigger_source=trigger, config={},\n ),\n )\n assert crawl.id is not None, \"start_crawl must assign id\"\n fetcher = HttpxFetcher.create()\n registered_domain = urlsplit(base_url).hostname or \"\"\n try:\n process_uc = container.process_url(fetcher, registered_domain=registered_domain)\n worker = QueueWorker(\n queue=container.queue, process_url=process_uc,\n errors=container.repos.errors, clock=container.clock,\n )\n processed = await worker.run(WorkerConfig(\n crawl_id=crawl.id, concurrency=concurrency, max_urls=max_urls,\n ))\n finally:\n await fetcher.close()\n container.finalize().execute(\n crawl_id=crawl.id, status=CrawlStatus.COMPLETED, total_urls=processed,\n )\n return crawl.id\n\n\ndef sync_run_crawl(\n container: Container,\n *,\n base_url: str,\n mode: CrawlMode,\n trigger: TriggerSource,\n concurrency: int = HTTP_CONCURRENCY,\n max_urls: int | None = None,\n) -> int:\n return asyncio.run(run_crawl(\n container, base_url=base_url, mode=mode, trigger=trigger,\n concurrency=concurrency, max_urls=max_urls,\n ))\n",
"userModified": false
}
}
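A minimal sketch of how a caller might drive the refactored command-object API, assuming a plain script entrypoint; the enum members (CrawlMode.FULL, TriggerSource.CLI) and the bare Container() constructor are illustrative guesses and are not taken from the patch above.

# Hypothetical usage sketch for the new CrawlCliInput / sync_run_crawl API.
# CrawlMode.FULL, TriggerSource.CLI, and Container() are assumed names,
# not confirmed by the patch.
from src.domain.value_objects.enums import CrawlMode, TriggerSource
from src.interfaces.cli.commands.crawl import CrawlCliInput, sync_run_crawl
from src.interfaces.cli.wiring import Container

command = CrawlCliInput(
    base_url="https://example.org",
    mode=CrawlMode.FULL,          # assumed enum member
    trigger=TriggerSource.CLI,    # assumed enum member
    max_urls=100,                 # concurrency keeps its HTTP_CONCURRENCY default
)
crawl_id = sync_run_crawl(Container(), command)  # Container() construction assumed
print(f"crawl {crawl_id} finished")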