{
"event": "PreToolUse",
"tool_name": "Write",
"tool_input": {
"file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/application\/use_cases\/generate_crawl_report.py",
"content": "from __future__ import annotations\n\nfrom collections import Counter\n\nfrom src.application.read_models.crawl_report import CrawlReport\nfrom src.domain.entities.page import Page\nfrom src.domain.ports.crawl_error_repository_port import CrawlErrorRepositoryPort\nfrom src.domain.ports.crawl_repository_port import CrawlRepositoryPort\nfrom src.domain.ports.page_repository_port import PageRepositoryPort\n\nPAGE_FETCH_CHUNK = 500\n\n\nclass GenerateCrawlReportUseCase:\n def __init__(\n self,\n *,\n crawls: CrawlRepositoryPort,\n pages: PageRepositoryPort,\n errors: CrawlErrorRepositoryPort,\n ) -> None:\n self._crawls = crawls\n self._pages = pages\n self._errors = errors\n\n def execute(self, *, crawl_id: int) -> CrawlReport:\n crawl = self._crawls.get(crawl_id)\n if crawl is None:\n raise LookupError(f\"crawl {crawl_id} not found\")\n pages = list(self._all_pages(crawl_id))\n errors = self._errors.list_by_crawl(crawl_id)\n status_hist = Counter(p.http.status.code for p in pages if p.http.status is not None)\n flag_hist = Counter(f.key.value for p in pages for f in p.quality_flags)\n top_errors = tuple(Counter(e.error_type for e in errors).most_common(10))\n assert crawl.id is not None\n return CrawlReport(\n crawl_id=crawl.id, base_url=crawl.base_url, mode=crawl.mode, status=crawl.status,\n started_at=crawl.started_at, finished_at=crawl.finished_at,\n total_urls=crawl.total_urls, total_errors=crawl.total_errors,\n pages_by_status=dict(status_hist), flag_counts=dict(flag_hist),\n top_errors=top_errors,\n )\n\n def _all_pages(self, crawl_id: int) -> list[Page]:\n collected: list[Page] = []\n offset = 0\n while True:\n batch = self._pages.list_by_crawl(crawl_id, limit=PAGE_FETCH_CHUNK, offset=offset)\n if not batch:\n return collected\n collected.extend(batch)\n offset += len(batch)\n"
}
}
{
"tool_response": {
"type": "create",
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/application\/use_cases\/generate_crawl_report.py",
"content": "from __future__ import annotations\n\nfrom collections import Counter\n\nfrom src.application.read_models.crawl_report import CrawlReport\nfrom src.domain.entities.page import Page\nfrom src.domain.ports.crawl_error_repository_port import CrawlErrorRepositoryPort\nfrom src.domain.ports.crawl_repository_port import CrawlRepositoryPort\nfrom src.domain.ports.page_repository_port import PageRepositoryPort\n\nPAGE_FETCH_CHUNK = 500\n\n\nclass GenerateCrawlReportUseCase:\n def __init__(\n self,\n *,\n crawls: CrawlRepositoryPort,\n pages: PageRepositoryPort,\n errors: CrawlErrorRepositoryPort,\n ) -> None:\n self._crawls = crawls\n self._pages = pages\n self._errors = errors\n\n def execute(self, *, crawl_id: int) -> CrawlReport:\n crawl = self._crawls.get(crawl_id)\n if crawl is None:\n raise LookupError(f\"crawl {crawl_id} not found\")\n pages = list(self._all_pages(crawl_id))\n errors = self._errors.list_by_crawl(crawl_id)\n status_hist = Counter(p.http.status.code for p in pages if p.http.status is not None)\n flag_hist = Counter(f.key.value for p in pages for f in p.quality_flags)\n top_errors = tuple(Counter(e.error_type for e in errors).most_common(10))\n assert crawl.id is not None\n return CrawlReport(\n crawl_id=crawl.id, base_url=crawl.base_url, mode=crawl.mode, status=crawl.status,\n started_at=crawl.started_at, finished_at=crawl.finished_at,\n total_urls=crawl.total_urls, total_errors=crawl.total_errors,\n pages_by_status=dict(status_hist), flag_counts=dict(flag_hist),\n top_errors=top_errors,\n )\n\n def _all_pages(self, crawl_id: int) -> list[Page]:\n collected: list[Page] = []\n offset = 0\n while True:\n batch = self._pages.list_by_crawl(crawl_id, limit=PAGE_FETCH_CHUNK, offset=offset)\n if not batch:\n return collected\n collected.extend(batch)\n offset += len(batch)\n",
"structuredPatch": [],
"originalFile": null,
"userModified": false
}
}