{
"event": "PreToolUse",
"tool_name": "Write",
"tool_input": {
"file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/application\/use_cases\/compare_crawls.py",
"content": "from __future__ import annotations\n\nfrom src.application.read_models.crawl_diff import CrawlDiff\nfrom src.domain.entities.page import Page\nfrom src.domain.ports.page_repository_port import PageRepositoryPort\nfrom src.domain.ports.url_repository_port import UrlRepositoryPort\n\nPAGE_FETCH_CHUNK = 500\n\n\nclass CompareCrawlsUseCase:\n def __init__(\n self,\n *,\n pages: PageRepositoryPort,\n urls: UrlRepositoryPort,\n ) -> None:\n self._pages = pages\n self._urls = urls\n\n def execute(self, *, from_crawl_id: int, to_crawl_id: int) -> CrawlDiff:\n from_pages = self._index_by_url(from_crawl_id)\n to_pages = self._index_by_url(to_crawl_id)\n added = tuple(sorted(set(to_pages) - set(from_pages)))\n removed = tuple(sorted(set(from_pages) - set(to_pages)))\n status_changed = self._status_changes(from_pages, to_pages)\n title_changed = self._title_changes(from_pages, to_pages)\n return CrawlDiff(\n from_crawl_id=from_crawl_id,\n to_crawl_id=to_crawl_id,\n added_urls=added,\n removed_urls=removed,\n status_changed=status_changed,\n title_changed=title_changed,\n )\n\n def _index_by_url(self, crawl_id: int) -> dict[str, Page]:\n index: dict[str, Page] = {}\n offset = 0\n while True:\n batch = self._pages.list_by_crawl(crawl_id, limit=PAGE_FETCH_CHUNK, offset=offset)\n if not batch:\n return index\n for page in batch:\n index[_url_key(page, self._urls)] = page\n offset += len(batch)\n\n def _status_changes(\n self, frm: dict[str, Page], to: dict[str, Page],\n ) -> tuple[tuple[str, int, int], ...]:\n changes: list[tuple[str, int, int]] = []\n for url, to_page in to.items():\n old = frm.get(url)\n if old is None:\n continue\n s_old = old.http.status.code if old.http.status else 0\n s_new = to_page.http.status.code if to_page.http.status else 0\n if s_old != s_new:\n changes.append((url, s_old, s_new))\n return tuple(sorted(changes))\n\n def _title_changes(\n self, frm: dict[str, Page], to: dict[str, Page],\n ) -> tuple[tuple[str, str | None, str | None], ...]:\n changes: list[tuple[str, str | None, str | None]] = []\n for url, to_page in to.items():\n old = frm.get(url)\n if old is None:\n continue\n if old.head.title != to_page.head.title:\n changes.append((url, old.head.title, to_page.head.title))\n return tuple(sorted(changes, key=lambda t: t[0]))\n\n\ndef _url_key(page: Page, urls: UrlRepositoryPort) -> str:\n _ = urls\n return str(page.url_id)\n"
}
}
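
For context, here is a minimal sketch (an assumption for illustration, not part of the logged session) of a PreToolUse hook script that could consume an event shaped like the one above: it reads the event JSON from stdin and rejects Write calls whose target lies outside the crawler project root seen in the log. The PROJECT_ROOT value and the "non-zero exit blocks the call" convention (exit code 2 in some runners) are assumptions to adapt to whatever hook runner produced this event.

# pre_tool_use_guard.py -- hypothetical hook script, names are illustrative
import json
import sys
from pathlib import Path

# Assumed project root, taken from the file_path in the logged event.
PROJECT_ROOT = Path("/var/www/dev.campus.systemische-tools.de/crawler")

# The runner is assumed to pass one event object on stdin, as in the log above.
event = json.load(sys.stdin)

if event.get("tool_name") == "Write":
    target = Path(event["tool_input"]["file_path"]).resolve()
    if not target.is_relative_to(PROJECT_ROOT):
        # Assumption: a non-zero exit (commonly exit code 2) tells the runner
        # to block the tool call; stderr explains why.
        print(f"Refusing Write outside {PROJECT_ROOT}: {target}", file=sys.stderr)
        sys.exit(2)

sys.exit(0)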
{
"tool_response": {
"type": "create",
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/application\/use_cases\/compare_crawls.py",
"content": "from __future__ import annotations\n\nfrom src.application.read_models.crawl_diff import CrawlDiff\nfrom src.domain.entities.page import Page\nfrom src.domain.ports.page_repository_port import PageRepositoryPort\nfrom src.domain.ports.url_repository_port import UrlRepositoryPort\n\nPAGE_FETCH_CHUNK = 500\n\n\nclass CompareCrawlsUseCase:\n def __init__(\n self,\n *,\n pages: PageRepositoryPort,\n urls: UrlRepositoryPort,\n ) -> None:\n self._pages = pages\n self._urls = urls\n\n def execute(self, *, from_crawl_id: int, to_crawl_id: int) -> CrawlDiff:\n from_pages = self._index_by_url(from_crawl_id)\n to_pages = self._index_by_url(to_crawl_id)\n added = tuple(sorted(set(to_pages) - set(from_pages)))\n removed = tuple(sorted(set(from_pages) - set(to_pages)))\n status_changed = self._status_changes(from_pages, to_pages)\n title_changed = self._title_changes(from_pages, to_pages)\n return CrawlDiff(\n from_crawl_id=from_crawl_id,\n to_crawl_id=to_crawl_id,\n added_urls=added,\n removed_urls=removed,\n status_changed=status_changed,\n title_changed=title_changed,\n )\n\n def _index_by_url(self, crawl_id: int) -> dict[str, Page]:\n index: dict[str, Page] = {}\n offset = 0\n while True:\n batch = self._pages.list_by_crawl(crawl_id, limit=PAGE_FETCH_CHUNK, offset=offset)\n if not batch:\n return index\n for page in batch:\n index[_url_key(page, self._urls)] = page\n offset += len(batch)\n\n def _status_changes(\n self, frm: dict[str, Page], to: dict[str, Page],\n ) -> tuple[tuple[str, int, int], ...]:\n changes: list[tuple[str, int, int]] = []\n for url, to_page in to.items():\n old = frm.get(url)\n if old is None:\n continue\n s_old = old.http.status.code if old.http.status else 0\n s_new = to_page.http.status.code if to_page.http.status else 0\n if s_old != s_new:\n changes.append((url, s_old, s_new))\n return tuple(sorted(changes))\n\n def _title_changes(\n self, frm: dict[str, Page], to: dict[str, Page],\n ) -> tuple[tuple[str, str | None, str | None], ...]:\n changes: list[tuple[str, str | None, str | None]] = []\n for url, to_page in to.items():\n old = frm.get(url)\n if old is None:\n continue\n if old.head.title != to_page.head.title:\n changes.append((url, old.head.title, to_page.head.title))\n return tuple(sorted(changes, key=lambda t: t[0]))\n\n\ndef _url_key(page: Page, urls: UrlRepositoryPort) -> str:\n _ = urls\n return str(page.url_id)\n",
"structuredPatch": [],
"originalFile": null,
"userModified": false
}
}
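
To make the written use case easier to follow, here is a minimal usage sketch. It is not part of the logged change: it assumes the two repository ports can be replaced by in-memory fakes, that Page instances can be stood in for by SimpleNamespace objects exposing only the attributes the use case reads (url_id, http.status.code, head.title), and that CrawlDiff exposes its constructor keywords as attributes. The Fake* names and the sample data are illustrative, not part of the project.

from types import SimpleNamespace

from src.application.use_cases.compare_crawls import CompareCrawlsUseCase


def _page(url_id, status, title):
    # Stand-in for the real Page entity: only the fields the use case reads.
    return SimpleNamespace(
        url_id=url_id,
        http=SimpleNamespace(status=SimpleNamespace(code=status)),
        head=SimpleNamespace(title=title),
    )


class FakePageRepository:
    """Serves pages per crawl in offset/limit slices, mirroring the real port's call shape."""

    def __init__(self, pages_by_crawl):
        self._pages_by_crawl = pages_by_crawl

    def list_by_crawl(self, crawl_id, *, limit, offset):
        return self._pages_by_crawl.get(crawl_id, [])[offset:offset + limit]


class FakeUrlRepository:
    """_url_key ignores the URL port in the current implementation, so an empty fake suffices."""


diff = CompareCrawlsUseCase(
    pages=FakePageRepository({
        1: [_page(10, 200, "Home"), _page(11, 200, "About")],
        2: [_page(10, 404, "Home"), _page(12, 200, "Contact")],
    }),
    urls=FakeUrlRepository(),
).execute(from_crawl_id=1, to_crawl_id=2)

# Keys are str(url_id) because _url_key falls back to the numeric id, so:
# diff.added_urls == ('12',), diff.removed_urls == ('11',)
# diff.status_changed == (('10', 200, 404),), diff.title_changed == ()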