Protokoll #33521

ID: 33521
Zeitstempel: 2026-04-21 02:36:52.308876
Client: root
IP: 62.246.34.195
Modell: claude-sonnet-4-20250514
Status: completed
Tokens: 3,105 (Input: 1,547, Output: 1,558)
Dauer: 74 ms
Request-Zeit: 2026-04-21 02:36:52.308876
Response-Zeit: 2026-04-21 02:36:52.382481

Request

{
    "event": "PreToolUse",
    "tool_name": "Write",
    "tool_input": {
        "file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/application\/use_cases\/process_url.py",
        "content": "from __future__ import annotations\n\nfrom dataclasses import dataclass\n\nfrom src.application.use_cases.extract_seo_data import ExtractSeoDataUseCase\nfrom src.domain.entities.image import Image\nfrom src.domain.entities.link import Link\nfrom src.domain.entities.page import Page\nfrom src.domain.entities.structured_data_item import StructuredDataItem\nfrom src.domain.entities.url import Url\nfrom src.domain.ports.clock_port import ClockPort\nfrom src.domain.ports.hreflang_repository_port import HreflangRepositoryPort\nfrom src.domain.ports.html_parser_port import HtmlParserPort\nfrom src.domain.ports.http_fetcher_port import HttpFetcherPort\nfrom src.domain.ports.image_repository_port import ImageRepositoryPort\nfrom src.domain.ports.link_repository_port import LinkRepositoryPort\nfrom src.domain.ports.page_repository_port import PageRepositoryPort\nfrom src.domain.ports.structured_data_parser_port import StructuredDataParserPort\nfrom src.domain.ports.structured_data_repository_port import StructuredDataRepositoryPort\nfrom src.domain.ports.url_queue_port import UrlQueuePort\nfrom src.domain.ports.url_repository_port import UrlRepositoryPort\nfrom src.domain.services.link_classifier import LinkClassifier\nfrom src.domain.services.url_normalizer import UrlNormalizer\nfrom src.domain.value_objects.enums import RenderMode\nfrom src.domain.value_objects.normalized_url import NormalizedUrl\n\n\n@dataclass(frozen=True, slots=True)\nclass ProcessUrlDeps:\n    fetcher: HttpFetcherPort\n    html_parser: HtmlParserPort\n    sd_parser: StructuredDataParserPort\n    pages: PageRepositoryPort\n    urls: UrlRepositoryPort\n    links: LinkRepositoryPort\n    images: ImageRepositoryPort\n    structured: StructuredDataRepositoryPort\n    hreflang: HreflangRepositoryPort\n    queue: UrlQueuePort\n    classifier: LinkClassifier\n    normalizer: UrlNormalizer\n    extractor: ExtractSeoDataUseCase\n    clock: ClockPort\n\n\nclass ProcessUrlUseCase:\n    def 
__init__(self, deps: ProcessUrlDeps) -> None:\n        self._d = deps\n\n    async def execute(self, *, crawl_id: int, url: NormalizedUrl) -> Page:\n        now = self._d.clock.now()\n        fetch = await self._d.fetcher.fetch(url)\n        parsed = self._d.html_parser.parse(fetch.body.decode(\"utf-8\", errors=\"replace\"),\n                                           base_url=url.url)\n        url_entity = self._upsert_url(url, now=now, is_internal=True)\n        extracted = self._d.extractor.execute(\n            crawl_id=crawl_id, url_id=_id_of(url_entity), fetched_at=now,\n            render_mode=RenderMode.HTTP, fetch=fetch, parsed=parsed, page_url=url.url,\n        )\n        page = self._d.pages.save(extracted.page)\n        self._persist_links(page_id=_id_of(page), source_url_id=_id_of(url_entity),\n                            parsed_internal=parsed.internal_links,\n                            parsed_external=parsed.external_links,\n                            crawl_id=crawl_id, now=now)\n        self._persist_images(page_id=_id_of(page), parsed=parsed)\n        self._persist_hreflang(page_id=_id_of(page), parsed=parsed)\n        self._persist_structured_data(page_id=_id_of(page), html=fetch.body, base_url=url.url)\n        self._d.queue.mark_done(crawl_id, url)\n        return page\n\n    def _upsert_url(self, normalized: NormalizedUrl, *, now: object, is_internal: bool) -> Url:\n        return self._d.urls.upsert(Url(\n            id=None, normalized=normalized, is_internal=is_internal,\n            first_seen_at=now, last_seen_at=now,  # type: ignore[arg-type]\n        ))\n\n    def _persist_links(self, *, page_id: int, source_url_id: int,\n                       parsed_internal: tuple[str, ...], parsed_external: tuple[str, ...],\n                       crawl_id: int, now: object) -> None:\n        links: list[Link] = []\n        for raw in parsed_internal:\n            normalized = self._d.normalizer.normalize(raw)\n            target = 
self._upsert_url(normalized, now=now, is_internal=True)\n            links.append(_make_link(page_id, source_url_id, normalized.url,\n                                    target_url_id=_id_of(target), is_internal=True))\n            self._d.queue.enqueue(crawl_id, normalized)\n        for raw in parsed_external:\n            normalized = self._d.normalizer.normalize(raw)\n            links.append(_make_link(page_id, source_url_id, normalized.url,\n                                    target_url_id=None, is_internal=False))\n        self._d.links.save_many(links)\n\n    def _persist_images(self, *, page_id: int, parsed: object) -> None:\n        specs = getattr(parsed, \"image_specs\")\n        imgs = [Image(id=None, page_id=page_id, src=s, alt=a, title=t,\n                      width=w, height=h, loading_attr=None, decoding_attr=None,\n                      http_status=None, content_length=None)\n                for s, a, t, w, h in specs]\n        self._d.images.save_many(imgs)\n\n    def _persist_hreflang(self, *, page_id: int, parsed: object) -> None:\n        self._d.hreflang.save_many(page_id, getattr(parsed, \"hreflang\"))\n\n    def _persist_structured_data(self, *, page_id: int, html: bytes, base_url: str) -> None:\n        items = self._d.sd_parser.parse(html.decode(\"utf-8\", errors=\"replace\"), base_url=base_url)\n        sd = [StructuredDataItem(id=None, page_id=page_id, format=f, schema_type=t, payload=p)\n              for f, t, p in items]\n        self._d.structured.save_many(sd)\n\n\ndef _id_of(entity: object) -> int:\n    v = getattr(entity, \"id\")\n    assert isinstance(v, int)\n    return v\n\n\ndef _make_link(page_id: int, source_url_id: int, target_url: str,\n               *, target_url_id: int | None, is_internal: bool) -> Link:\n    return Link(id=None, page_id=page_id, source_url_id=source_url_id, target_url=target_url,\n                target_url_id=target_url_id, anchor_text=None, rel=frozenset(),\n                
is_internal=is_internal, is_external=not is_internal, position_hint=None)\n"
    }
}

Response

{
    "tool_response": {
        "type": "create",
        "filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/application\/use_cases\/process_url.py",
        "content": "from __future__ import annotations\n\nfrom dataclasses import dataclass\n\nfrom src.application.use_cases.extract_seo_data import ExtractSeoDataUseCase\nfrom src.domain.entities.image import Image\nfrom src.domain.entities.link import Link\nfrom src.domain.entities.page import Page\nfrom src.domain.entities.structured_data_item import StructuredDataItem\nfrom src.domain.entities.url import Url\nfrom src.domain.ports.clock_port import ClockPort\nfrom src.domain.ports.hreflang_repository_port import HreflangRepositoryPort\nfrom src.domain.ports.html_parser_port import HtmlParserPort\nfrom src.domain.ports.http_fetcher_port import HttpFetcherPort\nfrom src.domain.ports.image_repository_port import ImageRepositoryPort\nfrom src.domain.ports.link_repository_port import LinkRepositoryPort\nfrom src.domain.ports.page_repository_port import PageRepositoryPort\nfrom src.domain.ports.structured_data_parser_port import StructuredDataParserPort\nfrom src.domain.ports.structured_data_repository_port import StructuredDataRepositoryPort\nfrom src.domain.ports.url_queue_port import UrlQueuePort\nfrom src.domain.ports.url_repository_port import UrlRepositoryPort\nfrom src.domain.services.link_classifier import LinkClassifier\nfrom src.domain.services.url_normalizer import UrlNormalizer\nfrom src.domain.value_objects.enums import RenderMode\nfrom src.domain.value_objects.normalized_url import NormalizedUrl\n\n\n@dataclass(frozen=True, slots=True)\nclass ProcessUrlDeps:\n    fetcher: HttpFetcherPort\n    html_parser: HtmlParserPort\n    sd_parser: StructuredDataParserPort\n    pages: PageRepositoryPort\n    urls: UrlRepositoryPort\n    links: LinkRepositoryPort\n    images: ImageRepositoryPort\n    structured: StructuredDataRepositoryPort\n    hreflang: HreflangRepositoryPort\n    queue: UrlQueuePort\n    classifier: LinkClassifier\n    normalizer: UrlNormalizer\n    extractor: ExtractSeoDataUseCase\n    clock: ClockPort\n\n\nclass ProcessUrlUseCase:\n    def 
__init__(self, deps: ProcessUrlDeps) -> None:\n        self._d = deps\n\n    async def execute(self, *, crawl_id: int, url: NormalizedUrl) -> Page:\n        now = self._d.clock.now()\n        fetch = await self._d.fetcher.fetch(url)\n        parsed = self._d.html_parser.parse(fetch.body.decode(\"utf-8\", errors=\"replace\"),\n                                           base_url=url.url)\n        url_entity = self._upsert_url(url, now=now, is_internal=True)\n        extracted = self._d.extractor.execute(\n            crawl_id=crawl_id, url_id=_id_of(url_entity), fetched_at=now,\n            render_mode=RenderMode.HTTP, fetch=fetch, parsed=parsed, page_url=url.url,\n        )\n        page = self._d.pages.save(extracted.page)\n        self._persist_links(page_id=_id_of(page), source_url_id=_id_of(url_entity),\n                            parsed_internal=parsed.internal_links,\n                            parsed_external=parsed.external_links,\n                            crawl_id=crawl_id, now=now)\n        self._persist_images(page_id=_id_of(page), parsed=parsed)\n        self._persist_hreflang(page_id=_id_of(page), parsed=parsed)\n        self._persist_structured_data(page_id=_id_of(page), html=fetch.body, base_url=url.url)\n        self._d.queue.mark_done(crawl_id, url)\n        return page\n\n    def _upsert_url(self, normalized: NormalizedUrl, *, now: object, is_internal: bool) -> Url:\n        return self._d.urls.upsert(Url(\n            id=None, normalized=normalized, is_internal=is_internal,\n            first_seen_at=now, last_seen_at=now,  # type: ignore[arg-type]\n        ))\n\n    def _persist_links(self, *, page_id: int, source_url_id: int,\n                       parsed_internal: tuple[str, ...], parsed_external: tuple[str, ...],\n                       crawl_id: int, now: object) -> None:\n        links: list[Link] = []\n        for raw in parsed_internal:\n            normalized = self._d.normalizer.normalize(raw)\n            target = 
self._upsert_url(normalized, now=now, is_internal=True)\n            links.append(_make_link(page_id, source_url_id, normalized.url,\n                                    target_url_id=_id_of(target), is_internal=True))\n            self._d.queue.enqueue(crawl_id, normalized)\n        for raw in parsed_external:\n            normalized = self._d.normalizer.normalize(raw)\n            links.append(_make_link(page_id, source_url_id, normalized.url,\n                                    target_url_id=None, is_internal=False))\n        self._d.links.save_many(links)\n\n    def _persist_images(self, *, page_id: int, parsed: object) -> None:\n        specs = getattr(parsed, \"image_specs\")\n        imgs = [Image(id=None, page_id=page_id, src=s, alt=a, title=t,\n                      width=w, height=h, loading_attr=None, decoding_attr=None,\n                      http_status=None, content_length=None)\n                for s, a, t, w, h in specs]\n        self._d.images.save_many(imgs)\n\n    def _persist_hreflang(self, *, page_id: int, parsed: object) -> None:\n        self._d.hreflang.save_many(page_id, getattr(parsed, \"hreflang\"))\n\n    def _persist_structured_data(self, *, page_id: int, html: bytes, base_url: str) -> None:\n        items = self._d.sd_parser.parse(html.decode(\"utf-8\", errors=\"replace\"), base_url=base_url)\n        sd = [StructuredDataItem(id=None, page_id=page_id, format=f, schema_type=t, payload=p)\n              for f, t, p in items]\n        self._d.structured.save_many(sd)\n\n\ndef _id_of(entity: object) -> int:\n    v = getattr(entity, \"id\")\n    assert isinstance(v, int)\n    return v\n\n\ndef _make_link(page_id: int, source_url_id: int, target_url: str,\n               *, target_url_id: int | None, is_internal: bool) -> Link:\n    return Link(id=None, page_id=page_id, source_url_id=source_url_id, target_url=target_url,\n                target_url_id=target_url_id, anchor_text=None, rel=frozenset(),\n                
is_internal=is_internal, is_external=not is_internal, position_hint=None)\n",
        "structuredPatch": [],
        "originalFile": null,
        "userModified": false
    }
}
← Vorheriger | Zur Liste | Nächster →