Log entry #33548

ID: 33548
Timestamp: 2026-04-21 02:41:36.638674
Client: root
IP: 62.246.34.195
Model: claude-sonnet-4-20250514
Status: completed
Tokens: 6,537 (input: 1,440, output: 5,097)
Duration: 61 ms
Request time: 2026-04-21 02:41:36.638674
Response time: 2026-04-21 02:41:36.699624

Request

{
    "event": "PreToolUse",
    "tool_name": "Write",
    "tool_input": {
        "file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/application\/use_cases\/process_url.py",
        "content": "from __future__ import annotations\n\nfrom dataclasses import dataclass\nfrom datetime import datetime\n\nfrom src.application.use_cases.extract_seo_data import ExtractInput, ExtractSeoDataUseCase\nfrom src.domain.entities.image import Image\nfrom src.domain.entities.link import Link\nfrom src.domain.entities.page import Page\nfrom src.domain.entities.structured_data_item import StructuredDataItem\nfrom src.domain.entities.url import Url\nfrom src.domain.ports.clock_port import ClockPort\nfrom src.domain.ports.hreflang_repository_port import HreflangRepositoryPort\nfrom src.domain.ports.html_parser_port import HtmlParserPort, ParsedDocument\nfrom src.domain.ports.http_fetcher_port import HttpFetcherPort\nfrom src.domain.ports.image_repository_port import ImageRepositoryPort\nfrom src.domain.ports.link_repository_port import LinkRepositoryPort\nfrom src.domain.ports.page_repository_port import PageRepositoryPort\nfrom src.domain.ports.structured_data_parser_port import StructuredDataParserPort\nfrom src.domain.ports.structured_data_repository_port import StructuredDataRepositoryPort\nfrom src.domain.ports.url_queue_port import UrlQueuePort\nfrom src.domain.ports.url_repository_port import UrlRepositoryPort\nfrom src.domain.services.link_classifier import LinkClassifier\nfrom src.domain.services.url_normalizer import UrlNormalizer\nfrom src.domain.value_objects.enums import RenderMode\nfrom src.domain.value_objects.normalized_url import NormalizedUrl\n\n\n@dataclass(frozen=True, slots=True)\nclass ProcessUrlDeps:\n    fetcher: HttpFetcherPort\n    html_parser: HtmlParserPort\n    sd_parser: StructuredDataParserPort\n    pages: PageRepositoryPort\n    urls: UrlRepositoryPort\n    links: LinkRepositoryPort\n    images: ImageRepositoryPort\n    structured: StructuredDataRepositoryPort\n    hreflang: HreflangRepositoryPort\n    queue: UrlQueuePort\n    classifier: LinkClassifier\n    normalizer: UrlNormalizer\n    extractor: ExtractSeoDataUseCase\n    clock: ClockPort\n\n\n@dataclass(frozen=True, slots=True)\nclass _PageContext:\n    crawl_id: int\n    page_id: int\n    source_url_id: int\n    now: datetime\n\n\nclass ProcessUrlUseCase:\n    def __init__(self, deps: ProcessUrlDeps) -> None:\n        self._d = deps\n\n    async def execute(self, *, crawl_id: int, url: NormalizedUrl) -> Page:\n        now = self._d.clock.now()\n        fetch = await self._d.fetcher.fetch(url)\n        html = fetch.body.decode(\"utf-8\", errors=\"replace\")\n        parsed = self._d.html_parser.parse(html, base_url=url.url)\n        url_entity = _upsert_url(self._d.urls, url, now=now, is_internal=True)\n        extracted = self._d.extractor.execute(\n            ExtractInput(\n                crawl_id=crawl_id, url_id=_id(url_entity), fetched_at=now,\n                render_mode=RenderMode.HTTP, fetch=fetch, parsed=parsed, page_url=url.url,\n            ),\n        )\n        page = self._d.pages.save(extracted.page)\n        ctx = _PageContext(crawl_id, _id(page), _id(url_entity), now)\n        _persist_links(self._d, ctx, parsed)\n        _persist_images(self._d.images, ctx.page_id, parsed)\n        self._d.hreflang.save_many(ctx.page_id, parsed.hreflang)\n        _persist_structured_data(self._d, ctx.page_id, html, base_url=url.url)\n        self._d.queue.mark_done(crawl_id, url)\n        return page\n\n\ndef _upsert_url(\n    repo: UrlRepositoryPort, normalized: NormalizedUrl, *, now: datetime, is_internal: bool,\n) -> Url:\n    return repo.upsert(\n        Url(\n            id=None, 
normalized=normalized, is_internal=is_internal,\n            first_seen_at=now, last_seen_at=now,\n        ),\n    )\n\n\ndef _persist_links(deps: ProcessUrlDeps, ctx: _PageContext, parsed: ParsedDocument) -> None:\n    links: list[Link] = []\n    for raw in parsed.internal_links:\n        normalized = deps.normalizer.normalize(raw)\n        target = _upsert_url(deps.urls, normalized, now=ctx.now, is_internal=True)\n        links.append(_make_link(ctx, normalized.url, _id(target), is_internal=True))\n        deps.queue.enqueue(ctx.crawl_id, normalized)\n    for raw in parsed.external_links:\n        normalized = deps.normalizer.normalize(raw)\n        links.append(_make_link(ctx, normalized.url, None, is_internal=False))\n    deps.links.save_many(links)\n\n\ndef _persist_images(repo: ImageRepositoryPort, page_id: int, parsed: ParsedDocument) -> None:\n    imgs = [\n        Image(id=None, page_id=page_id, src=s, alt=a, title=t, width=w, height=h,\n              loading_attr=None, decoding_attr=None, http_status=None, content_length=None)\n        for s, a, t, w, h in parsed.image_specs\n    ]\n    repo.save_many(imgs)\n\n\ndef _persist_structured_data(\n    deps: ProcessUrlDeps, page_id: int, html: str, *, base_url: str,\n) -> None:\n    items = deps.sd_parser.parse(html, base_url=base_url)\n    sd = [\n        StructuredDataItem(id=None, page_id=page_id, format=f, schema_type=t, payload=p)\n        for f, t, p in items\n    ]\n    deps.structured.save_many(sd)\n\n\ndef _id(entity: Page | Url) -> int:\n    if entity.id is None:\n        raise RuntimeError(f\"{type(entity).__name__} has no id after save\")\n    return entity.id\n\n\ndef _make_link(\n    ctx: _PageContext, target_url: str, target_url_id: int | None, *, is_internal: bool,\n) -> Link:\n    return Link(\n        id=None, page_id=ctx.page_id, source_url_id=ctx.source_url_id, target_url=target_url,\n        target_url_id=target_url_id, anchor_text=None, rel=frozenset(),\n        is_internal=is_internal, is_external=not is_internal, position_hint=None,\n    )\n"
    }
}
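
For context: this PreToolUse event carries the tool_name ("Write") and the full tool_input, i.e. the file about to be written. The file itself follows a ports-and-adapters layout in which every collaborator of ProcessUrlUseCase is declared as a port and injected through the frozen ProcessUrlDeps dataclass. Below is a minimal standalone sketch of that wiring pattern; the names are illustrative, not taken from the project.

from __future__ import annotations

from dataclasses import dataclass
from typing import Protocol


class ClockPort(Protocol):
    """Port: anything with a matching now() method satisfies it structurally."""
    def now(self) -> str: ...


@dataclass(frozen=True, slots=True)
class Deps:
    """Immutable bundle of collaborators, mirroring ProcessUrlDeps."""
    clock: ClockPort


class FixedClock:
    """Adapter: a deterministic clock, handy for tests."""
    def now(self) -> str:
        return "2026-04-21T02:41:36"


class StampUseCase:
    """Use case that talks only to its ports, never to concrete adapters."""
    def __init__(self, deps: Deps) -> None:
        self._d = deps

    def execute(self, label: str) -> str:
        return f"{label} @ {self._d.clock.now()}"


print(StampUseCase(Deps(clock=FixedClock())).execute("crawl"))  # crawl @ 2026-04-21T02:41:36

The frozen, slotted deps container keeps the dependency graph explicit and read-only, which is what lets the logged refactor move the private methods out to module-level functions that receive their collaborators as plain arguments.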

Response

{
    "tool_response": {
        "type": "update",
        "filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/application\/use_cases\/process_url.py",
        "content": "from __future__ import annotations\n\nfrom dataclasses import dataclass\nfrom datetime import datetime\n\nfrom src.application.use_cases.extract_seo_data import ExtractInput, ExtractSeoDataUseCase\nfrom src.domain.entities.image import Image\nfrom src.domain.entities.link import Link\nfrom src.domain.entities.page import Page\nfrom src.domain.entities.structured_data_item import StructuredDataItem\nfrom src.domain.entities.url import Url\nfrom src.domain.ports.clock_port import ClockPort\nfrom src.domain.ports.hreflang_repository_port import HreflangRepositoryPort\nfrom src.domain.ports.html_parser_port import HtmlParserPort, ParsedDocument\nfrom src.domain.ports.http_fetcher_port import HttpFetcherPort\nfrom src.domain.ports.image_repository_port import ImageRepositoryPort\nfrom src.domain.ports.link_repository_port import LinkRepositoryPort\nfrom src.domain.ports.page_repository_port import PageRepositoryPort\nfrom src.domain.ports.structured_data_parser_port import StructuredDataParserPort\nfrom src.domain.ports.structured_data_repository_port import StructuredDataRepositoryPort\nfrom src.domain.ports.url_queue_port import UrlQueuePort\nfrom src.domain.ports.url_repository_port import UrlRepositoryPort\nfrom src.domain.services.link_classifier import LinkClassifier\nfrom src.domain.services.url_normalizer import UrlNormalizer\nfrom src.domain.value_objects.enums import RenderMode\nfrom src.domain.value_objects.normalized_url import NormalizedUrl\n\n\n@dataclass(frozen=True, slots=True)\nclass ProcessUrlDeps:\n    fetcher: HttpFetcherPort\n    html_parser: HtmlParserPort\n    sd_parser: StructuredDataParserPort\n    pages: PageRepositoryPort\n    urls: UrlRepositoryPort\n    links: LinkRepositoryPort\n    images: ImageRepositoryPort\n    structured: StructuredDataRepositoryPort\n    hreflang: HreflangRepositoryPort\n    queue: UrlQueuePort\n    classifier: LinkClassifier\n    normalizer: UrlNormalizer\n    extractor: ExtractSeoDataUseCase\n    clock: ClockPort\n\n\n@dataclass(frozen=True, slots=True)\nclass _PageContext:\n    crawl_id: int\n    page_id: int\n    source_url_id: int\n    now: datetime\n\n\nclass ProcessUrlUseCase:\n    def __init__(self, deps: ProcessUrlDeps) -> None:\n        self._d = deps\n\n    async def execute(self, *, crawl_id: int, url: NormalizedUrl) -> Page:\n        now = self._d.clock.now()\n        fetch = await self._d.fetcher.fetch(url)\n        html = fetch.body.decode(\"utf-8\", errors=\"replace\")\n        parsed = self._d.html_parser.parse(html, base_url=url.url)\n        url_entity = _upsert_url(self._d.urls, url, now=now, is_internal=True)\n        extracted = self._d.extractor.execute(\n            ExtractInput(\n                crawl_id=crawl_id, url_id=_id(url_entity), fetched_at=now,\n                render_mode=RenderMode.HTTP, fetch=fetch, parsed=parsed, page_url=url.url,\n            ),\n        )\n        page = self._d.pages.save(extracted.page)\n        ctx = _PageContext(crawl_id, _id(page), _id(url_entity), now)\n        _persist_links(self._d, ctx, parsed)\n        _persist_images(self._d.images, ctx.page_id, parsed)\n        self._d.hreflang.save_many(ctx.page_id, parsed.hreflang)\n        _persist_structured_data(self._d, ctx.page_id, html, base_url=url.url)\n        self._d.queue.mark_done(crawl_id, url)\n        return page\n\n\ndef _upsert_url(\n    repo: UrlRepositoryPort, normalized: NormalizedUrl, *, now: datetime, is_internal: bool,\n) -> Url:\n    return repo.upsert(\n        Url(\n            id=None, 
normalized=normalized, is_internal=is_internal,\n            first_seen_at=now, last_seen_at=now,\n        ),\n    )\n\n\ndef _persist_links(deps: ProcessUrlDeps, ctx: _PageContext, parsed: ParsedDocument) -> None:\n    links: list[Link] = []\n    for raw in parsed.internal_links:\n        normalized = deps.normalizer.normalize(raw)\n        target = _upsert_url(deps.urls, normalized, now=ctx.now, is_internal=True)\n        links.append(_make_link(ctx, normalized.url, _id(target), is_internal=True))\n        deps.queue.enqueue(ctx.crawl_id, normalized)\n    for raw in parsed.external_links:\n        normalized = deps.normalizer.normalize(raw)\n        links.append(_make_link(ctx, normalized.url, None, is_internal=False))\n    deps.links.save_many(links)\n\n\ndef _persist_images(repo: ImageRepositoryPort, page_id: int, parsed: ParsedDocument) -> None:\n    imgs = [\n        Image(id=None, page_id=page_id, src=s, alt=a, title=t, width=w, height=h,\n              loading_attr=None, decoding_attr=None, http_status=None, content_length=None)\n        for s, a, t, w, h in parsed.image_specs\n    ]\n    repo.save_many(imgs)\n\n\ndef _persist_structured_data(\n    deps: ProcessUrlDeps, page_id: int, html: str, *, base_url: str,\n) -> None:\n    items = deps.sd_parser.parse(html, base_url=base_url)\n    sd = [\n        StructuredDataItem(id=None, page_id=page_id, format=f, schema_type=t, payload=p)\n        for f, t, p in items\n    ]\n    deps.structured.save_many(sd)\n\n\ndef _id(entity: Page | Url) -> int:\n    if entity.id is None:\n        raise RuntimeError(f\"{type(entity).__name__} has no id after save\")\n    return entity.id\n\n\ndef _make_link(\n    ctx: _PageContext, target_url: str, target_url_id: int | None, *, is_internal: bool,\n) -> Link:\n    return Link(\n        id=None, page_id=ctx.page_id, source_url_id=ctx.source_url_id, target_url=target_url,\n        target_url_id=target_url_id, anchor_text=None, rel=frozenset(),\n        is_internal=is_internal, is_external=not is_internal, position_hint=None,\n    )\n",
        "structuredPatch": [
            {
                "oldStart": 45,
                "oldLines": 10,
                "newStart": 45,
                "newLines": 10,
                "lines": [
                    " ",
                    " ",
                    " @dataclass(frozen=True, slots=True)",
                    "-class _LinksContext:",
                    "+class _PageContext:",
                    "+    crawl_id: int",
                    "     page_id: int",
                    "     source_url_id: int",
                    "-    crawl_id: int",
                    "     now: datetime",
                    " ",
                    " "
                ]
            },
            {
                "oldStart": 59,
                "oldLines": 100,
                "newStart": 59,
                "newLines": 69,
                "lines": [
                    "     async def execute(self, *, crawl_id: int, url: NormalizedUrl) -> Page:",
                    "         now = self._d.clock.now()",
                    "         fetch = await self._d.fetcher.fetch(url)",
                    "-        parsed = self._d.html_parser.parse(",
                    "-            fetch.body.decode(\"utf-8\", errors=\"replace\"),",
                    "-            base_url=url.url,",
                    "-        )",
                    "-        url_entity = self._upsert_url(url, now=now, is_internal=True)",
                    "+        html = fetch.body.decode(\"utf-8\", errors=\"replace\")",
                    "+        parsed = self._d.html_parser.parse(html, base_url=url.url)",
                    "+        url_entity = _upsert_url(self._d.urls, url, now=now, is_internal=True)",
                    "         extracted = self._d.extractor.execute(",
                    "             ExtractInput(",
                    "-                crawl_id=crawl_id,",
                    "-                url_id=_id(url_entity),",
                    "-                fetched_at=now,",
                    "-                render_mode=RenderMode.HTTP,",
                    "-                fetch=fetch,",
                    "-                parsed=parsed,",
                    "-                page_url=url.url,",
                    "+                crawl_id=crawl_id, url_id=_id(url_entity), fetched_at=now,",
                    "+                render_mode=RenderMode.HTTP, fetch=fetch, parsed=parsed, page_url=url.url,",
                    "             ),",
                    "         )",
                    "         page = self._d.pages.save(extracted.page)",
                    "-        ctx = _LinksContext(",
                    "-            page_id=_id(page), source_url_id=_id(url_entity), crawl_id=crawl_id, now=now",
                    "-        )",
                    "-        self._persist_links(ctx, parsed=parsed)",
                    "-        self._persist_images(page_id=_id(page), parsed=parsed)",
                    "-        self._d.hreflang.save_many(_id(page), parsed.hreflang)",
                    "-        self._persist_structured_data(page_id=_id(page), html=fetch.body, base_url=url.url)",
                    "+        ctx = _PageContext(crawl_id, _id(page), _id(url_entity), now)",
                    "+        _persist_links(self._d, ctx, parsed)",
                    "+        _persist_images(self._d.images, ctx.page_id, parsed)",
                    "+        self._d.hreflang.save_many(ctx.page_id, parsed.hreflang)",
                    "+        _persist_structured_data(self._d, ctx.page_id, html, base_url=url.url)",
                    "         self._d.queue.mark_done(crawl_id, url)",
                    "         return page",
                    " ",
                    "-    def _upsert_url(self, normalized: NormalizedUrl, *, now: datetime, is_internal: bool) -> Url:",
                    "-        return self._d.urls.upsert(",
                    "-            Url(",
                    "-                id=None,",
                    "-                normalized=normalized,",
                    "-                is_internal=is_internal,",
                    "-                first_seen_at=now,",
                    "-                last_seen_at=now,",
                    "-            )",
                    "-        )",
                    " ",
                    "-    def _persist_links(self, ctx: _LinksContext, *, parsed: ParsedDocument) -> None:",
                    "-        links: list[Link] = []",
                    "-        for raw in parsed.internal_links:",
                    "-            normalized = self._d.normalizer.normalize(raw)",
                    "-            target = self._upsert_url(normalized, now=ctx.now, is_internal=True)",
                    "-            links.append(",
                    "-                _make_link(",
                    "-                    ctx.page_id,",
                    "-                    ctx.source_url_id,",
                    "-                    normalized.url,",
                    "-                    target_url_id=_id(target),",
                    "-                    is_internal=True,",
                    "-                )",
                    "-            )",
                    "-            self._d.queue.enqueue(ctx.crawl_id, normalized)",
                    "-        for raw in parsed.external_links:",
                    "-            normalized = self._d.normalizer.normalize(raw)",
                    "-            links.append(",
                    "-                _make_link(",
                    "-                    ctx.page_id,",
                    "-                    ctx.source_url_id,",
                    "-                    normalized.url,",
                    "-                    target_url_id=None,",
                    "-                    is_internal=False,",
                    "-                )",
                    "-            )",
                    "-        self._d.links.save_many(links)",
                    "+def _upsert_url(",
                    "+    repo: UrlRepositoryPort, normalized: NormalizedUrl, *, now: datetime, is_internal: bool,",
                    "+) -> Url:",
                    "+    return repo.upsert(",
                    "+        Url(",
                    "+            id=None, normalized=normalized, is_internal=is_internal,",
                    "+            first_seen_at=now, last_seen_at=now,",
                    "+        ),",
                    "+    )",
                    " ",
                    "-    def _persist_images(self, *, page_id: int, parsed: ParsedDocument) -> None:",
                    "-        imgs = [",
                    "-            Image(",
                    "-                id=None,",
                    "-                page_id=page_id,",
                    "-                src=s,",
                    "-                alt=a,",
                    "-                title=t,",
                    "-                width=w,",
                    "-                height=h,",
                    "-                loading_attr=None,",
                    "-                decoding_attr=None,",
                    "-                http_status=None,",
                    "-                content_length=None,",
                    "-            )",
                    "-            for s, a, t, w, h in parsed.image_specs",
                    "-        ]",
                    "-        self._d.images.save_many(imgs)",
                    " ",
                    "-    def _persist_structured_data(self, *, page_id: int, html: bytes, base_url: str) -> None:",
                    "-        items = self._d.sd_parser.parse(html.decode(\"utf-8\", errors=\"replace\"), base_url=base_url)",
                    "-        sd = [",
                    "-            StructuredDataItem(id=None, page_id=page_id, format=f, schema_type=t, payload=p)",
                    "-            for f, t, p in items",
                    "-        ]",
                    "-        self._d.structured.save_many(sd)",
                    "+def _persist_links(deps: ProcessUrlDeps, ctx: _PageContext, parsed: ParsedDocument) -> None:",
                    "+    links: list[Link] = []",
                    "+    for raw in parsed.internal_links:",
                    "+        normalized = deps.normalizer.normalize(raw)",
                    "+        target = _upsert_url(deps.urls, normalized, now=ctx.now, is_internal=True)",
                    "+        links.append(_make_link(ctx, normalized.url, _id(target), is_internal=True))",
                    "+        deps.queue.enqueue(ctx.crawl_id, normalized)",
                    "+    for raw in parsed.external_links:",
                    "+        normalized = deps.normalizer.normalize(raw)",
                    "+        links.append(_make_link(ctx, normalized.url, None, is_internal=False))",
                    "+    deps.links.save_many(links)",
                    " ",
                    " ",
                    "+def _persist_images(repo: ImageRepositoryPort, page_id: int, parsed: ParsedDocument) -> None:",
                    "+    imgs = [",
                    "+        Image(id=None, page_id=page_id, src=s, alt=a, title=t, width=w, height=h,",
                    "+              loading_attr=None, decoding_attr=None, http_status=None, content_length=None)",
                    "+        for s, a, t, w, h in parsed.image_specs",
                    "+    ]",
                    "+    repo.save_many(imgs)",
                    "+",
                    "+",
                    "+def _persist_structured_data(",
                    "+    deps: ProcessUrlDeps, page_id: int, html: str, *, base_url: str,",
                    "+) -> None:",
                    "+    items = deps.sd_parser.parse(html, base_url=base_url)",
                    "+    sd = [",
                    "+        StructuredDataItem(id=None, page_id=page_id, format=f, schema_type=t, payload=p)",
                    "+        for f, t, p in items",
                    "+    ]",
                    "+    deps.structured.save_many(sd)",
                    "+",
                    "+",
                    " def _id(entity: Page | Url) -> int:",
                    "     if entity.id is None:",
                    "         raise RuntimeError(f\"{type(entity).__name__} has no id after save\")"
                ]
            },
            {
                "oldStart": 160,
                "oldLines": 22,
                "newStart": 129,
                "newLines": 10,
                "lines": [
                    " ",
                    " ",
                    " def _make_link(",
                    "-    page_id: int,",
                    "-    source_url_id: int,",
                    "-    target_url: str,",
                    "-    *,",
                    "-    target_url_id: int | None,",
                    "-    is_internal: bool,",
                    "+    ctx: _PageContext, target_url: str, target_url_id: int | None, *, is_internal: bool,",
                    " ) -> Link:",
                    "     return Link(",
                    "-        id=None,",
                    "-        page_id=page_id,",
                    "-        source_url_id=source_url_id,",
                    "-        target_url=target_url,",
                    "-        target_url_id=target_url_id,",
                    "-        anchor_text=None,",
                    "-        rel=frozenset(),",
                    "-        is_internal=is_internal,",
                    "-        is_external=not is_internal,",
                    "-        position_hint=None,",
                    "+        id=None, page_id=ctx.page_id, source_url_id=ctx.source_url_id, target_url=target_url,",
                    "+        target_url_id=target_url_id, anchor_text=None, rel=frozenset(),",
                    "+        is_internal=is_internal, is_external=not is_internal, position_hint=None,",
                    "     )"
                ]
            }
        ],
        "originalFile": "from __future__ import annotations\n\nfrom dataclasses import dataclass\nfrom datetime import datetime\n\nfrom src.application.use_cases.extract_seo_data import ExtractInput, ExtractSeoDataUseCase\nfrom src.domain.entities.image import Image\nfrom src.domain.entities.link import Link\nfrom src.domain.entities.page import Page\nfrom src.domain.entities.structured_data_item import StructuredDataItem\nfrom src.domain.entities.url import Url\nfrom src.domain.ports.clock_port import ClockPort\nfrom src.domain.ports.hreflang_repository_port import HreflangRepositoryPort\nfrom src.domain.ports.html_parser_port import HtmlParserPort, ParsedDocument\nfrom src.domain.ports.http_fetcher_port import HttpFetcherPort\nfrom src.domain.ports.image_repository_port import ImageRepositoryPort\nfrom src.domain.ports.link_repository_port import LinkRepositoryPort\nfrom src.domain.ports.page_repository_port import PageRepositoryPort\nfrom src.domain.ports.structured_data_parser_port import StructuredDataParserPort\nfrom src.domain.ports.structured_data_repository_port import StructuredDataRepositoryPort\nfrom src.domain.ports.url_queue_port import UrlQueuePort\nfrom src.domain.ports.url_repository_port import UrlRepositoryPort\nfrom src.domain.services.link_classifier import LinkClassifier\nfrom src.domain.services.url_normalizer import UrlNormalizer\nfrom src.domain.value_objects.enums import RenderMode\nfrom src.domain.value_objects.normalized_url import NormalizedUrl\n\n\n@dataclass(frozen=True, slots=True)\nclass ProcessUrlDeps:\n    fetcher: HttpFetcherPort\n    html_parser: HtmlParserPort\n    sd_parser: StructuredDataParserPort\n    pages: PageRepositoryPort\n    urls: UrlRepositoryPort\n    links: LinkRepositoryPort\n    images: ImageRepositoryPort\n    structured: StructuredDataRepositoryPort\n    hreflang: HreflangRepositoryPort\n    queue: UrlQueuePort\n    classifier: LinkClassifier\n    normalizer: UrlNormalizer\n    extractor: ExtractSeoDataUseCase\n    clock: ClockPort\n\n\n@dataclass(frozen=True, slots=True)\nclass _LinksContext:\n    page_id: int\n    source_url_id: int\n    crawl_id: int\n    now: datetime\n\n\nclass ProcessUrlUseCase:\n    def __init__(self, deps: ProcessUrlDeps) -> None:\n        self._d = deps\n\n    async def execute(self, *, crawl_id: int, url: NormalizedUrl) -> Page:\n        now = self._d.clock.now()\n        fetch = await self._d.fetcher.fetch(url)\n        parsed = self._d.html_parser.parse(\n            fetch.body.decode(\"utf-8\", errors=\"replace\"),\n            base_url=url.url,\n        )\n        url_entity = self._upsert_url(url, now=now, is_internal=True)\n        extracted = self._d.extractor.execute(\n            ExtractInput(\n                crawl_id=crawl_id,\n                url_id=_id(url_entity),\n                fetched_at=now,\n                render_mode=RenderMode.HTTP,\n                fetch=fetch,\n                parsed=parsed,\n                page_url=url.url,\n            ),\n        )\n        page = self._d.pages.save(extracted.page)\n        ctx = _LinksContext(\n            page_id=_id(page), source_url_id=_id(url_entity), crawl_id=crawl_id, now=now\n        )\n        self._persist_links(ctx, parsed=parsed)\n        self._persist_images(page_id=_id(page), parsed=parsed)\n        self._d.hreflang.save_many(_id(page), parsed.hreflang)\n        self._persist_structured_data(page_id=_id(page), html=fetch.body, base_url=url.url)\n        self._d.queue.mark_done(crawl_id, url)\n        return page\n\n    def 
_upsert_url(self, normalized: NormalizedUrl, *, now: datetime, is_internal: bool) -> Url:\n        return self._d.urls.upsert(\n            Url(\n                id=None,\n                normalized=normalized,\n                is_internal=is_internal,\n                first_seen_at=now,\n                last_seen_at=now,\n            )\n        )\n\n    def _persist_links(self, ctx: _LinksContext, *, parsed: ParsedDocument) -> None:\n        links: list[Link] = []\n        for raw in parsed.internal_links:\n            normalized = self._d.normalizer.normalize(raw)\n            target = self._upsert_url(normalized, now=ctx.now, is_internal=True)\n            links.append(\n                _make_link(\n                    ctx.page_id,\n                    ctx.source_url_id,\n                    normalized.url,\n                    target_url_id=_id(target),\n                    is_internal=True,\n                )\n            )\n            self._d.queue.enqueue(ctx.crawl_id, normalized)\n        for raw in parsed.external_links:\n            normalized = self._d.normalizer.normalize(raw)\n            links.append(\n                _make_link(\n                    ctx.page_id,\n                    ctx.source_url_id,\n                    normalized.url,\n                    target_url_id=None,\n                    is_internal=False,\n                )\n            )\n        self._d.links.save_many(links)\n\n    def _persist_images(self, *, page_id: int, parsed: ParsedDocument) -> None:\n        imgs = [\n            Image(\n                id=None,\n                page_id=page_id,\n                src=s,\n                alt=a,\n                title=t,\n                width=w,\n                height=h,\n                loading_attr=None,\n                decoding_attr=None,\n                http_status=None,\n                content_length=None,\n            )\n            for s, a, t, w, h in parsed.image_specs\n        ]\n        self._d.images.save_many(imgs)\n\n    def _persist_structured_data(self, *, page_id: int, html: bytes, base_url: str) -> None:\n        items = self._d.sd_parser.parse(html.decode(\"utf-8\", errors=\"replace\"), base_url=base_url)\n        sd = [\n            StructuredDataItem(id=None, page_id=page_id, format=f, schema_type=t, payload=p)\n            for f, t, p in items\n        ]\n        self._d.structured.save_many(sd)\n\n\ndef _id(entity: Page | Url) -> int:\n    if entity.id is None:\n        raise RuntimeError(f\"{type(entity).__name__} has no id after save\")\n    return entity.id\n\n\ndef _make_link(\n    page_id: int,\n    source_url_id: int,\n    target_url: str,\n    *,\n    target_url_id: int | None,\n    is_internal: bool,\n) -> Link:\n    return Link(\n        id=None,\n        page_id=page_id,\n        source_url_id=source_url_id,\n        target_url=target_url,\n        target_url_id=target_url_id,\n        anchor_text=None,\n        rel=frozenset(),\n        is_internal=is_internal,\n        is_external=not is_internal,\n        position_hint=None,\n    )\n",
        "userModified": false
    }
}
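
The structuredPatch entries above mirror unified-diff hunks: oldStart/oldLines and newStart/newLines correspond to the @@ header, and every entry in lines already carries its own " ", "-", or "+" prefix. Assuming those field semantics (they are not documented in this log, so treat this as a sketch), a hunk can be rendered back to plain diff text:

from __future__ import annotations


def hunk_to_unified(hunk: dict) -> str:
    """Render one structuredPatch hunk as a unified-diff hunk (assumed semantics)."""
    header = (
        f"@@ -{hunk['oldStart']},{hunk['oldLines']}"
        f" +{hunk['newStart']},{hunk['newLines']} @@"
    )
    # Each line already starts with its ' ', '-', or '+' marker.
    return "\n".join([header, *hunk["lines"]])


# First hunk from the response: _LinksContext is renamed to _PageContext
# and crawl_id moves to the front of the dataclass.
hunk = {
    "oldStart": 45, "oldLines": 10, "newStart": 45, "newLines": 10,
    "lines": [
        " ", " ",
        " @dataclass(frozen=True, slots=True)",
        "-class _LinksContext:",
        "+class _PageContext:",
        "+    crawl_id: int",
        "     page_id: int",
        "     source_url_id: int",
        "-    crawl_id: int",
        "     now: datetime",
        " ", " ",
    ],
}
print(hunk_to_unified(hunk))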