Protokoll #33626

ID: 33626
Zeitstempel: 2026-04-21 02:57:12.865910
Client: root
IP: 62.246.34.195
Modell: claude-sonnet-4-20250514
Status: completed
Tokens: 2,013 (Input: 148, Output: 1,865)
Dauer: 67 ms
Request-Zeit: 2026-04-21 02:57:12.865910
Response-Zeit: 2026-04-21 02:57:12.933198

Request

{
    "event": "PreToolUse",
    "tool_name": "Edit",
    "tool_input": {
        "file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/infrastructure\/parsers\/html_parser_bs4.py",
        "old_string": "from urllib.parse import urljoin\n\nfrom bs4 import BeautifulSoup, Tag\n\nfrom src.domain.ports.html_parser_port import HtmlParserPort, ParsedDocument",
        "new_string": "from urllib.parse import urljoin\n\nfrom bs4 import BeautifulSoup, Tag\n\nfrom src.domain.ports.html_parser_port import HtmlParserPort, ParsedDocument\nfrom src.domain.value_objects.normalized_url import normalize",
        "replace_all": false
    }
}

Response

{
    "tool_response": {
        "filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/infrastructure\/parsers\/html_parser_bs4.py",
        "oldString": "from urllib.parse import urljoin\n\nfrom bs4 import BeautifulSoup, Tag\n\nfrom src.domain.ports.html_parser_port import HtmlParserPort, ParsedDocument",
        "newString": "from urllib.parse import urljoin\n\nfrom bs4 import BeautifulSoup, Tag\n\nfrom src.domain.ports.html_parser_port import HtmlParserPort, ParsedDocument\nfrom src.domain.value_objects.normalized_url import normalize",
        "originalFile": "from __future__ import annotations\n\nimport hashlib\nfrom urllib.parse import urljoin\n\nfrom bs4 import BeautifulSoup, Tag\n\nfrom src.domain.ports.html_parser_port import HtmlParserPort, ParsedDocument\nfrom src.domain.value_objects.body_stats import BodyStats\nfrom src.domain.value_objects.head_meta import HeadMeta\nfrom src.domain.value_objects.hreflang import HreflangEntry\nfrom src.domain.value_objects.meta_robots import parse_meta_robots\nfrom src.domain.value_objects.open_graph_meta import OpenGraphMeta\nfrom src.domain.value_objects.twitter_meta import TwitterMeta\n\n_ImageSpec = tuple[str, str | None, str | None, int | None, int | None]\n\n\nclass Bs4HtmlParser(HtmlParserPort):\n    def parse(self, html: str, *, base_url: str) -> ParsedDocument:\n        soup = BeautifulSoup(html, \"lxml\")\n        head = _parse_head(soup)\n        og = _parse_og(soup)\n        twitter = _parse_twitter(soup)\n        body = _parse_body(soup)\n        hreflang = _parse_hreflang(soup)\n        internal, external = _parse_links(soup, base_url=base_url)\n        images = _parse_images(soup, base_url=base_url)\n        return ParsedDocument(\n            head=head, og=og, twitter=twitter, body=body, hreflang=hreflang,\n            internal_links=internal, external_links=external, image_specs=images,\n        )\n\n\ndef _meta(soup: BeautifulSoup, name: str) -> str | None:\n    tag = soup.find(\"meta\", attrs={\"name\": name})\n    return _content(tag)\n\n\ndef _property(soup: BeautifulSoup, prop: str) -> str | None:\n    tag = soup.find(\"meta\", attrs={\"property\": prop})\n    return _content(tag)\n\n\ndef _content(tag: object) -> str | None:\n    if isinstance(tag, Tag):\n        value = tag.get(\"content\")\n        if isinstance(value, str):\n            return value\n    return None\n\n\ndef _parse_head(soup: BeautifulSoup) -> HeadMeta:\n    title_tag = soup.find(\"title\")\n    canonical = soup.find(\"link\", rel=\"canonical\")\n    
canonical_href = canonical.get(\"href\") if isinstance(canonical, Tag) else None\n    lang_tag = soup.find(\"html\")\n    lang = lang_tag.get(\"lang\") if isinstance(lang_tag, Tag) else None\n    charset_tag = soup.find(\"meta\", attrs={\"charset\": True})\n    charset = charset_tag.get(\"charset\") if isinstance(charset_tag, Tag) else None\n    return HeadMeta(\n        title=title_tag.get_text(strip=True) if title_tag else None,\n        description=_meta(soup, \"description\"),\n        keywords=_meta(soup, \"keywords\"),\n        robots=parse_meta_robots(_meta(soup, \"robots\")),\n        canonical=str(canonical_href) if canonical_href else None,\n        lang=str(lang) if lang else None,\n        charset=str(charset) if charset else None,\n        viewport=_meta(soup, \"viewport\"),\n    )\n\n\ndef _parse_og(soup: BeautifulSoup) -> OpenGraphMeta:\n    return OpenGraphMeta(\n        title=_property(soup, \"og:title\"),\n        description=_property(soup, \"og:description\"),\n        image=_property(soup, \"og:image\"),\n        og_type=_property(soup, \"og:type\"),\n        url=_property(soup, \"og:url\"),\n        site_name=_property(soup, \"og:site_name\"),\n        locale=_property(soup, \"og:locale\"),\n    )\n\n\ndef _parse_twitter(soup: BeautifulSoup) -> TwitterMeta:\n    return TwitterMeta(\n        card=_meta(soup, \"twitter:card\"),\n        title=_meta(soup, \"twitter:title\"),\n        description=_meta(soup, \"twitter:description\"),\n        image=_meta(soup, \"twitter:image\"),\n    )\n\n\ndef _parse_body(soup: BeautifulSoup) -> BodyStats:\n    h1 = tuple(t.get_text(strip=True) for t in soup.find_all(\"h1\"))\n    h2 = tuple(t.get_text(strip=True) for t in soup.find_all(\"h2\"))\n    text = soup.get_text(separator=\" \", strip=True)\n    words = text.split()\n    text_hash = hashlib.sha256(text.encode(\"utf-8\")).hexdigest() if text else None\n    return BodyStats(\n        h1_texts=h1, h2_texts=h2,\n        
h3_count=len(soup.find_all(\"h3\")),\n        h4_count=len(soup.find_all(\"h4\")),\n        h5_count=len(soup.find_all(\"h5\")),\n        h6_count=len(soup.find_all(\"h6\")),\n        word_count=len(words), text_hash=text_hash,\n    )\n\n\ndef _parse_hreflang(soup: BeautifulSoup) -> tuple[HreflangEntry, ...]:\n    result: list[HreflangEntry] = []\n    for link in soup.find_all(\"link\", rel=\"alternate\"):\n        if not isinstance(link, Tag):\n            continue\n        lang = link.get(\"hreflang\")\n        href = link.get(\"href\")\n        if isinstance(lang, str) and isinstance(href, str):\n            result.append(HreflangEntry(lang=lang, href=href))\n    return tuple(result)\n\n\ndef _parse_links(\n    soup: BeautifulSoup, *, base_url: str,\n) -> tuple[tuple[str, ...], tuple[str, ...]]:\n    from src.domain.value_objects.normalized_url import normalize\n\n    internal: list[str] = []\n    external: list[str] = []\n    base_host = normalize(base_url).host\n    for a in soup.find_all(\"a\", href=True):\n        if not isinstance(a, Tag):\n            continue\n        href = a.get(\"href\")\n        if not isinstance(href, str):\n            continue\n        absolute = urljoin(base_url, href)\n        if absolute.startswith((\"mailto:\", \"tel:\", \"javascript:\")):\n            continue\n        host = normalize(absolute).host\n        if host == base_host or host.endswith(\".\" + base_host):\n            internal.append(absolute)\n        else:\n            external.append(absolute)\n    return tuple(internal), tuple(external)\n\n\ndef _parse_images(\n    soup: BeautifulSoup, *, base_url: str,\n) -> tuple[_ImageSpec, ...]:\n    result: list[_ImageSpec] = []\n    for img in soup.find_all(\"img\"):\n        if not isinstance(img, Tag):\n            continue\n        src = img.get(\"src\")\n        if not isinstance(src, str):\n            continue\n        result.append((\n            urljoin(base_url, src),\n            _str_or_none(img.get(\"alt\")),\n 
           _str_or_none(img.get(\"title\")),\n            _int_or_none(img.get(\"width\")),\n            _int_or_none(img.get(\"height\")),\n        ))\n    return tuple(result)\n\n\ndef _str_or_none(value: object) -> str | None:\n    return value if isinstance(value, str) else None\n\n\ndef _int_or_none(value: object) -> int | None:\n    if not isinstance(value, str):\n        return None\n    try:\n        return int(value)\n    except ValueError:\n        return None\n",
        "structuredPatch": [
            {
                "oldStart": 6,
                "oldLines": 6,
                "newStart": 6,
                "newLines": 7,
                "lines": [
                    " from bs4 import BeautifulSoup, Tag",
                    " ",
                    " from src.domain.ports.html_parser_port import HtmlParserPort, ParsedDocument",
                    "+from src.domain.value_objects.normalized_url import normalize",
                    " from src.domain.value_objects.body_stats import BodyStats",
                    " from src.domain.value_objects.head_meta import HeadMeta",
                    " from src.domain.value_objects.hreflang import HreflangEntry"
                ]
            }
        ],
        "userModified": false,
        "replaceAll": false
    }
}
← Vorheriger | Zur Liste | Nächster →