{
"tool_response": {
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/infrastructure\/parsers\/html_parser_bs4.py",
"oldString": "def _parse_links(\n soup: BeautifulSoup, *, base_url: str,\n) -> tuple[tuple[str, ...], tuple[str, ...]]:\n from src.domain.value_objects.normalized_url import normalize\n\n internal: list[str] = []",
"newString": "def _parse_links(\n soup: BeautifulSoup, *, base_url: str,\n) -> tuple[tuple[str, ...], tuple[str, ...]]:\n internal: list[str] = []",
"originalFile": "from __future__ import annotations\n\nimport hashlib\nfrom urllib.parse import urljoin\n\nfrom bs4 import BeautifulSoup, Tag\n\nfrom src.domain.ports.html_parser_port import HtmlParserPort, ParsedDocument\nfrom src.domain.value_objects.normalized_url import normalize\nfrom src.domain.value_objects.body_stats import BodyStats\nfrom src.domain.value_objects.head_meta import HeadMeta\nfrom src.domain.value_objects.hreflang import HreflangEntry\nfrom src.domain.value_objects.meta_robots import parse_meta_robots\nfrom src.domain.value_objects.open_graph_meta import OpenGraphMeta\nfrom src.domain.value_objects.twitter_meta import TwitterMeta\n\n_ImageSpec = tuple[str, str | None, str | None, int | None, int | None]\n\n\nclass Bs4HtmlParser(HtmlParserPort):\n def parse(self, html: str, *, base_url: str) -> ParsedDocument:\n soup = BeautifulSoup(html, \"lxml\")\n head = _parse_head(soup)\n og = _parse_og(soup)\n twitter = _parse_twitter(soup)\n body = _parse_body(soup)\n hreflang = _parse_hreflang(soup)\n internal, external = _parse_links(soup, base_url=base_url)\n images = _parse_images(soup, base_url=base_url)\n return ParsedDocument(\n head=head, og=og, twitter=twitter, body=body, hreflang=hreflang,\n internal_links=internal, external_links=external, image_specs=images,\n )\n\n\ndef _meta(soup: BeautifulSoup, name: str) -> str | None:\n tag = soup.find(\"meta\", attrs={\"name\": name})\n return _content(tag)\n\n\ndef _property(soup: BeautifulSoup, prop: str) -> str | None:\n tag = soup.find(\"meta\", attrs={\"property\": prop})\n return _content(tag)\n\n\ndef _content(tag: object) -> str | None:\n if isinstance(tag, Tag):\n value = tag.get(\"content\")\n if isinstance(value, str):\n return value\n return None\n\n\ndef _parse_head(soup: BeautifulSoup) -> HeadMeta:\n title_tag = soup.find(\"title\")\n canonical = soup.find(\"link\", rel=\"canonical\")\n canonical_href = canonical.get(\"href\") if isinstance(canonical, Tag) else None\n lang_tag = soup.find(\"html\")\n lang = lang_tag.get(\"lang\") if isinstance(lang_tag, Tag) else None\n charset_tag = soup.find(\"meta\", attrs={\"charset\": True})\n charset = charset_tag.get(\"charset\") if isinstance(charset_tag, Tag) else None\n return HeadMeta(\n title=title_tag.get_text(strip=True) if title_tag else None,\n description=_meta(soup, \"description\"),\n keywords=_meta(soup, \"keywords\"),\n robots=parse_meta_robots(_meta(soup, \"robots\")),\n canonical=str(canonical_href) if canonical_href else None,\n lang=str(lang) if lang else None,\n charset=str(charset) if charset else None,\n viewport=_meta(soup, \"viewport\"),\n )\n\n\ndef _parse_og(soup: BeautifulSoup) -> OpenGraphMeta:\n return OpenGraphMeta(\n title=_property(soup, \"og:title\"),\n description=_property(soup, \"og:description\"),\n image=_property(soup, \"og:image\"),\n og_type=_property(soup, \"og:type\"),\n url=_property(soup, \"og:url\"),\n site_name=_property(soup, \"og:site_name\"),\n locale=_property(soup, \"og:locale\"),\n )\n\n\ndef _parse_twitter(soup: BeautifulSoup) -> TwitterMeta:\n return TwitterMeta(\n card=_meta(soup, \"twitter:card\"),\n title=_meta(soup, \"twitter:title\"),\n description=_meta(soup, \"twitter:description\"),\n image=_meta(soup, \"twitter:image\"),\n )\n\n\ndef _parse_body(soup: BeautifulSoup) -> BodyStats:\n h1 = tuple(t.get_text(strip=True) for t in soup.find_all(\"h1\"))\n h2 = tuple(t.get_text(strip=True) for t in soup.find_all(\"h2\"))\n text = soup.get_text(separator=\" \", strip=True)\n words = text.split()\n text_hash = hashlib.sha256(text.encode(\"utf-8\")).hexdigest() if text else None\n return BodyStats(\n h1_texts=h1, h2_texts=h2,\n h3_count=len(soup.find_all(\"h3\")),\n h4_count=len(soup.find_all(\"h4\")),\n h5_count=len(soup.find_all(\"h5\")),\n h6_count=len(soup.find_all(\"h6\")),\n word_count=len(words), text_hash=text_hash,\n )\n\n\ndef _parse_hreflang(soup: BeautifulSoup) -> tuple[HreflangEntry, ...]:\n result: list[HreflangEntry] = []\n for link in soup.find_all(\"link\", rel=\"alternate\"):\n if not isinstance(link, Tag):\n continue\n lang = link.get(\"hreflang\")\n href = link.get(\"href\")\n if isinstance(lang, str) and isinstance(href, str):\n result.append(HreflangEntry(lang=lang, href=href))\n return tuple(result)\n\n\ndef _parse_links(\n soup: BeautifulSoup, *, base_url: str,\n) -> tuple[tuple[str, ...], tuple[str, ...]]:\n from src.domain.value_objects.normalized_url import normalize\n\n internal: list[str] = []\n external: list[str] = []\n base_host = normalize(base_url).host\n for a in soup.find_all(\"a\", href=True):\n if not isinstance(a, Tag):\n continue\n href = a.get(\"href\")\n if not isinstance(href, str):\n continue\n absolute = urljoin(base_url, href)\n if absolute.startswith((\"mailto:\", \"tel:\", \"javascript:\")):\n continue\n host = normalize(absolute).host\n if host == base_host or host.endswith(\".\" + base_host):\n internal.append(absolute)\n else:\n external.append(absolute)\n return tuple(internal), tuple(external)\n\n\ndef _parse_images(\n soup: BeautifulSoup, *, base_url: str,\n) -> tuple[_ImageSpec, ...]:\n result: list[_ImageSpec] = []\n for img in soup.find_all(\"img\"):\n if not isinstance(img, Tag):\n continue\n src = img.get(\"src\")\n if not isinstance(src, str):\n continue\n result.append((\n urljoin(base_url, src),\n _str_or_none(img.get(\"alt\")),\n _str_or_none(img.get(\"title\")),\n _int_or_none(img.get(\"width\")),\n _int_or_none(img.get(\"height\")),\n ))\n return tuple(result)\n\n\ndef _str_or_none(value: object) -> str | None:\n return value if isinstance(value, str) else None\n\n\ndef _int_or_none(value: object) -> int | None:\n if not isinstance(value, str):\n return None\n try:\n return int(value)\n except ValueError:\n return None\n",
"structuredPatch": [
{
"oldStart": 123,
"oldLines": 8,
"newStart": 123,
"newLines": 6,
"lines": [
" def _parse_links(",
" soup: BeautifulSoup, *, base_url: str,",
" ) -> tuple[tuple[str, ...], tuple[str, ...]]:",
"- from src.domain.value_objects.normalized_url import normalize",
"-",
" internal: list[str] = []",
" external: list[str] = []",
" base_host = normalize(base_url).host"
]
}
],
"userModified": false,
"replaceAll": false
}
}