{
  "event": "PreToolUse",
  "tool_name": "Write",
  "tool_input": {
    "file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/tests\/unit\/application\/test_process_url.py",
    "content": "from __future__ import annotations\n\nfrom collections.abc import Mapping\nfrom datetime import datetime\n\nimport pytest\n\nfrom src.application.use_cases.extract_seo_data import ExtractSeoDataUseCase\nfrom src.application.use_cases.process_url import ProcessUrlDeps, ProcessUrlUseCase\nfrom src.domain.ports.html_parser_port import HtmlParserPort, ParsedDocument\nfrom src.domain.ports.http_fetcher_port import FetchResult, HttpFetcherPort\nfrom src.domain.ports.structured_data_parser_port import StructuredDataParserPort\nfrom src.domain.services.link_classifier import LinkClassifier\nfrom src.domain.services.quality_flag_calculator import QualityFlagCalculator\nfrom src.domain.services.url_normalizer import UrlNormalizer\nfrom src.domain.value_objects.body_stats import BodyStats\nfrom src.domain.value_objects.enums import StructuredDataFormat\nfrom src.domain.value_objects.head_meta import HeadMeta\nfrom src.domain.value_objects.normalized_url import NormalizedUrl, normalize\nfrom src.domain.value_objects.open_graph_meta import OpenGraphMeta\nfrom src.domain.value_objects.twitter_meta import TwitterMeta\nfrom tests.unit.application.fakes import (\n    FakeClock,\n    FakeHreflangRepo,\n    FakeImageRepo,\n    FakeLinkRepo,\n    FakePageRepo,\n    FakeStructuredDataRepo,\n    FakeUrlQueue,\n    FakeUrlRepo,\n)\n\n\nclass _StubFetcher(HttpFetcherPort):\n    async def fetch(self, url: NormalizedUrl) -> FetchResult:\n        headers: Mapping[str, str] = {\"content-type\": \"text\/html\"}\n        return FetchResult(url.url, 200, (), headers, b\"<html><\/html>\", 5, None, None)\n\n\nclass _StubHtmlParser(HtmlParserPort):\n    def parse(self, html: str, *, base_url: str) -> ParsedDocument:\n        _ = html, base_url\n        return ParsedDocument(\n            head=HeadMeta(\"T\" * 40, \"D\" * 80, None, None, base_url, \"de\", \"utf-8\", None),\n            og=OpenGraphMeta(None, None, None, None, None, None, None),\n            twitter=TwitterMeta(None, None, None, None),\n            body=BodyStats((\"H\",), (), 0, 0, 0, 0, 100, None),\n            hreflang=(),\n            internal_links=(\"https:\/\/example.com\/inner\",),\n            external_links=(\"https:\/\/ext\/\",),\n            image_specs=((\"\/img.png\", \"alt\", None, 10, 10),),\n        )\n\n\nclass _StubSdParser(StructuredDataParserPort):\n    def parse(\n        self, html: str, *, base_url: str,\n    ) -> list[tuple[StructuredDataFormat, str | None, Mapping[str, object]]]:\n        _ = html, base_url\n        return []\n\n\n@pytest.mark.asyncio\nasync def test_process_url_persists_page_and_discovers_links() -> None:\n    pages = FakePageRepo()\n    urls = FakeUrlRepo()\n    links = FakeLinkRepo()\n    images = FakeImageRepo()\n    structured = FakeStructuredDataRepo()\n    hreflang = FakeHreflangRepo()\n    queue = FakeUrlQueue()\n    deps = ProcessUrlDeps(\n        fetcher=_StubFetcher(),\n        html_parser=_StubHtmlParser(),\n        sd_parser=_StubSdParser(),\n        pages=pages,\n        urls=urls,\n        links=links,\n        images=images,\n        structured=structured,\n        hreflang=hreflang,\n        queue=queue,\n        classifier=LinkClassifier(registered_domain=\"example.com\"),\n        normalizer=UrlNormalizer(),\n        extractor=ExtractSeoDataUseCase(calculator=QualityFlagCalculator()),\n        clock=FakeClock(datetime(2026, 4, 21, 12, 0)),\n    )\n    uc = ProcessUrlUseCase(deps)\n    start = normalize(\"https:\/\/example.com\/\")\n\n    page = await uc.execute(crawl_id=1, url=start)\n\n    assert page.id == 1\n    assert len(links.store) == 2\n    assert len(images.store) == 1\n    assert queue.pending_count(1) == 1\n    assert (1, start) in queue.done\n"
  }
}
{
  "tool_response": {
    "type": "create",
    "filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/tests\/unit\/application\/test_process_url.py",
    "content": "from __future__ import annotations\n\nfrom collections.abc import Mapping\nfrom datetime import datetime\n\nimport pytest\n\nfrom src.application.use_cases.extract_seo_data import ExtractSeoDataUseCase\nfrom src.application.use_cases.process_url import ProcessUrlDeps, ProcessUrlUseCase\nfrom src.domain.ports.html_parser_port import HtmlParserPort, ParsedDocument\nfrom src.domain.ports.http_fetcher_port import FetchResult, HttpFetcherPort\nfrom src.domain.ports.structured_data_parser_port import StructuredDataParserPort\nfrom src.domain.services.link_classifier import LinkClassifier\nfrom src.domain.services.quality_flag_calculator import QualityFlagCalculator\nfrom src.domain.services.url_normalizer import UrlNormalizer\nfrom src.domain.value_objects.body_stats import BodyStats\nfrom src.domain.value_objects.enums import StructuredDataFormat\nfrom src.domain.value_objects.head_meta import HeadMeta\nfrom src.domain.value_objects.normalized_url import NormalizedUrl, normalize\nfrom src.domain.value_objects.open_graph_meta import OpenGraphMeta\nfrom src.domain.value_objects.twitter_meta import TwitterMeta\nfrom tests.unit.application.fakes import (\n    FakeClock,\n    FakeHreflangRepo,\n    FakeImageRepo,\n    FakeLinkRepo,\n    FakePageRepo,\n    FakeStructuredDataRepo,\n    FakeUrlQueue,\n    FakeUrlRepo,\n)\n\n\nclass _StubFetcher(HttpFetcherPort):\n    async def fetch(self, url: NormalizedUrl) -> FetchResult:\n        headers: Mapping[str, str] = {\"content-type\": \"text\/html\"}\n        return FetchResult(url.url, 200, (), headers, b\"<html><\/html>\", 5, None, None)\n\n\nclass _StubHtmlParser(HtmlParserPort):\n    def parse(self, html: str, *, base_url: str) -> ParsedDocument:\n        _ = html, base_url\n        return ParsedDocument(\n            head=HeadMeta(\"T\" * 40, \"D\" * 80, None, None, base_url, \"de\", \"utf-8\", None),\n            og=OpenGraphMeta(None, None, None, None, None, None, None),\n            twitter=TwitterMeta(None, None, None, None),\n            body=BodyStats((\"H\",), (), 0, 0, 0, 0, 100, None),\n            hreflang=(),\n            internal_links=(\"https:\/\/example.com\/inner\",),\n            external_links=(\"https:\/\/ext\/\",),\n            image_specs=((\"\/img.png\", \"alt\", None, 10, 10),),\n        )\n\n\nclass _StubSdParser(StructuredDataParserPort):\n    def parse(\n        self, html: str, *, base_url: str,\n    ) -> list[tuple[StructuredDataFormat, str | None, Mapping[str, object]]]:\n        _ = html, base_url\n        return []\n\n\n@pytest.mark.asyncio\nasync def test_process_url_persists_page_and_discovers_links() -> None:\n    pages = FakePageRepo()\n    urls = FakeUrlRepo()\n    links = FakeLinkRepo()\n    images = FakeImageRepo()\n    structured = FakeStructuredDataRepo()\n    hreflang = FakeHreflangRepo()\n    queue = FakeUrlQueue()\n    deps = ProcessUrlDeps(\n        fetcher=_StubFetcher(),\n        html_parser=_StubHtmlParser(),\n        sd_parser=_StubSdParser(),\n        pages=pages,\n        urls=urls,\n        links=links,\n        images=images,\n        structured=structured,\n        hreflang=hreflang,\n        queue=queue,\n        classifier=LinkClassifier(registered_domain=\"example.com\"),\n        normalizer=UrlNormalizer(),\n        extractor=ExtractSeoDataUseCase(calculator=QualityFlagCalculator()),\n        clock=FakeClock(datetime(2026, 4, 21, 12, 0)),\n    )\n    uc = ProcessUrlUseCase(deps)\n    start = normalize(\"https:\/\/example.com\/\")\n\n    page = await uc.execute(crawl_id=1, url=start)\n\n    assert page.id == 1\n    assert len(links.store) == 2\n    assert len(images.store) == 1\n    assert queue.pending_count(1) == 1\n    assert (1, start) in queue.done\n",
    "structuredPatch": [],
    "originalFile": null,
    "userModified": false
  }
}