{
"tool_response": {
"type": "update",
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/application\/use_cases\/process_url.py",
"content": "from __future__ import annotations\n\nfrom dataclasses import dataclass\nfrom datetime import datetime\n\nfrom src.application.use_cases.extract_seo_data import ExtractInput, ExtractSeoDataUseCase\nfrom src.domain.entities.image import Image\nfrom src.domain.entities.link import Link\nfrom src.domain.entities.page import Page\nfrom src.domain.entities.structured_data_item import StructuredDataItem\nfrom src.domain.entities.url import Url\nfrom src.domain.ports.clock_port import ClockPort\nfrom src.domain.ports.hreflang_repository_port import HreflangRepositoryPort\nfrom src.domain.ports.html_parser_port import HtmlParserPort, ParsedDocument\nfrom src.domain.ports.http_fetcher_port import HttpFetcherPort\nfrom src.domain.ports.image_repository_port import ImageRepositoryPort\nfrom src.domain.ports.link_repository_port import LinkRepositoryPort\nfrom src.domain.ports.page_repository_port import PageRepositoryPort\nfrom src.domain.ports.structured_data_parser_port import StructuredDataParserPort\nfrom src.domain.ports.structured_data_repository_port import StructuredDataRepositoryPort\nfrom src.domain.ports.url_queue_port import UrlQueuePort\nfrom src.domain.ports.url_repository_port import UrlRepositoryPort\nfrom src.domain.services.link_classifier import LinkClassifier\nfrom src.domain.services.url_normalizer import UrlNormalizer\nfrom src.domain.value_objects.enums import RenderMode\nfrom src.domain.value_objects.normalized_url import NormalizedUrl\n\n\n@dataclass(frozen=True, slots=True)\nclass ProcessUrlDeps:\n fetcher: HttpFetcherPort\n html_parser: HtmlParserPort\n sd_parser: StructuredDataParserPort\n pages: PageRepositoryPort\n urls: UrlRepositoryPort\n links: LinkRepositoryPort\n images: ImageRepositoryPort\n structured: StructuredDataRepositoryPort\n hreflang: HreflangRepositoryPort\n queue: UrlQueuePort\n classifier: LinkClassifier\n normalizer: UrlNormalizer\n extractor: ExtractSeoDataUseCase\n clock: ClockPort\n\n\n@dataclass(frozen=True, slots=True)\nclass _PageContext:\n crawl_id: int\n page_id: int\n source_url_id: int\n now: datetime\n\n\nclass ProcessUrlUseCase:\n def __init__(self, deps: ProcessUrlDeps) -> None:\n self._d = deps\n\n async def execute(self, *, crawl_id: int, url: NormalizedUrl) -> Page:\n now = self._d.clock.now()\n fetch = await self._d.fetcher.fetch(url)\n html = fetch.body.decode(\"utf-8\", errors=\"replace\")\n parsed = self._d.html_parser.parse(html, base_url=url.url)\n url_entity = _upsert_url(self._d.urls, url, now=now, is_internal=True)\n extracted = self._d.extractor.execute(\n ExtractInput(\n crawl_id=crawl_id, url_id=_id(url_entity), fetched_at=now,\n render_mode=RenderMode.HTTP, fetch=fetch, parsed=parsed, page_url=url.url,\n ),\n )\n page = self._d.pages.save(extracted.page)\n ctx = _PageContext(crawl_id, _id(page), _id(url_entity), now)\n _persist_links(self._d, ctx, parsed)\n _persist_images(self._d.images, ctx.page_id, parsed)\n self._d.hreflang.save_many(ctx.page_id, parsed.hreflang)\n _persist_structured_data(self._d, ctx.page_id, html, base_url=url.url)\n self._d.queue.mark_done(crawl_id, url)\n return page\n\n\ndef _upsert_url(\n repo: UrlRepositoryPort, normalized: NormalizedUrl, *, now: datetime, is_internal: bool,\n) -> Url:\n return repo.upsert(\n Url(\n id=None, normalized=normalized, is_internal=is_internal,\n first_seen_at=now, last_seen_at=now,\n ),\n )\n\n\ndef _persist_links(deps: ProcessUrlDeps, ctx: _PageContext, parsed: ParsedDocument) -> None:\n links: list[Link] = []\n for raw in parsed.internal_links:\n normalized = deps.normalizer.normalize(raw)\n target = _upsert_url(deps.urls, normalized, now=ctx.now, is_internal=True)\n links.append(_make_link(ctx, normalized.url, _id(target), is_internal=True))\n deps.queue.enqueue(ctx.crawl_id, normalized)\n for raw in parsed.external_links:\n normalized = deps.normalizer.normalize(raw)\n links.append(_make_link(ctx, normalized.url, None, is_internal=False))\n deps.links.save_many(links)\n\n\ndef _persist_images(repo: ImageRepositoryPort, page_id: int, parsed: ParsedDocument) -> None:\n imgs = [\n Image(id=None, page_id=page_id, src=s, alt=a, title=t, width=w, height=h,\n loading_attr=None, decoding_attr=None, http_status=None, content_length=None)\n for s, a, t, w, h in parsed.image_specs\n ]\n repo.save_many(imgs)\n\n\ndef _persist_structured_data(\n deps: ProcessUrlDeps, page_id: int, html: str, *, base_url: str,\n) -> None:\n items = deps.sd_parser.parse(html, base_url=base_url)\n sd = [\n StructuredDataItem(id=None, page_id=page_id, format=f, schema_type=t, payload=p)\n for f, t, p in items\n ]\n deps.structured.save_many(sd)\n\n\ndef _id(entity: Page | Url) -> int:\n if entity.id is None:\n raise RuntimeError(f\"{type(entity).__name__} has no id after save\")\n return entity.id\n\n\ndef _make_link(\n ctx: _PageContext, target_url: str, target_url_id: int | None, *, is_internal: bool,\n) -> Link:\n return Link(\n id=None, page_id=ctx.page_id, source_url_id=ctx.source_url_id, target_url=target_url,\n target_url_id=target_url_id, anchor_text=None, rel=frozenset(),\n is_internal=is_internal, is_external=not is_internal, position_hint=None,\n )\n",
"structuredPatch": [
{
"oldStart": 45,
"oldLines": 10,
"newStart": 45,
"newLines": 10,
"lines": [
" ",
" ",
" @dataclass(frozen=True, slots=True)",
"-class _LinksContext:",
"+class _PageContext:",
"+ crawl_id: int",
" page_id: int",
" source_url_id: int",
"- crawl_id: int",
" now: datetime",
" ",
" "
]
},
{
"oldStart": 59,
"oldLines": 100,
"newStart": 59,
"newLines": 69,
"lines": [
" async def execute(self, *, crawl_id: int, url: NormalizedUrl) -> Page:",
" now = self._d.clock.now()",
" fetch = await self._d.fetcher.fetch(url)",
"- parsed = self._d.html_parser.parse(",
"- fetch.body.decode(\"utf-8\", errors=\"replace\"),",
"- base_url=url.url,",
"- )",
"- url_entity = self._upsert_url(url, now=now, is_internal=True)",
"+ html = fetch.body.decode(\"utf-8\", errors=\"replace\")",
"+ parsed = self._d.html_parser.parse(html, base_url=url.url)",
"+ url_entity = _upsert_url(self._d.urls, url, now=now, is_internal=True)",
" extracted = self._d.extractor.execute(",
" ExtractInput(",
"- crawl_id=crawl_id,",
"- url_id=_id(url_entity),",
"- fetched_at=now,",
"- render_mode=RenderMode.HTTP,",
"- fetch=fetch,",
"- parsed=parsed,",
"- page_url=url.url,",
"+ crawl_id=crawl_id, url_id=_id(url_entity), fetched_at=now,",
"+ render_mode=RenderMode.HTTP, fetch=fetch, parsed=parsed, page_url=url.url,",
" ),",
" )",
" page = self._d.pages.save(extracted.page)",
"- ctx = _LinksContext(",
"- page_id=_id(page), source_url_id=_id(url_entity), crawl_id=crawl_id, now=now",
"- )",
"- self._persist_links(ctx, parsed=parsed)",
"- self._persist_images(page_id=_id(page), parsed=parsed)",
"- self._d.hreflang.save_many(_id(page), parsed.hreflang)",
"- self._persist_structured_data(page_id=_id(page), html=fetch.body, base_url=url.url)",
"+ ctx = _PageContext(crawl_id, _id(page), _id(url_entity), now)",
"+ _persist_links(self._d, ctx, parsed)",
"+ _persist_images(self._d.images, ctx.page_id, parsed)",
"+ self._d.hreflang.save_many(ctx.page_id, parsed.hreflang)",
"+ _persist_structured_data(self._d, ctx.page_id, html, base_url=url.url)",
" self._d.queue.mark_done(crawl_id, url)",
" return page",
" ",
"- def _upsert_url(self, normalized: NormalizedUrl, *, now: datetime, is_internal: bool) -> Url:",
"- return self._d.urls.upsert(",
"- Url(",
"- id=None,",
"- normalized=normalized,",
"- is_internal=is_internal,",
"- first_seen_at=now,",
"- last_seen_at=now,",
"- )",
"- )",
" ",
"- def _persist_links(self, ctx: _LinksContext, *, parsed: ParsedDocument) -> None:",
"- links: list[Link] = []",
"- for raw in parsed.internal_links:",
"- normalized = self._d.normalizer.normalize(raw)",
"- target = self._upsert_url(normalized, now=ctx.now, is_internal=True)",
"- links.append(",
"- _make_link(",
"- ctx.page_id,",
"- ctx.source_url_id,",
"- normalized.url,",
"- target_url_id=_id(target),",
"- is_internal=True,",
"- )",
"- )",
"- self._d.queue.enqueue(ctx.crawl_id, normalized)",
"- for raw in parsed.external_links:",
"- normalized = self._d.normalizer.normalize(raw)",
"- links.append(",
"- _make_link(",
"- ctx.page_id,",
"- ctx.source_url_id,",
"- normalized.url,",
"- target_url_id=None,",
"- is_internal=False,",
"- )",
"- )",
"- self._d.links.save_many(links)",
"+def _upsert_url(",
"+ repo: UrlRepositoryPort, normalized: NormalizedUrl, *, now: datetime, is_internal: bool,",
"+) -> Url:",
"+ return repo.upsert(",
"+ Url(",
"+ id=None, normalized=normalized, is_internal=is_internal,",
"+ first_seen_at=now, last_seen_at=now,",
"+ ),",
"+ )",
" ",
"- def _persist_images(self, *, page_id: int, parsed: ParsedDocument) -> None:",
"- imgs = [",
"- Image(",
"- id=None,",
"- page_id=page_id,",
"- src=s,",
"- alt=a,",
"- title=t,",
"- width=w,",
"- height=h,",
"- loading_attr=None,",
"- decoding_attr=None,",
"- http_status=None,",
"- content_length=None,",
"- )",
"- for s, a, t, w, h in parsed.image_specs",
"- ]",
"- self._d.images.save_many(imgs)",
" ",
"- def _persist_structured_data(self, *, page_id: int, html: bytes, base_url: str) -> None:",
"- items = self._d.sd_parser.parse(html.decode(\"utf-8\", errors=\"replace\"), base_url=base_url)",
"- sd = [",
"- StructuredDataItem(id=None, page_id=page_id, format=f, schema_type=t, payload=p)",
"- for f, t, p in items",
"- ]",
"- self._d.structured.save_many(sd)",
"+def _persist_links(deps: ProcessUrlDeps, ctx: _PageContext, parsed: ParsedDocument) -> None:",
"+ links: list[Link] = []",
"+ for raw in parsed.internal_links:",
"+ normalized = deps.normalizer.normalize(raw)",
"+ target = _upsert_url(deps.urls, normalized, now=ctx.now, is_internal=True)",
"+ links.append(_make_link(ctx, normalized.url, _id(target), is_internal=True))",
"+ deps.queue.enqueue(ctx.crawl_id, normalized)",
"+ for raw in parsed.external_links:",
"+ normalized = deps.normalizer.normalize(raw)",
"+ links.append(_make_link(ctx, normalized.url, None, is_internal=False))",
"+ deps.links.save_many(links)",
" ",
" ",
"+def _persist_images(repo: ImageRepositoryPort, page_id: int, parsed: ParsedDocument) -> None:",
"+ imgs = [",
"+ Image(id=None, page_id=page_id, src=s, alt=a, title=t, width=w, height=h,",
"+ loading_attr=None, decoding_attr=None, http_status=None, content_length=None)",
"+ for s, a, t, w, h in parsed.image_specs",
"+ ]",
"+ repo.save_many(imgs)",
"+",
"+",
"+def _persist_structured_data(",
"+ deps: ProcessUrlDeps, page_id: int, html: str, *, base_url: str,",
"+) -> None:",
"+ items = deps.sd_parser.parse(html, base_url=base_url)",
"+ sd = [",
"+ StructuredDataItem(id=None, page_id=page_id, format=f, schema_type=t, payload=p)",
"+ for f, t, p in items",
"+ ]",
"+ deps.structured.save_many(sd)",
"+",
"+",
" def _id(entity: Page | Url) -> int:",
" if entity.id is None:",
" raise RuntimeError(f\"{type(entity).__name__} has no id after save\")"
]
},
{
"oldStart": 160,
"oldLines": 22,
"newStart": 129,
"newLines": 10,
"lines": [
" ",
" ",
" def _make_link(",
"- page_id: int,",
"- source_url_id: int,",
"- target_url: str,",
"- *,",
"- target_url_id: int | None,",
"- is_internal: bool,",
"+ ctx: _PageContext, target_url: str, target_url_id: int | None, *, is_internal: bool,",
" ) -> Link:",
" return Link(",
"- id=None,",
"- page_id=page_id,",
"- source_url_id=source_url_id,",
"- target_url=target_url,",
"- target_url_id=target_url_id,",
"- anchor_text=None,",
"- rel=frozenset(),",
"- is_internal=is_internal,",
"- is_external=not is_internal,",
"- position_hint=None,",
"+ id=None, page_id=ctx.page_id, source_url_id=ctx.source_url_id, target_url=target_url,",
"+ target_url_id=target_url_id, anchor_text=None, rel=frozenset(),",
"+ is_internal=is_internal, is_external=not is_internal, position_hint=None,",
" )"
]
}
],
"originalFile": "from __future__ import annotations\n\nfrom dataclasses import dataclass\nfrom datetime import datetime\n\nfrom src.application.use_cases.extract_seo_data import ExtractInput, ExtractSeoDataUseCase\nfrom src.domain.entities.image import Image\nfrom src.domain.entities.link import Link\nfrom src.domain.entities.page import Page\nfrom src.domain.entities.structured_data_item import StructuredDataItem\nfrom src.domain.entities.url import Url\nfrom src.domain.ports.clock_port import ClockPort\nfrom src.domain.ports.hreflang_repository_port import HreflangRepositoryPort\nfrom src.domain.ports.html_parser_port import HtmlParserPort, ParsedDocument\nfrom src.domain.ports.http_fetcher_port import HttpFetcherPort\nfrom src.domain.ports.image_repository_port import ImageRepositoryPort\nfrom src.domain.ports.link_repository_port import LinkRepositoryPort\nfrom src.domain.ports.page_repository_port import PageRepositoryPort\nfrom src.domain.ports.structured_data_parser_port import StructuredDataParserPort\nfrom src.domain.ports.structured_data_repository_port import StructuredDataRepositoryPort\nfrom src.domain.ports.url_queue_port import UrlQueuePort\nfrom src.domain.ports.url_repository_port import UrlRepositoryPort\nfrom src.domain.services.link_classifier import LinkClassifier\nfrom src.domain.services.url_normalizer import UrlNormalizer\nfrom src.domain.value_objects.enums import RenderMode\nfrom src.domain.value_objects.normalized_url import NormalizedUrl\n\n\n@dataclass(frozen=True, slots=True)\nclass ProcessUrlDeps:\n fetcher: HttpFetcherPort\n html_parser: HtmlParserPort\n sd_parser: StructuredDataParserPort\n pages: PageRepositoryPort\n urls: UrlRepositoryPort\n links: LinkRepositoryPort\n images: ImageRepositoryPort\n structured: StructuredDataRepositoryPort\n hreflang: HreflangRepositoryPort\n queue: UrlQueuePort\n classifier: LinkClassifier\n normalizer: UrlNormalizer\n extractor: ExtractSeoDataUseCase\n clock: ClockPort\n\n\n@dataclass(frozen=True, slots=True)\nclass _LinksContext:\n page_id: int\n source_url_id: int\n crawl_id: int\n now: datetime\n\n\nclass ProcessUrlUseCase:\n def __init__(self, deps: ProcessUrlDeps) -> None:\n self._d = deps\n\n async def execute(self, *, crawl_id: int, url: NormalizedUrl) -> Page:\n now = self._d.clock.now()\n fetch = await self._d.fetcher.fetch(url)\n parsed = self._d.html_parser.parse(\n fetch.body.decode(\"utf-8\", errors=\"replace\"),\n base_url=url.url,\n )\n url_entity = self._upsert_url(url, now=now, is_internal=True)\n extracted = self._d.extractor.execute(\n ExtractInput(\n crawl_id=crawl_id,\n url_id=_id(url_entity),\n fetched_at=now,\n render_mode=RenderMode.HTTP,\n fetch=fetch,\n parsed=parsed,\n page_url=url.url,\n ),\n )\n page = self._d.pages.save(extracted.page)\n ctx = _LinksContext(\n page_id=_id(page), source_url_id=_id(url_entity), crawl_id=crawl_id, now=now\n )\n self._persist_links(ctx, parsed=parsed)\n self._persist_images(page_id=_id(page), parsed=parsed)\n self._d.hreflang.save_many(_id(page), parsed.hreflang)\n self._persist_structured_data(page_id=_id(page), html=fetch.body, base_url=url.url)\n self._d.queue.mark_done(crawl_id, url)\n return page\n\n def _upsert_url(self, normalized: NormalizedUrl, *, now: datetime, is_internal: bool) -> Url:\n return self._d.urls.upsert(\n Url(\n id=None,\n normalized=normalized,\n is_internal=is_internal,\n first_seen_at=now,\n last_seen_at=now,\n )\n )\n\n def _persist_links(self, ctx: _LinksContext, *, parsed: ParsedDocument) -> None:\n links: list[Link] = []\n for raw in parsed.internal_links:\n normalized = self._d.normalizer.normalize(raw)\n target = self._upsert_url(normalized, now=ctx.now, is_internal=True)\n links.append(\n _make_link(\n ctx.page_id,\n ctx.source_url_id,\n normalized.url,\n target_url_id=_id(target),\n is_internal=True,\n )\n )\n self._d.queue.enqueue(ctx.crawl_id, normalized)\n for raw in parsed.external_links:\n normalized = self._d.normalizer.normalize(raw)\n links.append(\n _make_link(\n ctx.page_id,\n ctx.source_url_id,\n normalized.url,\n target_url_id=None,\n is_internal=False,\n )\n )\n self._d.links.save_many(links)\n\n def _persist_images(self, *, page_id: int, parsed: ParsedDocument) -> None:\n imgs = [\n Image(\n id=None,\n page_id=page_id,\n src=s,\n alt=a,\n title=t,\n width=w,\n height=h,\n loading_attr=None,\n decoding_attr=None,\n http_status=None,\n content_length=None,\n )\n for s, a, t, w, h in parsed.image_specs\n ]\n self._d.images.save_many(imgs)\n\n def _persist_structured_data(self, *, page_id: int, html: bytes, base_url: str) -> None:\n items = self._d.sd_parser.parse(html.decode(\"utf-8\", errors=\"replace\"), base_url=base_url)\n sd = [\n StructuredDataItem(id=None, page_id=page_id, format=f, schema_type=t, payload=p)\n for f, t, p in items\n ]\n self._d.structured.save_many(sd)\n\n\ndef _id(entity: Page | Url) -> int:\n if entity.id is None:\n raise RuntimeError(f\"{type(entity).__name__} has no id after save\")\n return entity.id\n\n\ndef _make_link(\n page_id: int,\n source_url_id: int,\n target_url: str,\n *,\n target_url_id: int | None,\n is_internal: bool,\n) -> Link:\n return Link(\n id=None,\n page_id=page_id,\n source_url_id=source_url_id,\n target_url=target_url,\n target_url_id=target_url_id,\n anchor_text=None,\n rel=frozenset(),\n is_internal=is_internal,\n is_external=not is_internal,\n position_hint=None,\n )\n",
"userModified": false
}
}