{
"tool_response": {
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/application\/use_cases\/process_url.py",
"oldString": "from src.domain.ports.image_repository_port import ImageRepositoryPort\nfrom src.domain.ports.link_repository_port import LinkRepositoryPort\nfrom src.domain.ports.page_repository_port import PageRepositoryPort\nfrom src.domain.ports.structured_data_parser_port import StructuredDataParserPort\nfrom src.domain.ports.structured_data_repository_port import StructuredDataRepositoryPort\nfrom src.domain.ports.url_queue_port import UrlQueuePort\nfrom src.domain.ports.url_repository_port import UrlRepositoryPort\nfrom src.domain.services.link_classifier import LinkClassifier\nfrom src.domain.services.url_normalizer import UrlNormalizer",
"newString": "from src.domain.ports.image_repository_port import ImageRepositoryPort\nfrom src.domain.ports.link_repository_port import LinkRepositoryPort\nfrom src.domain.ports.page_repository_port import PageRepositoryPort\nfrom src.domain.ports.page_token_repository_port import PageTokenRepositoryPort\nfrom src.domain.ports.structured_data_parser_port import StructuredDataParserPort\nfrom src.domain.ports.structured_data_repository_port import StructuredDataRepositoryPort\nfrom src.domain.ports.url_queue_port import UrlQueuePort\nfrom src.domain.ports.url_repository_port import UrlRepositoryPort\nfrom src.domain.services.link_classifier import LinkClassifier\nfrom src.domain.services.text_tokenizer import TextTokenizer\nfrom src.domain.services.url_normalizer import UrlNormalizer",
"originalFile": "from __future__ import annotations\n\nfrom dataclasses import dataclass\nfrom datetime import datetime\n\nfrom src.application.use_cases.extract_seo_data import ExtractInput, ExtractSeoDataUseCase\nfrom src.domain.entities.image import Image\nfrom src.domain.entities.link import Link\nfrom src.domain.entities.page import Page\nfrom src.domain.entities.structured_data_item import StructuredDataItem\nfrom src.domain.entities.url import Url\nfrom src.domain.ports.clock_port import ClockPort\nfrom src.domain.ports.hreflang_repository_port import HreflangRepositoryPort\nfrom src.domain.ports.html_parser_port import HtmlParserPort, ParsedDocument\nfrom src.domain.ports.http_fetcher_port import HttpFetcherPort\nfrom src.domain.ports.image_repository_port import ImageRepositoryPort\nfrom src.domain.ports.link_repository_port import LinkRepositoryPort\nfrom src.domain.ports.page_repository_port import PageRepositoryPort\nfrom src.domain.ports.structured_data_parser_port import StructuredDataParserPort\nfrom src.domain.ports.structured_data_repository_port import StructuredDataRepositoryPort\nfrom src.domain.ports.url_queue_port import UrlQueuePort\nfrom src.domain.ports.url_repository_port import UrlRepositoryPort\nfrom src.domain.services.link_classifier import LinkClassifier\nfrom src.domain.services.url_normalizer import UrlNormalizer\nfrom src.domain.value_objects.enums import RenderMode\nfrom src.domain.value_objects.normalized_url import NormalizedUrl\n\n\n@dataclass(frozen=True, slots=True)\nclass ProcessUrlDeps:\n fetcher: HttpFetcherPort\n html_parser: HtmlParserPort\n sd_parser: StructuredDataParserPort\n pages: PageRepositoryPort\n urls: UrlRepositoryPort\n links: LinkRepositoryPort\n images: ImageRepositoryPort\n structured: StructuredDataRepositoryPort\n hreflang: HreflangRepositoryPort\n queue: UrlQueuePort\n classifier: LinkClassifier\n normalizer: UrlNormalizer\n extractor: ExtractSeoDataUseCase\n clock: ClockPort\n render_mode: RenderMode = RenderMode.HTTP\n\n\n@dataclass(frozen=True, slots=True)\nclass _PageContext:\n crawl_id: int\n page_id: int\n source_url_id: int\n now: datetime\n\n\nclass ProcessUrlUseCase:\n def __init__(self, deps: ProcessUrlDeps) -> None:\n self._d = deps\n\n async def execute(self, *, crawl_id: int, url: NormalizedUrl) -> Page:\n now = self._d.clock.now()\n fetch = await self._d.fetcher.fetch(url)\n html = fetch.body.decode(\"utf-8\", errors=\"replace\")\n parsed = self._d.html_parser.parse(html, base_url=url.url)\n url_entity = _upsert_url(self._d.urls, url, now=now, is_internal=True)\n extracted = self._d.extractor.execute(\n ExtractInput(\n crawl_id=crawl_id,\n url_id=_id(url_entity),\n fetched_at=now,\n render_mode=self._d.render_mode,\n fetch=fetch,\n parsed=parsed,\n page_url=url.url,\n ),\n )\n page = self._d.pages.save(extracted.page)\n ctx = _PageContext(crawl_id, _id(page), _id(url_entity), now)\n _persist_links(self._d, ctx, parsed)\n _persist_images(self._d.images, ctx.page_id, parsed)\n self._d.hreflang.save_many(ctx.page_id, parsed.hreflang)\n _persist_structured_data(self._d, ctx.page_id, html, base_url=url.url)\n self._d.queue.mark_done(crawl_id, url)\n return page\n\n\ndef _upsert_url(\n repo: UrlRepositoryPort,\n normalized: NormalizedUrl,\n *,\n now: datetime,\n is_internal: bool,\n) -> Url:\n return repo.upsert(\n Url(\n id=None,\n normalized=normalized,\n is_internal=is_internal,\n first_seen_at=now,\n last_seen_at=now,\n ),\n )\n\n\ndef _persist_links(deps: ProcessUrlDeps, ctx: _PageContext, parsed: ParsedDocument) 
-> None:\n links: list[Link] = []\n for raw, anchor in parsed.internal_links:\n normalized = deps.normalizer.normalize(raw)\n target = _upsert_url(deps.urls, normalized, now=ctx.now, is_internal=True)\n links.append(_make_link(ctx, normalized.url, _id(target), anchor, is_internal=True))\n deps.queue.enqueue(ctx.crawl_id, normalized)\n for raw, anchor in parsed.external_links:\n normalized = deps.normalizer.normalize(raw)\n links.append(_make_link(ctx, normalized.url, None, anchor, is_internal=False))\n deps.links.save_many(links)\n\n\ndef _persist_images(repo: ImageRepositoryPort, page_id: int, parsed: ParsedDocument) -> None:\n imgs = [\n Image(\n id=None,\n page_id=page_id,\n src=s,\n alt=a,\n title=t,\n width=w,\n height=h,\n loading_attr=None,\n decoding_attr=None,\n http_status=None,\n content_length=None,\n )\n for s, a, t, w, h in parsed.image_specs\n ]\n repo.save_many(imgs)\n\n\ndef _persist_structured_data(\n deps: ProcessUrlDeps,\n page_id: int,\n html: str,\n *,\n base_url: str,\n) -> None:\n items = deps.sd_parser.parse(html, base_url=base_url)\n sd = [\n StructuredDataItem(id=None, page_id=page_id, format=f, schema_type=t, payload=p)\n for f, t, p in items\n ]\n deps.structured.save_many(sd)\n\n\ndef _id(entity: Page | Url) -> int:\n if entity.id is None:\n raise RuntimeError(f\"{type(entity).__name__} has no id after save\")\n return entity.id\n\n\ndef _make_link(\n ctx: _PageContext,\n target_url: str,\n target_url_id: int | None,\n anchor: str | None,\n *,\n is_internal: bool,\n) -> Link:\n return Link(\n id=None,\n page_id=ctx.page_id,\n source_url_id=ctx.source_url_id,\n target_url=target_url,\n target_url_id=target_url_id,\n anchor_text=anchor,\n rel=frozenset(),\n is_internal=is_internal,\n is_external=not is_internal,\n position_hint=None,\n )\n",
"structuredPatch": [
{
"oldStart": 16,
"oldLines": 11,
"newStart": 16,
"newLines": 13,
"lines": [
" from src.domain.ports.image_repository_port import ImageRepositoryPort",
" from src.domain.ports.link_repository_port import LinkRepositoryPort",
" from src.domain.ports.page_repository_port import PageRepositoryPort",
"+from src.domain.ports.page_token_repository_port import PageTokenRepositoryPort",
" from src.domain.ports.structured_data_parser_port import StructuredDataParserPort",
" from src.domain.ports.structured_data_repository_port import StructuredDataRepositoryPort",
" from src.domain.ports.url_queue_port import UrlQueuePort",
" from src.domain.ports.url_repository_port import UrlRepositoryPort",
" from src.domain.services.link_classifier import LinkClassifier",
"+from src.domain.services.text_tokenizer import TextTokenizer",
" from src.domain.services.url_normalizer import UrlNormalizer",
" from src.domain.value_objects.enums import RenderMode",
" from src.domain.value_objects.normalized_url import NormalizedUrl"
]
}
],
"userModified": false,
"replaceAll": false
}
}
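For context on the two imports this edit introduces: `PageTokenRepositoryPort` and `TextTokenizer` are not shown anywhere in the tool response, so their shape can only be inferred from the conventions visible in `originalFile` (Protocol-style repository ports exposing `save_many`, pure domain services, frozen slotted dataclass entities). Below is a minimal sketch under those assumptions; the `PageToken` fields, the `tokenize` method, and every signature are hypothetical, not confirmed by the repository.

from __future__ import annotations

import re
from dataclasses import dataclass
from typing import Protocol


@dataclass(frozen=True, slots=True)
class PageToken:
    # Hypothetical entity: one token with its occurrence count per page,
    # mirroring the id=None-before-save pattern of the other entities.
    id: int | None
    page_id: int
    token: str
    count: int


class PageTokenRepositoryPort(Protocol):
    # Assumed to follow the save_many style of the other repository ports.
    def save_many(self, tokens: list[PageToken]) -> None: ...


class TextTokenizer:
    # Assumed pure domain service: lowercase word tokens from visible text.
    _WORD = re.compile(r"[\w-]+", re.UNICODE)

    def tokenize(self, text: str) -> list[str]:
        return self._WORD.findall(text.lower())

If this sketch is close, the natural follow-up edits would add `tokens: PageTokenRepositoryPort` and `tokenizer: TextTokenizer` fields to `ProcessUrlDeps` and a `_persist_tokens` step alongside `_persist_links` and `_persist_images` in `execute` -- but those steps are not part of this diff, which only adds the two imports.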