{
"event": "PreToolUse",
"tool_name": "Write",
"tool_input": {
"file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/application\/use_cases\/extract_seo_data.py",
"content": "from __future__ import annotations\n\nfrom dataclasses import dataclass\nfrom datetime import datetime\n\nfrom src.domain.entities.page import Page\nfrom src.domain.ports.html_parser_port import ParsedDocument\nfrom src.domain.ports.http_fetcher_port import FetchResult\nfrom src.domain.services.quality_flag_calculator import QualityFlagCalculator\nfrom src.domain.value_objects.enums import RenderMode\nfrom src.domain.value_objects.http_response import HttpResponse\nfrom src.domain.value_objects.http_status import HttpStatus\nfrom src.domain.value_objects.performance_metrics import PerformanceMetrics\nfrom src.domain.value_objects.storage_paths import StoragePaths\n\n\n@dataclass(frozen=True, slots=True)\nclass ExtractedPage:\n page: Page\n parsed: ParsedDocument\n\n\n@dataclass(frozen=True, slots=True)\nclass ExtractInput:\n crawl_id: int\n url_id: int\n fetched_at: datetime\n render_mode: RenderMode\n fetch: FetchResult\n parsed: ParsedDocument\n page_url: str\n html_raw_path: str | None = None\n screenshot_path: str | None = None\n\n\nclass ExtractSeoDataUseCase:\n def __init__(self, *, calculator: QualityFlagCalculator) -> None:\n self._calculator = calculator\n\n def execute(self, command: ExtractInput) -> ExtractedPage:\n http = _build_http_response(command.fetch)\n flags = self._calculator.calculate(\n head=command.parsed.head,\n h1_count=command.parsed.body.h1_count,\n http_status=http.status,\n page_url=command.page_url,\n )\n page = Page(\n id=None,\n crawl_id=command.crawl_id,\n url_id=command.url_id,\n fetched_at=command.fetched_at,\n render_mode=command.render_mode,\n http=http,\n head=command.parsed.head,\n og=command.parsed.og,\n twitter=command.parsed.twitter,\n body=command.parsed.body,\n performance=_empty_performance(),\n storage=StoragePaths(\n screenshot_path=command.screenshot_path,\n html_raw_path=command.html_raw_path,\n ),\n quality_flags=flags,\n )\n return ExtractedPage(page=page, parsed=command.parsed)\n\n\ndef _build_http_response(fetch: FetchResult) -> HttpResponse:\n status = HttpStatus(fetch.status_code) if fetch.status_code else None\n return HttpResponse(\n status=status,\n final_url=fetch.final_url,\n redirect_chain=fetch.redirect_chain,\n response_time_ms=fetch.response_time_ms,\n ttfb_ms=fetch.ttfb_ms,\n content_type=fetch.headers.get(\"content-type\"),\n content_length=_safe_int(fetch.headers.get(\"content-length\")),\n content_encoding=fetch.headers.get(\"content-encoding\"),\n )\n\n\ndef _safe_int(value: str | None) -> int | None:\n if value is None:\n return None\n try:\n return int(value)\n except ValueError:\n return None\n\n\ndef _empty_performance() -> PerformanceMetrics:\n return PerformanceMetrics(None, None, None, None, None)\n"
}
}
{
"tool_response": {
"type": "update",
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/application\/use_cases\/extract_seo_data.py",
"content": "from __future__ import annotations\n\nfrom dataclasses import dataclass\nfrom datetime import datetime\n\nfrom src.domain.entities.page import Page\nfrom src.domain.ports.html_parser_port import ParsedDocument\nfrom src.domain.ports.http_fetcher_port import FetchResult\nfrom src.domain.services.quality_flag_calculator import QualityFlagCalculator\nfrom src.domain.value_objects.enums import RenderMode\nfrom src.domain.value_objects.http_response import HttpResponse\nfrom src.domain.value_objects.http_status import HttpStatus\nfrom src.domain.value_objects.performance_metrics import PerformanceMetrics\nfrom src.domain.value_objects.storage_paths import StoragePaths\n\n\n@dataclass(frozen=True, slots=True)\nclass ExtractedPage:\n page: Page\n parsed: ParsedDocument\n\n\n@dataclass(frozen=True, slots=True)\nclass ExtractInput:\n crawl_id: int\n url_id: int\n fetched_at: datetime\n render_mode: RenderMode\n fetch: FetchResult\n parsed: ParsedDocument\n page_url: str\n html_raw_path: str | None = None\n screenshot_path: str | None = None\n\n\nclass ExtractSeoDataUseCase:\n def __init__(self, *, calculator: QualityFlagCalculator) -> None:\n self._calculator = calculator\n\n def execute(self, command: ExtractInput) -> ExtractedPage:\n http = _build_http_response(command.fetch)\n flags = self._calculator.calculate(\n head=command.parsed.head,\n h1_count=command.parsed.body.h1_count,\n http_status=http.status,\n page_url=command.page_url,\n )\n page = Page(\n id=None,\n crawl_id=command.crawl_id,\n url_id=command.url_id,\n fetched_at=command.fetched_at,\n render_mode=command.render_mode,\n http=http,\n head=command.parsed.head,\n og=command.parsed.og,\n twitter=command.parsed.twitter,\n body=command.parsed.body,\n performance=_empty_performance(),\n storage=StoragePaths(\n screenshot_path=command.screenshot_path,\n html_raw_path=command.html_raw_path,\n ),\n quality_flags=flags,\n )\n return ExtractedPage(page=page, parsed=command.parsed)\n\n\ndef _build_http_response(fetch: FetchResult) -> HttpResponse:\n status = HttpStatus(fetch.status_code) if fetch.status_code else None\n return HttpResponse(\n status=status,\n final_url=fetch.final_url,\n redirect_chain=fetch.redirect_chain,\n response_time_ms=fetch.response_time_ms,\n ttfb_ms=fetch.ttfb_ms,\n content_type=fetch.headers.get(\"content-type\"),\n content_length=_safe_int(fetch.headers.get(\"content-length\")),\n content_encoding=fetch.headers.get(\"content-encoding\"),\n )\n\n\ndef _safe_int(value: str | None) -> int | None:\n if value is None:\n return None\n try:\n return int(value)\n except ValueError:\n return None\n\n\ndef _empty_performance() -> PerformanceMetrics:\n return PerformanceMetrics(None, None, None, None, None)\n",
"structuredPatch": [
{
"oldStart": 20,
"oldLines": 46,
"newStart": 20,
"newLines": 50,
"lines": [
" parsed: ParsedDocument",
" ",
" ",
"+@dataclass(frozen=True, slots=True)",
"+class ExtractInput:",
"+ crawl_id: int",
"+ url_id: int",
"+ fetched_at: datetime",
"+ render_mode: RenderMode",
"+ fetch: FetchResult",
"+ parsed: ParsedDocument",
"+ page_url: str",
"+ html_raw_path: str | None = None",
"+ screenshot_path: str | None = None",
"+",
"+",
" class ExtractSeoDataUseCase:",
" def __init__(self, *, calculator: QualityFlagCalculator) -> None:",
" self._calculator = calculator",
" ",
"- def execute(",
"- self,",
"- *,",
"- crawl_id: int,",
"- url_id: int,",
"- fetched_at: datetime,",
"- render_mode: RenderMode,",
"- fetch: FetchResult,",
"- parsed: ParsedDocument,",
"- page_url: str,",
"- html_raw_path: str | None = None,",
"- screenshot_path: str | None = None,",
"- ) -> ExtractedPage:",
"- http = _build_http_response(fetch)",
"+ def execute(self, command: ExtractInput) -> ExtractedPage:",
"+ http = _build_http_response(command.fetch)",
" flags = self._calculator.calculate(",
"- head=parsed.head,",
"- h1_count=parsed.body.h1_count,",
"+ head=command.parsed.head,",
"+ h1_count=command.parsed.body.h1_count,",
" http_status=http.status,",
"- page_url=page_url,",
"+ page_url=command.page_url,",
" )",
" page = Page(",
" id=None,",
"- crawl_id=crawl_id,",
"- url_id=url_id,",
"- fetched_at=fetched_at,",
"- render_mode=render_mode,",
"+ crawl_id=command.crawl_id,",
"+ url_id=command.url_id,",
"+ fetched_at=command.fetched_at,",
"+ render_mode=command.render_mode,",
" http=http,",
"- head=parsed.head,",
"- og=parsed.og,",
"- twitter=parsed.twitter,",
"- body=parsed.body,",
"+ head=command.parsed.head,",
"+ og=command.parsed.og,",
"+ twitter=command.parsed.twitter,",
"+ body=command.parsed.body,",
" performance=_empty_performance(),",
"- storage=StoragePaths(screenshot_path=screenshot_path, html_raw_path=html_raw_path),",
"+ storage=StoragePaths(",
"+ screenshot_path=command.screenshot_path,",
"+ html_raw_path=command.html_raw_path,",
"+ ),",
" quality_flags=flags,",
" )",
"- return ExtractedPage(page=page, parsed=parsed)",
"+ return ExtractedPage(page=page, parsed=command.parsed)",
" ",
" ",
" def _build_http_response(fetch: FetchResult) -> HttpResponse:"
]
}
],
"originalFile": "from __future__ import annotations\n\nfrom dataclasses import dataclass\nfrom datetime import datetime\n\nfrom src.domain.entities.page import Page\nfrom src.domain.ports.html_parser_port import ParsedDocument\nfrom src.domain.ports.http_fetcher_port import FetchResult\nfrom src.domain.services.quality_flag_calculator import QualityFlagCalculator\nfrom src.domain.value_objects.enums import RenderMode\nfrom src.domain.value_objects.http_response import HttpResponse\nfrom src.domain.value_objects.http_status import HttpStatus\nfrom src.domain.value_objects.performance_metrics import PerformanceMetrics\nfrom src.domain.value_objects.storage_paths import StoragePaths\n\n\n@dataclass(frozen=True, slots=True)\nclass ExtractedPage:\n page: Page\n parsed: ParsedDocument\n\n\nclass ExtractSeoDataUseCase:\n def __init__(self, *, calculator: QualityFlagCalculator) -> None:\n self._calculator = calculator\n\n def execute(\n self,\n *,\n crawl_id: int,\n url_id: int,\n fetched_at: datetime,\n render_mode: RenderMode,\n fetch: FetchResult,\n parsed: ParsedDocument,\n page_url: str,\n html_raw_path: str | None = None,\n screenshot_path: str | None = None,\n ) -> ExtractedPage:\n http = _build_http_response(fetch)\n flags = self._calculator.calculate(\n head=parsed.head,\n h1_count=parsed.body.h1_count,\n http_status=http.status,\n page_url=page_url,\n )\n page = Page(\n id=None,\n crawl_id=crawl_id,\n url_id=url_id,\n fetched_at=fetched_at,\n render_mode=render_mode,\n http=http,\n head=parsed.head,\n og=parsed.og,\n twitter=parsed.twitter,\n body=parsed.body,\n performance=_empty_performance(),\n storage=StoragePaths(screenshot_path=screenshot_path, html_raw_path=html_raw_path),\n quality_flags=flags,\n )\n return ExtractedPage(page=page, parsed=parsed)\n\n\ndef _build_http_response(fetch: FetchResult) -> HttpResponse:\n status = HttpStatus(fetch.status_code) if fetch.status_code else None\n return HttpResponse(\n status=status,\n final_url=fetch.final_url,\n redirect_chain=fetch.redirect_chain,\n response_time_ms=fetch.response_time_ms,\n ttfb_ms=fetch.ttfb_ms,\n content_type=fetch.headers.get(\"content-type\"),\n content_length=_safe_int(fetch.headers.get(\"content-length\")),\n content_encoding=fetch.headers.get(\"content-encoding\"),\n )\n\n\ndef _safe_int(value: str | None) -> int | None:\n if value is None:\n return None\n try:\n return int(value)\n except ValueError:\n return None\n\n\ndef _empty_performance() -> PerformanceMetrics:\n return PerformanceMetrics(None, None, None, None, None)\n",
"userModified": false
}
}