{
"tool_response": {
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/infrastructure\/fetchers\/playwright_fetcher.py",
"oldString": "from src.domain.entities.js_error import JsError\nfrom src.domain.entities.resource import Resource\nfrom src.domain.ports.browser_fetcher_port import BrowserFetcherPort, BrowserFetchResult\nfrom src.domain.value_objects.enums import JsErrorSeverity\nfrom src.domain.value_objects.normalized_url import NormalizedUrl\nfrom src.domain.value_objects.performance_metrics import PerformanceMetrics\nfrom src.infrastructure.config.constants import PLAYWRIGHT_TIMEOUT_SEC, USER_AGENT\n\n_MS_PER_SECOND = 1000",
"newString": "from src.domain.entities.js_error import JsError\nfrom src.domain.entities.resource import Resource\nfrom src.domain.ports.browser_fetcher_port import BrowserFetcherPort, BrowserFetchResult\nfrom src.domain.value_objects.enums import JsErrorSeverity\nfrom src.domain.value_objects.normalized_url import NormalizedUrl\nfrom src.domain.value_objects.performance_metrics import PerformanceMetrics\nfrom src.infrastructure.config.constants import PLAYWRIGHT_TIMEOUT_SEC, USER_AGENT\nfrom src.infrastructure.fetchers.web_vitals_collector import (\n INIT_SCRIPT,\n READ_SCRIPT,\n parse_vitals,\n)\n\n_MS_PER_SECOND = 1000\n_WEB_VITALS_SETTLE_MS = 2000",
"originalFile": "from __future__ import annotations\n\nimport contextlib\nimport time\nfrom dataclasses import dataclass, field\nfrom typing import Self\n\nfrom playwright.async_api import Browser, Page, Response, async_playwright\n\nfrom src.domain.entities.js_error import JsError\nfrom src.domain.entities.resource import Resource\nfrom src.domain.ports.browser_fetcher_port import BrowserFetcherPort, BrowserFetchResult\nfrom src.domain.value_objects.enums import JsErrorSeverity\nfrom src.domain.value_objects.normalized_url import NormalizedUrl\nfrom src.domain.value_objects.performance_metrics import PerformanceMetrics\nfrom src.infrastructure.config.constants import PLAYWRIGHT_TIMEOUT_SEC, USER_AGENT\n\n_MS_PER_SECOND = 1000\n\n\n@dataclass\nclass _FetchTelemetry:\n js_errors: list[JsError] = field(default_factory=list)\n resources: list[Resource] = field(default_factory=list)\n console_warnings: list[JsError] = field(default_factory=list)\n\n\nclass PlaywrightFetcher(BrowserFetcherPort):\n def __init__(self, browser: Browser) -> None:\n self._browser = browser\n\n async def fetch(\n self,\n url: NormalizedUrl,\n *,\n screenshot: bool = False,\n ) -> BrowserFetchResult:\n context = await self._browser.new_context(user_agent=USER_AGENT)\n page = await context.new_page()\n telemetry = _FetchTelemetry()\n _attach_listeners(page, telemetry)\n start = time.perf_counter()\n try:\n response = await page.goto(url.url, timeout=PLAYWRIGHT_TIMEOUT_SEC * _MS_PER_SECOND)\n html = await page.content()\n status = response.status if response else 0\n final_url = response.url if response else url.url\n headers = _headers_of(response)\n shot = await page.screenshot(full_page=True) if screenshot else None\n dom_size = await page.evaluate(\"document.getElementsByTagName('*').length\")\n finally:\n render_ms = int((time.perf_counter() - start) * _MS_PER_SECOND)\n await context.close()\n return BrowserFetchResult(\n final_url=final_url,\n status_code=status,\n html=html,\n headers=headers,\n js_errors=tuple(telemetry.js_errors + telemetry.console_warnings),\n resources=tuple(telemetry.resources),\n performance=PerformanceMetrics(\n dom_node_count=dom_size,\n render_time_ms=render_ms,\n lcp_ms=None,\n cls=None,\n tbt_ms=None,\n ),\n render_time_ms=render_ms,\n screenshot_png=shot,\n )\n\n @classmethod\n async def create(cls) -> Self:\n playwright = await async_playwright().start()\n browser = await playwright.chromium.launch(headless=True)\n instance = cls(browser)\n instance._playwright = playwright # type: ignore[attr-defined]\n return instance\n\n async def close(self) -> None:\n await self._browser.close()\n pw = getattr(self, \"_playwright\", None)\n if pw is not None:\n await pw.stop()\n\n\ndef _attach_listeners(page: Page, tel: _FetchTelemetry) -> None:\n page.on(\n \"pageerror\",\n lambda exc: tel.js_errors.append(\n JsError(\n id=None,\n page_id=0,\n severity=JsErrorSeverity.ERROR,\n message=str(exc),\n source=None,\n line_no=None,\n col_no=None,\n ),\n ),\n )\n page.on(\"console\", lambda msg: _on_console(msg, tel))\n page.on(\"response\", lambda resp: _on_response(resp, tel))\n\n\ndef _on_console(msg: object, tel: _FetchTelemetry) -> None:\n msg_type = getattr(msg, \"type\", \"\")\n if msg_type not in {\"error\", \"warning\"}:\n return\n severity = JsErrorSeverity.ERROR if msg_type == \"error\" else JsErrorSeverity.WARNING\n tel.console_warnings.append(\n JsError(\n id=None,\n page_id=0,\n severity=severity,\n message=str(getattr(msg, \"text\", \"\")),\n source=None,\n line_no=None,\n col_no=None,\n ),\n )\n\n\ndef _on_response(resp: object, tel: _FetchTelemetry) -> None:\n with contextlib.suppress(AttributeError, TypeError):\n tel.resources.append(\n Resource(\n id=None,\n page_id=0,\n resource_type=str(\n getattr(getattr(resp, \"request\", None), \"resource_type\", \"other\")\n ),\n url=str(getattr(resp, \"url\", \"\")),\n http_status=int(getattr(resp, \"status\", 0)),\n content_length=None,\n duration_ms=None,\n from_cache=None,\n )\n )\n\n\ndef _headers_of(response: Response | None) -> dict[str, str]:\n if response is None:\n return {}\n return {k.lower(): v for k, v in response.headers.items()}\n",
"structuredPatch": [
{
"oldStart": 14,
"oldLines": 8,
"newStart": 14,
"newLines": 14,
"lines": [
" from src.domain.value_objects.normalized_url import NormalizedUrl",
" from src.domain.value_objects.performance_metrics import PerformanceMetrics",
" from src.infrastructure.config.constants import PLAYWRIGHT_TIMEOUT_SEC, USER_AGENT",
"+from src.infrastructure.fetchers.web_vitals_collector import (",
"+ INIT_SCRIPT,",
"+ READ_SCRIPT,",
"+ parse_vitals,",
"+)",
" ",
" _MS_PER_SECOND = 1000",
"+_WEB_VITALS_SETTLE_MS = 2000",
" ",
" ",
" @dataclass"
]
}
],
"userModified": false,
"replaceAll": false
}
}