{
"event": "PreToolUse",
"tool_name": "Edit",
"tool_input": {
"file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/interfaces\/cli\/wiring.py",
"old_string": " extractor=self.extractor,\n clock=self.clock,\n render_mode=render_mode,\n )\n return ProcessUrlUseCase(deps)",
"new_string": " extractor=self.extractor,\n clock=self.clock,\n tokens=self.repos.tokens,\n tokenizer=self.tokenizer,\n render_mode=render_mode,\n )\n return ProcessUrlUseCase(deps)",
"replace_all": false
}
}
{
"tool_response": {
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/interfaces\/cli\/wiring.py",
"oldString": " extractor=self.extractor,\n clock=self.clock,\n render_mode=render_mode,\n )\n return ProcessUrlUseCase(deps)",
"newString": " extractor=self.extractor,\n clock=self.clock,\n tokens=self.repos.tokens,\n tokenizer=self.tokenizer,\n render_mode=render_mode,\n )\n return ProcessUrlUseCase(deps)",
"originalFile": "from __future__ import annotations\n\nfrom dataclasses import dataclass\nfrom datetime import timedelta\n\nfrom src.application.use_cases.check_external_link import CheckExternalLinkUseCase\nfrom src.application.use_cases.compare_crawls import CompareCrawlsUseCase\nfrom src.application.use_cases.extract_seo_data import ExtractSeoDataUseCase\nfrom src.application.use_cases.finalize_crawl import FinalizeCrawlUseCase\nfrom src.application.use_cases.generate_crawl_report import GenerateCrawlReportUseCase\nfrom src.application.use_cases.list_crawls import ListCrawlsUseCase\nfrom src.application.use_cases.process_url import ProcessUrlDeps, ProcessUrlUseCase\nfrom src.application.use_cases.recheck_external_links import RecheckExternalLinksUseCase\nfrom src.application.use_cases.start_crawl import StartCrawlUseCase\nfrom src.domain.ports.http_fetcher_port import HttpFetcherPort\nfrom src.domain.services.link_classifier import LinkClassifier\nfrom src.domain.services.quality_flag_calculator import QualityFlagCalculator\nfrom src.domain.services.url_normalizer import UrlNormalizer\nfrom src.domain.value_objects.enums import RenderMode\nfrom src.infrastructure.clock.system_clock import SystemClock\nfrom src.infrastructure.config.constants import EXTERNAL_CHECK_CACHE_HOURS\nfrom src.infrastructure.config.db_config import load_db_settings\nfrom src.infrastructure.fetchers.httpx_fetcher import HttpxFetcher\nfrom src.infrastructure.parsers.html_parser_bs4 import Bs4HtmlParser\nfrom src.infrastructure.parsers.structured_data_parser_extruct import (\n ExtructStructuredDataParser,\n)\nfrom src.infrastructure.persistence.connection import MariaDbConnectionFactory\nfrom src.infrastructure.persistence.crawl_error_repository import CrawlErrorRepository\nfrom src.infrastructure.persistence.crawl_repository import CrawlRepository\nfrom src.infrastructure.persistence.external_check_repository import ExternalCheckRepository\nfrom src.infrastructure.persistence.hreflang_repository import HreflangRepository\nfrom src.infrastructure.persistence.image_repository import ImageRepository\nfrom src.infrastructure.persistence.link_repository import LinkRepository\nfrom src.infrastructure.persistence.page_repository import PageRepository\nfrom src.infrastructure.persistence.page_token_repository import PageTokenRepository\nfrom src.infrastructure.persistence.structured_data_repository import (\n StructuredDataRepository,\n)\nfrom src.infrastructure.persistence.url_repository import UrlRepository\nfrom src.infrastructure.queue.db_url_queue import DbUrlQueue\n\n\n@dataclass(frozen=True, slots=True)\nclass Repositories:\n crawls: CrawlRepository\n urls: UrlRepository\n pages: PageRepository\n links: LinkRepository\n images: ImageRepository\n structured: StructuredDataRepository\n hreflang: HreflangRepository\n errors: CrawlErrorRepository\n external_checks: ExternalCheckRepository\n tokens: PageTokenRepository\n\n\ndef _build_repositories(conn: MariaDbConnectionFactory) -> Repositories:\n return Repositories(\n crawls=CrawlRepository(conn),\n urls=UrlRepository(conn),\n pages=PageRepository(conn),\n links=LinkRepository(conn),\n images=ImageRepository(conn),\n structured=StructuredDataRepository(conn),\n hreflang=HreflangRepository(conn),\n errors=CrawlErrorRepository(conn),\n external_checks=ExternalCheckRepository(conn),\n tokens=PageTokenRepository(conn),\n )\n\n\nclass Container:\n def __init__(self, conn: MariaDbConnectionFactory) -> None:\n self.conn = conn\n self.repos = _build_repositories(conn)\n self.clock = SystemClock()\n self.normalizer = UrlNormalizer(strip_tracking=False)\n self.queue = DbUrlQueue(conn)\n self.html_parser = Bs4HtmlParser()\n self.sd_parser = ExtructStructuredDataParser()\n self.calculator = QualityFlagCalculator()\n self.extractor = ExtractSeoDataUseCase(calculator=self.calculator)\n self.tokenizer = TextTokenizer()\n\n def start_crawl(self) -> StartCrawlUseCase:\n return StartCrawlUseCase(\n crawls=self.repos.crawls,\n urls=self.repos.urls,\n queue=self.queue,\n normalizer=self.normalizer,\n clock=self.clock,\n )\n\n def process_url(\n self,\n fetcher: HttpFetcherPort,\n *,\n registered_domain: str,\n render_mode: RenderMode = RenderMode.HTTP,\n ) -> ProcessUrlUseCase:\n deps = ProcessUrlDeps(\n fetcher=fetcher,\n html_parser=self.html_parser,\n sd_parser=self.sd_parser,\n pages=self.repos.pages,\n urls=self.repos.urls,\n links=self.repos.links,\n images=self.repos.images,\n structured=self.repos.structured,\n hreflang=self.repos.hreflang,\n queue=self.queue,\n classifier=LinkClassifier(registered_domain=registered_domain),\n normalizer=self.normalizer,\n extractor=self.extractor,\n clock=self.clock,\n render_mode=render_mode,\n )\n return ProcessUrlUseCase(deps)\n\n def finalize(self) -> FinalizeCrawlUseCase:\n return FinalizeCrawlUseCase(\n crawls=self.repos.crawls,\n errors=self.repos.errors,\n clock=self.clock,\n )\n\n def list_crawls(self) -> ListCrawlsUseCase:\n return ListCrawlsUseCase(crawls=self.repos.crawls)\n\n def report(self) -> GenerateCrawlReportUseCase:\n return GenerateCrawlReportUseCase(\n crawls=self.repos.crawls,\n pages=self.repos.pages,\n errors=self.repos.errors,\n )\n\n def diff(self) -> CompareCrawlsUseCase:\n return CompareCrawlsUseCase(pages=self.repos.pages, urls=self.repos.urls)\n\n def check_external(self, fetcher: HttpxFetcher) -> CheckExternalLinkUseCase:\n return CheckExternalLinkUseCase(\n fetcher=fetcher,\n checks=self.repos.external_checks,\n normalizer=self.normalizer,\n clock=self.clock,\n cache_ttl=timedelta(hours=EXTERNAL_CHECK_CACHE_HOURS),\n )\n\n def recheck_external(self, fetcher: HttpxFetcher) -> RecheckExternalLinksUseCase:\n return RecheckExternalLinksUseCase(\n pages=self.repos.pages,\n links=self.repos.links,\n checker=self.check_external(fetcher),\n )\n\n\ndef build_container(database: str = \"crawler\") -> Container:\n return Container(MariaDbConnectionFactory(load_db_settings(database)))\n",
"structuredPatch": [
{
"oldStart": 114,
"oldLines": 6,
"newStart": 114,
"newLines": 8,
"lines": [
" normalizer=self.normalizer,",
" extractor=self.extractor,",
" clock=self.clock,",
"+ tokens=self.repos.tokens,",
"+ tokenizer=self.tokenizer,",
" render_mode=render_mode,",
" )",
" return ProcessUrlUseCase(deps)"
]
}
],
"userModified": false,
"replaceAll": false
}
}