{
"event": "PreToolUse",
"tool_name": "Edit",
"tool_input": {
"file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/infrastructure\/parsers\/spacy_tokenizer.py",
"old_string": "from collections import Counter\nfrom typing import Self\n\nimport spacy # type: ignore[import-not-found]\nfrom spacy.language import Language # type: ignore[import-not-found]\n\nfrom src.domain.ports.tokenizer_port import TokenizerPort\nfrom src.domain.services.text_tokenizer import TokenFrequency",
"new_string": "from collections import Counter\nfrom typing import TYPE_CHECKING, Self\n\nimport spacy # type: ignore[import-not-found]\n\nfrom src.domain.ports.tokenizer_port import TokenizerPort\nfrom src.domain.services.text_tokenizer import TokenFrequency\n\nif TYPE_CHECKING:\n from spacy.language import Language # type: ignore[import-not-found]",
"replace_all": false
}
}
{
"tool_response": {
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/infrastructure\/parsers\/spacy_tokenizer.py",
"oldString": "from collections import Counter\nfrom typing import Self\n\nimport spacy # type: ignore[import-not-found]\nfrom spacy.language import Language # type: ignore[import-not-found]\n\nfrom src.domain.ports.tokenizer_port import TokenizerPort\nfrom src.domain.services.text_tokenizer import TokenFrequency",
"newString": "from collections import Counter\nfrom typing import TYPE_CHECKING, Self\n\nimport spacy # type: ignore[import-not-found]\n\nfrom src.domain.ports.tokenizer_port import TokenizerPort\nfrom src.domain.services.text_tokenizer import TokenFrequency\n\nif TYPE_CHECKING:\n from spacy.language import Language # type: ignore[import-not-found]",
"originalFile": "\"\"\"spaCy-based tokenizer with lemmatization (German de_core_news_sm).\"\"\"\n\nfrom __future__ import annotations\n\nfrom collections import Counter\nfrom typing import Self\n\nimport spacy # type: ignore[import-not-found]\nfrom spacy.language import Language # type: ignore[import-not-found]\n\nfrom src.domain.ports.tokenizer_port import TokenizerPort\nfrom src.domain.services.text_tokenizer import TokenFrequency\n\n_MODEL = \"de_core_news_sm\"\n_MIN_LEN = 3\n_MAX_LEN = 80\n\n\nclass SpacyTokenizer(TokenizerPort):\n def __init__(self, nlp: Language) -> None:\n self._nlp = nlp\n\n def tokenize(self, text: str) -> list[TokenFrequency]:\n doc = self._nlp(text or \"\", disable=[\"parser\", \"ner\", \"attribute_ruler\"])\n per_lemma: dict[str, Counter[str]] = {}\n for spacy_token in doc:\n if not spacy_token.is_alpha or spacy_token.is_stop:\n continue\n surface = spacy_token.text.lower()\n if not (_MIN_LEN <= len(surface) <= _MAX_LEN):\n continue\n lemma = (spacy_token.lemma_ or surface).lower()\n per_lemma.setdefault(lemma, Counter())[surface] += 1\n return [\n TokenFrequency(\n token=_dominant(surface_counts), count=sum(surface_counts.values()), lemma=lemma\n )\n for lemma, surface_counts in per_lemma.items()\n ]\n\n @classmethod\n def create(cls) -> Self:\n return cls(spacy.load(_MODEL, disable=[\"parser\", \"ner\", \"attribute_ruler\"]))\n\n\ndef _dominant(counter: Counter[str]) -> str:\n return counter.most_common(1)[0][0]\n",
"structuredPatch": [
{
"oldStart": 3,
"oldLines": 14,
"newStart": 3,
"newLines": 16,
"lines": [
" from __future__ import annotations",
" ",
" from collections import Counter",
"-from typing import Self",
"+from typing import TYPE_CHECKING, Self",
" ",
" import spacy # type: ignore[import-not-found]",
"-from spacy.language import Language # type: ignore[import-not-found]",
" ",
" from src.domain.ports.tokenizer_port import TokenizerPort",
" from src.domain.services.text_tokenizer import TokenFrequency",
" ",
"+if TYPE_CHECKING:",
"+ from spacy.language import Language # type: ignore[import-not-found]",
"+",
" _MODEL = \"de_core_news_sm\"",
" _MIN_LEN = 3",
" _MAX_LEN = 80"
]
}
],
"userModified": false,
"replaceAll": false
}
}