{
"tool_response": {
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/domain\/services\/text_tokenizer.py",
"oldString": "@dataclass(frozen=True, slots=True)\nclass TokenFrequency:\n token: str\n count: int\n\n\nclass TextTokenizer:\n def tokenize(self, text: str) -> list[TokenFrequency]:\n counter: Counter[str] = Counter()\n for raw in _TOKEN_RE.findall(text):\n lowered = raw.lower()\n if lowered in _STOPWORDS:\n continue\n if not (_MIN_LEN <= len(lowered) <= _MAX_LEN):\n continue\n counter[lowered] += 1\n return [TokenFrequency(token=t, count=n) for t, n in counter.items()]",
"newString": "@dataclass(frozen=True, slots=True)\nclass TokenFrequency:\n token: str\n count: int\n lemma: str | None = None\n\n\nclass TextTokenizer:\n def tokenize(self, text: str) -> list[TokenFrequency]:\n counter: Counter[str] = Counter()\n for raw in _TOKEN_RE.findall(text):\n lowered = raw.lower()\n if lowered in _STOPWORDS:\n continue\n if not (_MIN_LEN <= len(lowered) <= _MAX_LEN):\n continue\n counter[lowered] += 1\n return [TokenFrequency(token=t, count=n) for t, n in counter.items()]",
"originalFile": "\"\"\"Tokenizes body text into bag-of-words for TF-IDF analysis.\n\nRules:\n- lowercase\n- split on non-letter characters (supports äöüß + english)\n- tokens of length 3..80\n- drop German + English stopwords\n- drop pure-digit tokens\n\"\"\"\n\nfrom __future__ import annotations\n\nimport re\nfrom collections import Counter\nfrom dataclasses import dataclass\n\n_TOKEN_RE = re.compile(r\"[a-zäöüß]{3,80}\", re.IGNORECASE)\n_STOPWORDS: frozenset[str] = frozenset(\n {\n \"der\",\n \"die\",\n \"das\",\n \"den\",\n \"dem\",\n \"des\",\n \"ein\",\n \"eine\",\n \"einen\",\n \"einem\",\n \"eines\",\n \"einer\",\n \"und\",\n \"oder\",\n \"aber\",\n \"doch\",\n \"sondern\",\n \"denn\",\n \"weil\",\n \"als\",\n \"wenn\",\n \"dann\",\n \"dass\",\n \"daß\",\n \"ob\",\n \"während\",\n \"bevor\",\n \"nachdem\",\n \"seit\",\n \"seitdem\",\n \"bis\",\n \"damit\",\n \"falls\",\n \"sofern\",\n \"obwohl\",\n \"obgleich\",\n \"trotzdem\",\n \"dennoch\",\n \"jedoch\",\n \"allerdings\",\n \"zwar\",\n \"ist\",\n \"sind\",\n \"war\",\n \"waren\",\n \"wird\",\n \"werden\",\n \"wurde\",\n \"wurden\",\n \"hat\",\n \"haben\",\n \"hatte\",\n \"hatten\",\n \"sein\",\n \"bin\",\n \"bist\",\n \"habe\",\n \"hast\",\n \"kann\",\n \"kannst\",\n \"könnt\",\n \"können\",\n \"konnte\",\n \"konnten\",\n \"muss\",\n \"müssen\",\n \"soll\",\n \"sollen\",\n \"sollte\",\n \"mag\",\n \"mögen\",\n \"darf\",\n \"dürfen\",\n \"ich\",\n \"du\",\n \"er\",\n \"sie\",\n \"es\",\n \"wir\",\n \"ihr\",\n \"mein\",\n \"dein\",\n \"unser\",\n \"euer\",\n \"mich\",\n \"dich\",\n \"sich\",\n \"uns\",\n \"euch\",\n \"ihn\",\n \"ihm\",\n \"ihnen\",\n \"ihre\",\n \"ihres\",\n \"ihrer\",\n \"meine\",\n \"meiner\",\n \"deine\",\n \"in\",\n \"im\",\n \"an\",\n \"am\",\n \"auf\",\n \"für\",\n \"fuer\",\n \"bei\",\n \"mit\",\n \"von\",\n \"vom\",\n \"zu\",\n \"zum\",\n \"zur\",\n \"aus\",\n \"nach\",\n \"vor\",\n \"durch\",\n \"gegen\",\n \"ohne\",\n \"um\",\n \"über\",\n \"ueber\",\n \"unter\",\n \"neben\",\n \"hinter\",\n \"zwischen\",\n \"außer\",\n \"auch\",\n \"noch\",\n \"schon\",\n \"nur\",\n \"sehr\",\n \"mehr\",\n \"hier\",\n \"dort\",\n \"wieder\",\n \"nicht\",\n \"kein\",\n \"keine\",\n \"keinen\",\n \"keinem\",\n \"alle\",\n \"alles\",\n \"jeder\",\n \"jede\",\n \"jedes\",\n \"viele\",\n \"viel\",\n \"wenig\",\n \"einige\",\n \"manche\",\n \"andere\",\n \"wie\",\n \"was\",\n \"wer\",\n \"wem\",\n \"wen\",\n \"wessen\",\n \"welche\",\n \"welcher\",\n \"welches\",\n \"warum\",\n \"wieso\",\n \"weshalb\",\n \"wann\",\n \"wo\",\n \"woher\",\n \"wohin\",\n \"ja\",\n \"nein\",\n \"vielleicht\",\n \"immer\",\n \"nie\",\n \"manchmal\",\n \"oft\",\n \"selten\",\n \"man\",\n \"jemand\",\n \"niemand\",\n \"etwas\",\n \"nichts\",\n \"the\",\n \"and\",\n \"or\",\n \"of\",\n \"to\",\n \"a\",\n \"for\",\n \"with\",\n \"on\",\n \"at\",\n \"by\",\n \"is\",\n \"are\",\n \"were\",\n \"be\",\n \"been\",\n \"being\",\n \"have\",\n \"has\",\n \"had\",\n \"do\",\n \"does\",\n \"did\",\n \"will\",\n \"would\",\n \"can\",\n \"could\",\n \"this\",\n \"that\",\n \"these\",\n \"those\",\n \"it\",\n \"its\",\n \"but\",\n \"not\",\n \"you\",\n \"your\",\n }\n)\n_MIN_LEN = 3\n_MAX_LEN = 80\n\n\n@dataclass(frozen=True, slots=True)\nclass TokenFrequency:\n token: str\n count: int\n\n\nclass TextTokenizer:\n def tokenize(self, text: str) -> list[TokenFrequency]:\n counter: Counter[str] = Counter()\n for raw in _TOKEN_RE.findall(text):\n lowered = raw.lower()\n if lowered in _STOPWORDS:\n continue\n if not (_MIN_LEN <= len(lowered) <= _MAX_LEN):\n continue\n 
counter[lowered] += 1\n return [TokenFrequency(token=t, count=n) for t, n in counter.items()]\n",
"structuredPatch": [
{
"oldStart": 244,
"oldLines": 6,
"newStart": 244,
"newLines": 7,
"lines": [
" class TokenFrequency:",
" token: str",
" count: int",
"+ lemma: str | None = None",
" ",
" ",
" class TextTokenizer:"
]
}
],
"userModified": false,
"replaceAll": false
}
}
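For context, a minimal usage sketch of the tokenizer after this edit. The import path is an assumption inferred from the file path above and may differ in the actual crawler package layout.

# Hypothetical usage sketch; the import path is inferred from the
# filePath in the tool response and may not match the real package layout.
from domain.services.text_tokenizer import TextTokenizer

tokenizer = TextTokenizer()
frequencies = tokenizer.tokenize(
    "Die Tokenisierung zählt Wörter, und Tokenisierung filtert Stoppwörter."
)
# Stopwords ("die", "und") are dropped; the remaining tokens are lowercased
# and counted. With the edit applied, each TokenFrequency also carries an
# optional `lemma` field that defaults to None until a lemmatizer fills it.
for tf in sorted(frequencies, key=lambda tf: tf.count, reverse=True):
    print(tf.token, tf.count, tf.lemma)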