{
"event": "PreToolUse",
"tool_name": "Write",
"tool_input": {
"file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/domain\/services\/text_tokenizer.py",
"content": "\"\"\"Tokenizes body text into bag-of-words for TF-IDF analysis.\n\nRules:\n- lowercase\n- split on non-letter characters (supports äöüß + english)\n- tokens of length 3..80\n- drop German + English stopwords\n- drop pure-digit tokens\n\"\"\"\nfrom __future__ import annotations\n\nimport re\nfrom collections import Counter\nfrom dataclasses import dataclass\n\n_TOKEN_RE = re.compile(r\"[a-zäöüß]{3,80}\", re.IGNORECASE)\n_STOPWORDS: frozenset[str] = frozenset({\n \"der\", \"die\", \"das\", \"den\", \"dem\", \"des\", \"ein\", \"eine\", \"einen\", \"einem\",\n \"eines\", \"einer\", \"und\", \"oder\", \"aber\", \"doch\", \"sondern\", \"denn\", \"weil\",\n \"als\", \"wenn\", \"dann\", \"dass\", \"daß\", \"ob\", \"weil\", \"während\", \"bevor\",\n \"nachdem\", \"seit\", \"seitdem\", \"bis\", \"damit\", \"falls\", \"sofern\", \"obwohl\",\n \"obgleich\", \"trotzdem\", \"dennoch\", \"jedoch\", \"allerdings\", \"zwar\",\n \"ist\", \"sind\", \"war\", \"waren\", \"wird\", \"werden\", \"wurde\", \"wurden\",\n \"hat\", \"haben\", \"hatte\", \"hatten\", \"sein\", \"bin\", \"bist\", \"habe\", \"hast\",\n \"kann\", \"kannst\", \"könnt\", \"können\", \"konnte\", \"konnten\", \"muss\", \"müssen\",\n \"soll\", \"sollen\", \"sollte\", \"mag\", \"mögen\", \"darf\", \"dürfen\",\n \"ich\", \"du\", \"er\", \"sie\", \"es\", \"wir\", \"ihr\", \"mein\", \"dein\", \"sein\", \"ihr\",\n \"unser\", \"euer\", \"mich\", \"dich\", \"sich\", \"uns\", \"euch\", \"ihn\", \"ihm\",\n \"ihnen\", \"ihre\", \"ihres\", \"ihrer\", \"meine\", \"meiner\", \"deine\",\n \"in\", \"im\", \"an\", \"am\", \"auf\", \"für\", \"fuer\", \"bei\", \"mit\", \"von\", \"vom\",\n \"zu\", \"zum\", \"zur\", \"aus\", \"nach\", \"vor\", \"durch\", \"gegen\", \"ohne\", \"um\",\n \"über\", \"ueber\", \"unter\", \"neben\", \"hinter\", \"zwischen\", \"außer\",\n \"auch\", \"noch\", \"schon\", \"nur\", \"sehr\", \"mehr\", \"hier\", \"dort\", \"wieder\",\n \"nicht\", \"kein\", \"keine\", \"keinen\", \"keinem\", \"alle\", \"alles\", \"jeder\",\n \"jede\", \"jedes\", \"viele\", \"viel\", \"wenig\", \"einige\", \"manche\", \"andere\",\n \"wie\", \"was\", \"wer\", \"wem\", \"wen\", \"wessen\", \"welche\", \"welcher\", \"welches\",\n \"warum\", \"wieso\", \"weshalb\", \"wann\", \"wo\", \"woher\", \"wohin\",\n \"ja\", \"nein\", \"vielleicht\", \"immer\", \"nie\", \"manchmal\", \"oft\", \"selten\",\n \"man\", \"jemand\", \"niemand\", \"etwas\", \"nichts\", \"alles\",\n \"the\", \"and\", \"or\", \"of\", \"to\", \"a\", \"an\", \"in\", \"for\", \"with\", \"on\",\n \"at\", \"by\", \"is\", \"are\", \"was\", \"were\", \"be\", \"been\", \"being\", \"have\",\n \"has\", \"had\", \"do\", \"does\", \"did\", \"will\", \"would\", \"can\", \"could\",\n \"this\", \"that\", \"these\", \"those\", \"it\", \"its\", \"but\", \"not\", \"you\", \"your\",\n})\n_MIN_LEN = 3\n_MAX_LEN = 80\n\n\n@dataclass(frozen=True, slots=True)\nclass TokenFrequency:\n token: str\n count: int\n\n\nclass TextTokenizer:\n def tokenize(self, text: str) -> list[TokenFrequency]:\n counter: Counter[str] = Counter()\n for raw in _TOKEN_RE.findall(text):\n lowered = raw.lower()\n if lowered in _STOPWORDS:\n continue\n if not (_MIN_LEN <= len(lowered) <= _MAX_LEN):\n continue\n counter[lowered] += 1\n return [TokenFrequency(token=t, count=n) for t, n in counter.items()]\n"
}
}
{
"tool_response": {
"type": "create",
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/src\/domain\/services\/text_tokenizer.py",
"content": "\"\"\"Tokenizes body text into bag-of-words for TF-IDF analysis.\n\nRules:\n- lowercase\n- split on non-letter characters (supports äöüß + english)\n- tokens of length 3..80\n- drop German + English stopwords\n- drop pure-digit tokens\n\"\"\"\nfrom __future__ import annotations\n\nimport re\nfrom collections import Counter\nfrom dataclasses import dataclass\n\n_TOKEN_RE = re.compile(r\"[a-zäöüß]{3,80}\", re.IGNORECASE)\n_STOPWORDS: frozenset[str] = frozenset({\n \"der\", \"die\", \"das\", \"den\", \"dem\", \"des\", \"ein\", \"eine\", \"einen\", \"einem\",\n \"eines\", \"einer\", \"und\", \"oder\", \"aber\", \"doch\", \"sondern\", \"denn\", \"weil\",\n \"als\", \"wenn\", \"dann\", \"dass\", \"daß\", \"ob\", \"weil\", \"während\", \"bevor\",\n \"nachdem\", \"seit\", \"seitdem\", \"bis\", \"damit\", \"falls\", \"sofern\", \"obwohl\",\n \"obgleich\", \"trotzdem\", \"dennoch\", \"jedoch\", \"allerdings\", \"zwar\",\n \"ist\", \"sind\", \"war\", \"waren\", \"wird\", \"werden\", \"wurde\", \"wurden\",\n \"hat\", \"haben\", \"hatte\", \"hatten\", \"sein\", \"bin\", \"bist\", \"habe\", \"hast\",\n \"kann\", \"kannst\", \"könnt\", \"können\", \"konnte\", \"konnten\", \"muss\", \"müssen\",\n \"soll\", \"sollen\", \"sollte\", \"mag\", \"mögen\", \"darf\", \"dürfen\",\n \"ich\", \"du\", \"er\", \"sie\", \"es\", \"wir\", \"ihr\", \"mein\", \"dein\", \"sein\", \"ihr\",\n \"unser\", \"euer\", \"mich\", \"dich\", \"sich\", \"uns\", \"euch\", \"ihn\", \"ihm\",\n \"ihnen\", \"ihre\", \"ihres\", \"ihrer\", \"meine\", \"meiner\", \"deine\",\n \"in\", \"im\", \"an\", \"am\", \"auf\", \"für\", \"fuer\", \"bei\", \"mit\", \"von\", \"vom\",\n \"zu\", \"zum\", \"zur\", \"aus\", \"nach\", \"vor\", \"durch\", \"gegen\", \"ohne\", \"um\",\n \"über\", \"ueber\", \"unter\", \"neben\", \"hinter\", \"zwischen\", \"außer\",\n \"auch\", \"noch\", \"schon\", \"nur\", \"sehr\", \"mehr\", \"hier\", \"dort\", \"wieder\",\n \"nicht\", \"kein\", \"keine\", \"keinen\", \"keinem\", \"alle\", \"alles\", \"jeder\",\n \"jede\", \"jedes\", \"viele\", \"viel\", \"wenig\", \"einige\", \"manche\", \"andere\",\n \"wie\", \"was\", \"wer\", \"wem\", \"wen\", \"wessen\", \"welche\", \"welcher\", \"welches\",\n \"warum\", \"wieso\", \"weshalb\", \"wann\", \"wo\", \"woher\", \"wohin\",\n \"ja\", \"nein\", \"vielleicht\", \"immer\", \"nie\", \"manchmal\", \"oft\", \"selten\",\n \"man\", \"jemand\", \"niemand\", \"etwas\", \"nichts\", \"alles\",\n \"the\", \"and\", \"or\", \"of\", \"to\", \"a\", \"an\", \"in\", \"for\", \"with\", \"on\",\n \"at\", \"by\", \"is\", \"are\", \"was\", \"were\", \"be\", \"been\", \"being\", \"have\",\n \"has\", \"had\", \"do\", \"does\", \"did\", \"will\", \"would\", \"can\", \"could\",\n \"this\", \"that\", \"these\", \"those\", \"it\", \"its\", \"but\", \"not\", \"you\", \"your\",\n})\n_MIN_LEN = 3\n_MAX_LEN = 80\n\n\n@dataclass(frozen=True, slots=True)\nclass TokenFrequency:\n token: str\n count: int\n\n\nclass TextTokenizer:\n def tokenize(self, text: str) -> list[TokenFrequency]:\n counter: Counter[str] = Counter()\n for raw in _TOKEN_RE.findall(text):\n lowered = raw.lower()\n if lowered in _STOPWORDS:\n continue\n if not (_MIN_LEN <= len(lowered) <= _MAX_LEN):\n continue\n counter[lowered] += 1\n return [TokenFrequency(token=t, count=n) for t, n in counter.items()]\n",
"structuredPatch": [],
"originalFile": null,
"userModified": false
}
}