{
"event": "PreToolUse",
"tool_name": "Write",
"tool_input": {
"file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/scripts\/tfidf_analysis.py",
"content": "#!\/usr\/bin\/env python3\n\"\"\"Quick TF-IDF analysis from crawler DB.\n\nCorpus pro Seite: title + meta_description + h1_texts + h2_texts.\nKein Body-Text vorhanden → TF-IDF basiert auf SEO-gewichteten Feldern.\n\nUsage: python scripts\/tfidf_analysis.py <crawl_id> [top_n]\n\"\"\"\nfrom __future__ import annotations\n\nimport json\nimport math\nimport re\nimport sys\nfrom collections import Counter\nfrom typing import Any\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\nimport mysql.connector # noqa: E402\nfrom config import DB_CONFIG # noqa: E402\n\n_STOP_DE = {\n \"der\", \"die\", \"das\", \"den\", \"dem\", \"des\", \"und\", \"oder\", \"aber\", \"ist\",\n \"sind\", \"war\", \"waren\", \"wird\", \"werden\", \"wurde\", \"ein\", \"eine\", \"einen\",\n \"einem\", \"eines\", \"einer\", \"im\", \"in\", \"an\", \"auf\", \"zu\", \"zum\", \"zur\",\n \"bei\", \"mit\", \"von\", \"vom\", \"für\", \"fuer\", \"ueber\", \"über\", \"auch\", \"als\",\n \"nicht\", \"kein\", \"keine\", \"es\", \"er\", \"sie\", \"wir\", \"ihr\", \"du\", \"ich\",\n \"man\", \"sich\", \"mein\", \"dein\", \"sein\", \"unser\", \"euer\", \"das\", \"dies\",\n \"so\", \"wie\", \"wenn\", \"dann\", \"noch\", \"schon\", \"nur\", \"sehr\", \"mehr\",\n \"gibt\", \"gibts\", \"haben\", \"hat\", \"wird\", \"kann\", \"kannst\", \"koennen\",\n \"können\", \"dass\", \"weil\", \"durch\", \"ueber\", \"aus\", \"um\", \"nach\", \"vor\",\n \"bis\", \"ohne\", \"gegen\", \"zwischen\", \"unter\", \"am\", \"hier\", \"dort\", \"wer\",\n \"was\", \"welche\", \"welcher\", \"welches\", \"dich\", \"dir\", \"mich\", \"mir\",\n \"uns\", \"euch\", \"ihre\", \"ihm\", \"ihn\", \"die\", \"the\", \"and\", \"or\", \"of\",\n \"to\", \"a\", \"in\", \"for\", \"with\", \"on\", \"at\", \"by\", \"is\", \"are\",\n}\n_TOKEN_RE = re.compile(r\"[a-zäöüß]{3,}\", re.IGNORECASE)\n\n\ndef tokenize(text: str) -> list[str]:\n return [t for t in _TOKEN_RE.findall(text.lower()) if t not in _STOP_DE]\n\n\ndef _corpus_for_page(row: dict[str, Any]) -> str:\n parts: list[str] = []\n for key in (\"title\", \"meta_description\"):\n if row.get(key):\n parts.append(str(row[key]))\n for key in (\"h1_texts\", \"h2_texts\"):\n raw = row.get(key)\n if raw:\n parts.extend(json.loads(raw))\n return \" \".join(parts)\n\n\ndef main() -> int:\n if len(sys.argv) < 2:\n print(\"usage: tfidf_analysis.py <crawl_id> [top_n]\", file=sys.stderr)\n return 2\n crawl_id = int(sys.argv[1])\n top_n = int(sys.argv[2]) if len(sys.argv) > 2 else 20\n\n conn = mysql.connector.connect(\n host=DB_CONFIG[\"host\"], user=DB_CONFIG[\"user\"],\n password=DB_CONFIG[\"password\"], database=\"crawler\",\n )\n cur = conn.cursor(dictionary=True)\n cur.execute(\n \"SELECT u.url, p.title, p.meta_description, p.h1_texts, p.h2_texts\"\n \" FROM pages p JOIN urls u ON u.id=p.url_id\"\n \" WHERE p.crawl_id=%s AND p.http_status=200\",\n (crawl_id,),\n )\n rows = cur.fetchall()\n conn.close()\n\n docs = [(r[\"url\"], tokenize(_corpus_for_page(r))) for r in rows]\n docs = [(u, tokens) for u, tokens in docs if tokens]\n num_docs = len(docs)\n\n df: Counter[str] = Counter()\n for _, tokens in docs:\n df.update(set(tokens))\n\n global_scores: Counter[str] = Counter()\n per_page_top: list[tuple[str, list[tuple[str, float]]]] = []\n for url, tokens in docs:\n tf = Counter(tokens)\n doc_len = len(tokens)\n scores: dict[str, float] = {}\n for term, count in tf.items():\n idf = math.log((num_docs + 1) \/ (df[term] + 1)) + 1\n scores[term] = (count \/ doc_len) * idf\n for term, score in scores.items():\n 
global_scores[term] += score\n top_terms = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:3]\n per_page_top.append((url, top_terms))\n\n print(f\"Dokumente: {num_docs}\")\n print(f\"Unique Terms: {len(df)}\")\n print()\n print(f\"=== Top {top_n} Terms (globale TF-IDF-Summe) ===\")\n for term, score in global_scores.most_common(top_n):\n print(f\" {score:7.3f} df={df[term]:3d} {term}\")\n print()\n print(\"=== Top-3-Terme pro Seite (Auswahl, sortiert nach Top-Score) ===\")\n per_page_top.sort(key=lambda x: x[1][0][1] if x[1] else 0, reverse=True)\n for url, terms in per_page_top[:15]:\n joined = \", \".join(f\"{t}({s:.2f})\" for t, s in terms)\n print(f\" {url}\")\n print(f\" {joined}\")\n return 0\n\n\nif __name__ == \"__main__\":\n sys.exit(main())\n"
  }
}
{
"tool_response": {
"type": "create",
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/scripts\/tfidf_analysis.py",
"content": "#!\/usr\/bin\/env python3\n\"\"\"Quick TF-IDF analysis from crawler DB.\n\nCorpus pro Seite: title + meta_description + h1_texts + h2_texts.\nKein Body-Text vorhanden → TF-IDF basiert auf SEO-gewichteten Feldern.\n\nUsage: python scripts\/tfidf_analysis.py <crawl_id> [top_n]\n\"\"\"\nfrom __future__ import annotations\n\nimport json\nimport math\nimport re\nimport sys\nfrom collections import Counter\nfrom typing import Any\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\nimport mysql.connector # noqa: E402\nfrom config import DB_CONFIG # noqa: E402\n\n_STOP_DE = {\n \"der\", \"die\", \"das\", \"den\", \"dem\", \"des\", \"und\", \"oder\", \"aber\", \"ist\",\n \"sind\", \"war\", \"waren\", \"wird\", \"werden\", \"wurde\", \"ein\", \"eine\", \"einen\",\n \"einem\", \"eines\", \"einer\", \"im\", \"in\", \"an\", \"auf\", \"zu\", \"zum\", \"zur\",\n \"bei\", \"mit\", \"von\", \"vom\", \"für\", \"fuer\", \"ueber\", \"über\", \"auch\", \"als\",\n \"nicht\", \"kein\", \"keine\", \"es\", \"er\", \"sie\", \"wir\", \"ihr\", \"du\", \"ich\",\n \"man\", \"sich\", \"mein\", \"dein\", \"sein\", \"unser\", \"euer\", \"das\", \"dies\",\n \"so\", \"wie\", \"wenn\", \"dann\", \"noch\", \"schon\", \"nur\", \"sehr\", \"mehr\",\n \"gibt\", \"gibts\", \"haben\", \"hat\", \"wird\", \"kann\", \"kannst\", \"koennen\",\n \"können\", \"dass\", \"weil\", \"durch\", \"ueber\", \"aus\", \"um\", \"nach\", \"vor\",\n \"bis\", \"ohne\", \"gegen\", \"zwischen\", \"unter\", \"am\", \"hier\", \"dort\", \"wer\",\n \"was\", \"welche\", \"welcher\", \"welches\", \"dich\", \"dir\", \"mich\", \"mir\",\n \"uns\", \"euch\", \"ihre\", \"ihm\", \"ihn\", \"die\", \"the\", \"and\", \"or\", \"of\",\n \"to\", \"a\", \"in\", \"for\", \"with\", \"on\", \"at\", \"by\", \"is\", \"are\",\n}\n_TOKEN_RE = re.compile(r\"[a-zäöüß]{3,}\", re.IGNORECASE)\n\n\ndef tokenize(text: str) -> list[str]:\n return [t for t in _TOKEN_RE.findall(text.lower()) if t not in _STOP_DE]\n\n\ndef _corpus_for_page(row: dict[str, Any]) -> str:\n parts: list[str] = []\n for key in (\"title\", \"meta_description\"):\n if row.get(key):\n parts.append(str(row[key]))\n for key in (\"h1_texts\", \"h2_texts\"):\n raw = row.get(key)\n if raw:\n parts.extend(json.loads(raw))\n return \" \".join(parts)\n\n\ndef main() -> int:\n if len(sys.argv) < 2:\n print(\"usage: tfidf_analysis.py <crawl_id> [top_n]\", file=sys.stderr)\n return 2\n crawl_id = int(sys.argv[1])\n top_n = int(sys.argv[2]) if len(sys.argv) > 2 else 20\n\n conn = mysql.connector.connect(\n host=DB_CONFIG[\"host\"], user=DB_CONFIG[\"user\"],\n password=DB_CONFIG[\"password\"], database=\"crawler\",\n )\n cur = conn.cursor(dictionary=True)\n cur.execute(\n \"SELECT u.url, p.title, p.meta_description, p.h1_texts, p.h2_texts\"\n \" FROM pages p JOIN urls u ON u.id=p.url_id\"\n \" WHERE p.crawl_id=%s AND p.http_status=200\",\n (crawl_id,),\n )\n rows = cur.fetchall()\n conn.close()\n\n docs = [(r[\"url\"], tokenize(_corpus_for_page(r))) for r in rows]\n docs = [(u, tokens) for u, tokens in docs if tokens]\n num_docs = len(docs)\n\n df: Counter[str] = Counter()\n for _, tokens in docs:\n df.update(set(tokens))\n\n global_scores: Counter[str] = Counter()\n per_page_top: list[tuple[str, list[tuple[str, float]]]] = []\n for url, tokens in docs:\n tf = Counter(tokens)\n doc_len = len(tokens)\n scores: dict[str, float] = {}\n for term, count in tf.items():\n idf = math.log((num_docs + 1) \/ (df[term] + 1)) + 1\n scores[term] = (count \/ doc_len) * idf\n for term, score in scores.items():\n 
global_scores[term] += score\n top_terms = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:3]\n per_page_top.append((url, top_terms))\n\n print(f\"Dokumente: {num_docs}\")\n print(f\"Unique Terms: {len(df)}\")\n print()\n print(f\"=== Top {top_n} Terms (globale TF-IDF-Summe) ===\")\n for term, score in global_scores.most_common(top_n):\n print(f\" {score:7.3f} df={df[term]:3d} {term}\")\n print()\n print(\"=== Top-3-Terme pro Seite (Auswahl, sortiert nach Top-Score) ===\")\n per_page_top.sort(key=lambda x: x[1][0][1] if x[1] else 0, reverse=True)\n for url, terms in per_page_top[:15]:\n joined = \", \".join(f\"{t}({s:.2f})\" for t, s in terms)\n print(f\" {url}\")\n print(f\" {joined}\")\n return 0\n\n\nif __name__ == \"__main__\":\n sys.exit(main())\n",
"structuredPatch": [],
"originalFile": null,
"userModified": false
}
}