{
"event": "PreToolUse",
"tool_name": "Write",
"tool_input": {
"file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/scripts\/tfidf_from_tokens.py",
"content": "#!\/usr\/bin\/env python3\n\"\"\"TF-IDF analysis from persisted page_tokens (full body text).\n\nUsage: python scripts\/tfidf_from_tokens.py <crawl_id> [top_n] [per_page_limit]\n\"\"\"\n\nfrom __future__ import annotations\n\nimport math\nimport sys\nfrom collections import Counter, defaultdict\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\nimport mysql.connector # noqa: E402\nfrom config import DB_CONFIG # noqa: E402\n\n\ndef main() -> int:\n if len(sys.argv) < 2:\n print(\"usage: tfidf_from_tokens.py <crawl_id> [top_n] [per_page_limit]\", file=sys.stderr)\n return 2\n crawl_id = int(sys.argv[1])\n top_n = int(sys.argv[2]) if len(sys.argv) > 2 else 30\n per_page_limit = int(sys.argv[3]) if len(sys.argv) > 3 else 15\n\n conn = mysql.connector.connect(\n host=DB_CONFIG[\"host\"], user=DB_CONFIG[\"user\"],\n password=DB_CONFIG[\"password\"], database=\"crawler\",\n )\n cur = conn.cursor(dictionary=True)\n cur.execute(\n \"SELECT p.id AS page_id, u.url, pt.token, pt.tf\"\n \" FROM page_tokens pt\"\n \" JOIN pages p ON p.id = pt.page_id\"\n \" JOIN urls u ON u.id = p.url_id\"\n \" WHERE p.crawl_id=%s AND p.http_status=200\",\n (crawl_id,),\n )\n rows = cur.fetchall()\n conn.close()\n if not rows:\n print(\"no tokens found\")\n return 1\n\n url_by_page: dict[int, str] = {}\n tf_by_page: dict[int, dict[str, int]] = defaultdict(dict)\n for r in rows:\n pid = int(r[\"page_id\"])\n url_by_page[pid] = str(r[\"url\"])\n tf_by_page[pid][str(r[\"token\"])] = int(r[\"tf\"])\n\n num_docs = len(tf_by_page)\n df: Counter[str] = Counter()\n for tokens in tf_by_page.values():\n df.update(tokens.keys())\n\n global_scores: Counter[str] = Counter()\n per_page: list[tuple[str, list[tuple[str, float]]]] = []\n for page_id, tokens in tf_by_page.items():\n doc_len = sum(tokens.values())\n scores: dict[str, float] = {}\n for term, tf in tokens.items():\n idf = math.log((num_docs + 1) \/ (df[term] + 1)) + 1\n scores[term] = (tf \/ doc_len) * idf\n for term, score in scores.items():\n global_scores[term] += score\n top_terms = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:5]\n per_page.append((url_by_page[page_id], top_terms))\n\n print(f\"Dokumente: {num_docs}\")\n print(f\"Unique Terms: {len(df)}\")\n print(f\"Token-Vorkommen gesamt: {sum(sum(t.values()) for t in tf_by_page.values())}\")\n print()\n print(f\"=== Top {top_n} Terms (globale TF-IDF-Summe) ===\")\n for term, score in global_scores.most_common(top_n):\n print(f\" {score:8.3f} df={df[term]:3d} {term}\")\n print()\n print(f\"=== Top-5-Terms pro Seite (erste {per_page_limit} nach Top-Score) ===\")\n per_page.sort(key=lambda x: x[1][0][1] if x[1] else 0, reverse=True)\n for url, terms in per_page[:per_page_limit]:\n joined = \", \".join(f\"{t}({s:.3f})\" for t, s in terms)\n print(f\" {url}\")\n print(f\" {joined}\")\n return 0\n\n\nif __name__ == \"__main__\":\n sys.exit(main())\n"
}
}
{
"tool_response": {
"type": "create",
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/scripts\/tfidf_from_tokens.py",
"content": "#!\/usr\/bin\/env python3\n\"\"\"TF-IDF analysis from persisted page_tokens (full body text).\n\nUsage: python scripts\/tfidf_from_tokens.py <crawl_id> [top_n] [per_page_limit]\n\"\"\"\n\nfrom __future__ import annotations\n\nimport math\nimport sys\nfrom collections import Counter, defaultdict\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\nimport mysql.connector # noqa: E402\nfrom config import DB_CONFIG # noqa: E402\n\n\ndef main() -> int:\n if len(sys.argv) < 2:\n print(\"usage: tfidf_from_tokens.py <crawl_id> [top_n] [per_page_limit]\", file=sys.stderr)\n return 2\n crawl_id = int(sys.argv[1])\n top_n = int(sys.argv[2]) if len(sys.argv) > 2 else 30\n per_page_limit = int(sys.argv[3]) if len(sys.argv) > 3 else 15\n\n conn = mysql.connector.connect(\n host=DB_CONFIG[\"host\"], user=DB_CONFIG[\"user\"],\n password=DB_CONFIG[\"password\"], database=\"crawler\",\n )\n cur = conn.cursor(dictionary=True)\n cur.execute(\n \"SELECT p.id AS page_id, u.url, pt.token, pt.tf\"\n \" FROM page_tokens pt\"\n \" JOIN pages p ON p.id = pt.page_id\"\n \" JOIN urls u ON u.id = p.url_id\"\n \" WHERE p.crawl_id=%s AND p.http_status=200\",\n (crawl_id,),\n )\n rows = cur.fetchall()\n conn.close()\n if not rows:\n print(\"no tokens found\")\n return 1\n\n url_by_page: dict[int, str] = {}\n tf_by_page: dict[int, dict[str, int]] = defaultdict(dict)\n for r in rows:\n pid = int(r[\"page_id\"])\n url_by_page[pid] = str(r[\"url\"])\n tf_by_page[pid][str(r[\"token\"])] = int(r[\"tf\"])\n\n num_docs = len(tf_by_page)\n df: Counter[str] = Counter()\n for tokens in tf_by_page.values():\n df.update(tokens.keys())\n\n global_scores: Counter[str] = Counter()\n per_page: list[tuple[str, list[tuple[str, float]]]] = []\n for page_id, tokens in tf_by_page.items():\n doc_len = sum(tokens.values())\n scores: dict[str, float] = {}\n for term, tf in tokens.items():\n idf = math.log((num_docs + 1) \/ (df[term] + 1)) + 1\n scores[term] = (tf \/ doc_len) * idf\n for term, score in scores.items():\n global_scores[term] += score\n top_terms = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:5]\n per_page.append((url_by_page[page_id], top_terms))\n\n print(f\"Dokumente: {num_docs}\")\n print(f\"Unique Terms: {len(df)}\")\n print(f\"Token-Vorkommen gesamt: {sum(sum(t.values()) for t in tf_by_page.values())}\")\n print()\n print(f\"=== Top {top_n} Terms (globale TF-IDF-Summe) ===\")\n for term, score in global_scores.most_common(top_n):\n print(f\" {score:8.3f} df={df[term]:3d} {term}\")\n print()\n print(f\"=== Top-5-Terms pro Seite (erste {per_page_limit} nach Top-Score) ===\")\n per_page.sort(key=lambda x: x[1][0][1] if x[1] else 0, reverse=True)\n for url, terms in per_page[:per_page_limit]:\n joined = \", \".join(f\"{t}({s:.3f})\" for t, s in terms)\n print(f\" {url}\")\n print(f\" {joined}\")\n return 0\n\n\nif __name__ == \"__main__\":\n sys.exit(main())\n",
"structuredPatch": [],
"originalFile": null,
"userModified": false
}
}