{
"event": "PreToolUse",
"tool_name": "Write",
"tool_input": {
"file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/scripts\/tfidf_leipzig.py",
"content": "#!\/usr\/bin\/env python3\n\"\"\"TF-IDF with external corpus (Leipzig Wortschatz) as IDF source.\n\nTF = (count on page) \/ (total tokens on page) — by lemma\nIDF = log(total_corpus_tokens \/ (1 + corpus_freq(lemma)))\nScore = TF * IDF. Rare-in-corpus + frequent-on-page => high score.\n\nUsage: python scripts\/tfidf_leipzig.py <crawl_id> [top_n] [per_page_limit]\n\"\"\"\n\nfrom __future__ import annotations\n\nimport math\nimport sys\nfrom collections import Counter, defaultdict\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\nimport mysql.connector # noqa: E402\nfrom config import DB_CONFIG # noqa: E402\n\n_CORPUS = \"leipzig_news_2024\"\n\n\ndef main() -> int:\n if len(sys.argv) < 2:\n print(\"usage: tfidf_leipzig.py <crawl_id> [top_n] [per_page_limit]\", file=sys.stderr)\n return 2\n crawl_id = int(sys.argv[1])\n top_n = int(sys.argv[2]) if len(sys.argv) > 2 else 30\n per_page_limit = int(sys.argv[3]) if len(sys.argv) > 3 else 20\n\n conn = mysql.connector.connect(\n host=DB_CONFIG[\"host\"], user=DB_CONFIG[\"user\"],\n password=DB_CONFIG[\"password\"], database=\"crawler\",\n )\n cur = conn.cursor(dictionary=True)\n cur.execute(\"SELECT SUM(frequency) AS n FROM reference_corpus_de WHERE corpus=%s\", (_CORPUS,))\n row = cur.fetchone()\n total_corpus = int(row[\"n\"]) if row and row[\"n\"] else 0\n if total_corpus == 0:\n print(\"reference corpus is empty\")\n return 1\n\n cur.execute(\"SELECT token, frequency FROM reference_corpus_de WHERE corpus=%s\", (_CORPUS,))\n corpus_freq = {r[\"token\"]: int(r[\"frequency\"]) for r in cur.fetchall()}\n\n cur.execute(\n \"SELECT p.id AS page_id, u.url, pt.lemma, pt.token, pt.tf\"\n \" FROM page_tokens pt JOIN pages p ON p.id=pt.page_id\"\n \" JOIN urls u ON u.id=p.url_id\"\n \" WHERE p.crawl_id=%s AND p.http_status=200 AND pt.lemma IS NOT NULL\",\n (crawl_id,),\n )\n rows = cur.fetchall()\n conn.close()\n\n url_by_page: dict[int, str] = {}\n tf_by_page: dict[int, Counter[str]] = defaultdict(Counter)\n for r in rows:\n pid = int(r[\"page_id\"])\n url_by_page[pid] = str(r[\"url\"])\n tf_by_page[pid][str(r[\"lemma\"])] += int(r[\"tf\"])\n\n def idf(term: str) -> float:\n freq = corpus_freq.get(term, 0)\n return math.log(total_corpus \/ (1 + freq))\n\n global_scores: Counter[str] = Counter()\n per_page: list[tuple[str, list[tuple[str, float]]]] = []\n oov_total = 0\n for page_id, tf in tf_by_page.items():\n doc_len = sum(tf.values())\n scores: dict[str, float] = {}\n for lemma, count in tf.items():\n if lemma not in corpus_freq:\n oov_total += count\n scores[lemma] = (count \/ doc_len) * idf(lemma)\n for lemma, score in scores.items():\n global_scores[lemma] += score\n top_terms = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:6]\n per_page.append((url_by_page[page_id], top_terms))\n\n print(f\"Dokumente: {len(tf_by_page)}\")\n print(f\"Unique Lemmas (Seite): {len({lemma for tf in tf_by_page.values() for lemma in tf})}\")\n print(f\"Corpus-Terms (Leipzig): {len(corpus_freq)}\")\n print(f\"Corpus-Tokens (Leipzig): {total_corpus}\")\n print(f\"OOV-Token-Vorkommen: {oov_total} (nicht im Corpus → max IDF)\")\n print()\n print(f\"=== Top {top_n} Lemmas (Σ TF·IDF über alle Seiten) ===\")\n for lemma, score in global_scores.most_common(top_n):\n cf = corpus_freq.get(lemma, 0)\n marker = \" [OOV]\" if cf == 0 else \"\"\n print(f\" {score:8.3f} corpus_freq={cf:>7} {lemma}{marker}\")\n print()\n print(f\"=== Top-6-Lemmas pro Seite (Auswahl, nach Top-Score, erste {per_page_limit}) ===\")\n per_page.sort(key=lambda x: x[1][0][1] if x[1] else 0, reverse=True)\n for url, terms in per_page[:per_page_limit]:\n parts = [f\"{t}({s:.2f})\" for t, s in terms]\n print(f\" {url}\")\n print(f\" {', '.join(parts)}\")\n return 0\n\n\nif __name__ == \"__main__\":\n sys.exit(main())\n"
}
}
{
"tool_response": {
"type": "create",
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/scripts\/tfidf_leipzig.py",
"content": "#!\/usr\/bin\/env python3\n\"\"\"TF-IDF with external corpus (Leipzig Wortschatz) as IDF source.\n\nTF = (count on page) \/ (total tokens on page) — by lemma\nIDF = log(total_corpus_tokens \/ (1 + corpus_freq(lemma)))\nScore = TF * IDF. Rare-in-corpus + frequent-on-page => high score.\n\nUsage: python scripts\/tfidf_leipzig.py <crawl_id> [top_n] [per_page_limit]\n\"\"\"\n\nfrom __future__ import annotations\n\nimport math\nimport sys\nfrom collections import Counter, defaultdict\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\nimport mysql.connector # noqa: E402\nfrom config import DB_CONFIG # noqa: E402\n\n_CORPUS = \"leipzig_news_2024\"\n\n\ndef main() -> int:\n if len(sys.argv) < 2:\n print(\"usage: tfidf_leipzig.py <crawl_id> [top_n] [per_page_limit]\", file=sys.stderr)\n return 2\n crawl_id = int(sys.argv[1])\n top_n = int(sys.argv[2]) if len(sys.argv) > 2 else 30\n per_page_limit = int(sys.argv[3]) if len(sys.argv) > 3 else 20\n\n conn = mysql.connector.connect(\n host=DB_CONFIG[\"host\"], user=DB_CONFIG[\"user\"],\n password=DB_CONFIG[\"password\"], database=\"crawler\",\n )\n cur = conn.cursor(dictionary=True)\n cur.execute(\"SELECT SUM(frequency) AS n FROM reference_corpus_de WHERE corpus=%s\", (_CORPUS,))\n row = cur.fetchone()\n total_corpus = int(row[\"n\"]) if row and row[\"n\"] else 0\n if total_corpus == 0:\n print(\"reference corpus is empty\")\n return 1\n\n cur.execute(\"SELECT token, frequency FROM reference_corpus_de WHERE corpus=%s\", (_CORPUS,))\n corpus_freq = {r[\"token\"]: int(r[\"frequency\"]) for r in cur.fetchall()}\n\n cur.execute(\n \"SELECT p.id AS page_id, u.url, pt.lemma, pt.token, pt.tf\"\n \" FROM page_tokens pt JOIN pages p ON p.id=pt.page_id\"\n \" JOIN urls u ON u.id=p.url_id\"\n \" WHERE p.crawl_id=%s AND p.http_status=200 AND pt.lemma IS NOT NULL\",\n (crawl_id,),\n )\n rows = cur.fetchall()\n conn.close()\n\n url_by_page: dict[int, str] = {}\n tf_by_page: dict[int, Counter[str]] = defaultdict(Counter)\n for r in rows:\n pid = int(r[\"page_id\"])\n url_by_page[pid] = str(r[\"url\"])\n tf_by_page[pid][str(r[\"lemma\"])] += int(r[\"tf\"])\n\n def idf(term: str) -> float:\n freq = corpus_freq.get(term, 0)\n return math.log(total_corpus \/ (1 + freq))\n\n global_scores: Counter[str] = Counter()\n per_page: list[tuple[str, list[tuple[str, float]]]] = []\n oov_total = 0\n for page_id, tf in tf_by_page.items():\n doc_len = sum(tf.values())\n scores: dict[str, float] = {}\n for lemma, count in tf.items():\n if lemma not in corpus_freq:\n oov_total += count\n scores[lemma] = (count \/ doc_len) * idf(lemma)\n for lemma, score in scores.items():\n global_scores[lemma] += score\n top_terms = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:6]\n per_page.append((url_by_page[page_id], top_terms))\n\n print(f\"Dokumente: {len(tf_by_page)}\")\n print(f\"Unique Lemmas (Seite): {len({lemma for tf in tf_by_page.values() for lemma in tf})}\")\n print(f\"Corpus-Terms (Leipzig): {len(corpus_freq)}\")\n print(f\"Corpus-Tokens (Leipzig): {total_corpus}\")\n print(f\"OOV-Token-Vorkommen: {oov_total} (nicht im Corpus → max IDF)\")\n print()\n print(f\"=== Top {top_n} Lemmas (Σ TF·IDF über alle Seiten) ===\")\n for lemma, score in global_scores.most_common(top_n):\n cf = corpus_freq.get(lemma, 0)\n marker = \" [OOV]\" if cf == 0 else \"\"\n print(f\" {score:8.3f} corpus_freq={cf:>7} {lemma}{marker}\")\n print()\n print(f\"=== Top-6-Lemmas pro Seite (Auswahl, nach Top-Score, erste {per_page_limit}) ===\")\n per_page.sort(key=lambda x: x[1][0][1] if x[1] else 0, reverse=True)\n for url, terms in per_page[:per_page_limit]:\n parts = [f\"{t}({s:.2f})\" for t, s in terms]\n print(f\" {url}\")\n print(f\" {', '.join(parts)}\")\n return 0\n\n\nif __name__ == \"__main__\":\n sys.exit(main())\n",
"structuredPatch": [],
"originalFile": null,
"userModified": false
}
}