Protokoll #34056

ID: 34056
Zeitstempel: 2026-04-21 05:25:07.696270
Client: root
IP: 62.246.34.195
Modell: claude-sonnet-4-20250514
Status: completed
Tokens: 2,170 (Input: 1,080, Output: 1,090)
Dauer: 62 ms
Request-Zeit: 2026-04-21 05:25:07.696270
Response-Zeit: 2026-04-21 05:25:07.757881

Request

{
    "event": "PreToolUse",
    "tool_name": "Write",
    "tool_input": {
        "file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/scripts\/tfidf_leipzig.py",
        "content": "#!\/usr\/bin\/env python3\n\"\"\"TF-IDF with external corpus (Leipzig Wortschatz) as IDF source.\n\nTF = (count on page) \/ (total tokens on page)  — by lemma\nIDF = log(total_corpus_tokens \/ (1 + corpus_freq(lemma)))\nScore = TF * IDF. Rare-in-corpus + frequent-on-page => high score.\n\nUsage: python scripts\/tfidf_leipzig.py <crawl_id> [top_n] [per_page_limit]\n\"\"\"\n\nfrom __future__ import annotations\n\nimport math\nimport sys\nfrom collections import Counter, defaultdict\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\nimport mysql.connector  # noqa: E402\nfrom config import DB_CONFIG  # noqa: E402\n\n_CORPUS = \"leipzig_news_2024\"\n\n\ndef main() -> int:\n    if len(sys.argv) < 2:\n        print(\"usage: tfidf_leipzig.py <crawl_id> [top_n] [per_page_limit]\", file=sys.stderr)\n        return 2\n    crawl_id = int(sys.argv[1])\n    top_n = int(sys.argv[2]) if len(sys.argv) > 2 else 30\n    per_page_limit = int(sys.argv[3]) if len(sys.argv) > 3 else 20\n\n    conn = mysql.connector.connect(\n        host=DB_CONFIG[\"host\"], user=DB_CONFIG[\"user\"],\n        password=DB_CONFIG[\"password\"], database=\"crawler\",\n    )\n    cur = conn.cursor(dictionary=True)\n    cur.execute(\"SELECT SUM(frequency) AS n FROM reference_corpus_de WHERE corpus=%s\", (_CORPUS,))\n    row = cur.fetchone()\n    total_corpus = int(row[\"n\"]) if row and row[\"n\"] else 0\n    if total_corpus == 0:\n        print(\"reference corpus is empty\")\n        return 1\n\n    cur.execute(\"SELECT token, frequency FROM reference_corpus_de WHERE corpus=%s\", (_CORPUS,))\n    corpus_freq = {r[\"token\"]: int(r[\"frequency\"]) for r in cur.fetchall()}\n\n    cur.execute(\n        \"SELECT p.id AS page_id, u.url, pt.lemma, pt.token, pt.tf\"\n        \" FROM page_tokens pt JOIN pages p ON p.id=pt.page_id\"\n        \" JOIN urls u ON u.id=p.url_id\"\n        \" WHERE p.crawl_id=%s AND p.http_status=200 AND pt.lemma IS NOT NULL\",\n        (crawl_id,),\n    )\n    rows = cur.fetchall()\n    conn.close()\n\n    url_by_page: dict[int, str] = {}\n    tf_by_page: dict[int, Counter[str]] = defaultdict(Counter)\n    for r in rows:\n        pid = int(r[\"page_id\"])\n        url_by_page[pid] = str(r[\"url\"])\n        tf_by_page[pid][str(r[\"lemma\"])] += int(r[\"tf\"])\n\n    def idf(term: str) -> float:\n        freq = corpus_freq.get(term, 0)\n        return math.log(total_corpus \/ (1 + freq))\n\n    global_scores: Counter[str] = Counter()\n    per_page: list[tuple[str, list[tuple[str, float]]]] = []\n    oov_total = 0\n    for page_id, tf in tf_by_page.items():\n        doc_len = sum(tf.values())\n        scores: dict[str, float] = {}\n        for lemma, count in tf.items():\n            if lemma not in corpus_freq:\n                oov_total += count\n            scores[lemma] = (count \/ doc_len) * idf(lemma)\n        for lemma, score in scores.items():\n            global_scores[lemma] += score\n        top_terms = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:6]\n        per_page.append((url_by_page[page_id], top_terms))\n\n    print(f\"Dokumente: {len(tf_by_page)}\")\n    print(f\"Unique Lemmas (Seite):   {len({lemma for tf in tf_by_page.values() for lemma in tf})}\")\n    print(f\"Corpus-Terms (Leipzig):  {len(corpus_freq)}\")\n    print(f\"Corpus-Tokens (Leipzig): {total_corpus}\")\n    print(f\"OOV-Token-Vorkommen:     {oov_total}  (nicht im Corpus → max IDF)\")\n    print()\n    print(f\"=== Top {top_n} Lemmas (Σ TF·IDF über alle Seiten) ===\")\n    for lemma, score in global_scores.most_common(top_n):\n        cf = corpus_freq.get(lemma, 0)\n        marker = \" [OOV]\" if cf == 0 else \"\"\n        print(f\"  {score:8.3f}  corpus_freq={cf:>7}  {lemma}{marker}\")\n    print()\n    print(f\"=== Top-6-Lemmas pro Seite (Auswahl, nach Top-Score, erste {per_page_limit}) ===\")\n    per_page.sort(key=lambda x: x[1][0][1] if x[1] else 0, reverse=True)\n    for url, terms in per_page[:per_page_limit]:\n        parts = [f\"{t}({s:.2f})\" for t, s in terms]\n        print(f\"  {url}\")\n        print(f\"      {', '.join(parts)}\")\n    return 0\n\n\nif __name__ == \"__main__\":\n    sys.exit(main())\n"
    }
}

Response

{
    "tool_response": {
        "type": "create",
        "filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/scripts\/tfidf_leipzig.py",
        "content": "#!\/usr\/bin\/env python3\n\"\"\"TF-IDF with external corpus (Leipzig Wortschatz) as IDF source.\n\nTF = (count on page) \/ (total tokens on page)  — by lemma\nIDF = log(total_corpus_tokens \/ (1 + corpus_freq(lemma)))\nScore = TF * IDF. Rare-in-corpus + frequent-on-page => high score.\n\nUsage: python scripts\/tfidf_leipzig.py <crawl_id> [top_n] [per_page_limit]\n\"\"\"\n\nfrom __future__ import annotations\n\nimport math\nimport sys\nfrom collections import Counter, defaultdict\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\nimport mysql.connector  # noqa: E402\nfrom config import DB_CONFIG  # noqa: E402\n\n_CORPUS = \"leipzig_news_2024\"\n\n\ndef main() -> int:\n    if len(sys.argv) < 2:\n        print(\"usage: tfidf_leipzig.py <crawl_id> [top_n] [per_page_limit]\", file=sys.stderr)\n        return 2\n    crawl_id = int(sys.argv[1])\n    top_n = int(sys.argv[2]) if len(sys.argv) > 2 else 30\n    per_page_limit = int(sys.argv[3]) if len(sys.argv) > 3 else 20\n\n    conn = mysql.connector.connect(\n        host=DB_CONFIG[\"host\"], user=DB_CONFIG[\"user\"],\n        password=DB_CONFIG[\"password\"], database=\"crawler\",\n    )\n    cur = conn.cursor(dictionary=True)\n    cur.execute(\"SELECT SUM(frequency) AS n FROM reference_corpus_de WHERE corpus=%s\", (_CORPUS,))\n    row = cur.fetchone()\n    total_corpus = int(row[\"n\"]) if row and row[\"n\"] else 0\n    if total_corpus == 0:\n        print(\"reference corpus is empty\")\n        return 1\n\n    cur.execute(\"SELECT token, frequency FROM reference_corpus_de WHERE corpus=%s\", (_CORPUS,))\n    corpus_freq = {r[\"token\"]: int(r[\"frequency\"]) for r in cur.fetchall()}\n\n    cur.execute(\n        \"SELECT p.id AS page_id, u.url, pt.lemma, pt.token, pt.tf\"\n        \" FROM page_tokens pt JOIN pages p ON p.id=pt.page_id\"\n        \" JOIN urls u ON u.id=p.url_id\"\n        \" WHERE p.crawl_id=%s AND p.http_status=200 AND pt.lemma IS NOT NULL\",\n        (crawl_id,),\n    )\n    rows = cur.fetchall()\n    conn.close()\n\n    url_by_page: dict[int, str] = {}\n    tf_by_page: dict[int, Counter[str]] = defaultdict(Counter)\n    for r in rows:\n        pid = int(r[\"page_id\"])\n        url_by_page[pid] = str(r[\"url\"])\n        tf_by_page[pid][str(r[\"lemma\"])] += int(r[\"tf\"])\n\n    def idf(term: str) -> float:\n        freq = corpus_freq.get(term, 0)\n        return math.log(total_corpus \/ (1 + freq))\n\n    global_scores: Counter[str] = Counter()\n    per_page: list[tuple[str, list[tuple[str, float]]]] = []\n    oov_total = 0\n    for page_id, tf in tf_by_page.items():\n        doc_len = sum(tf.values())\n        scores: dict[str, float] = {}\n        for lemma, count in tf.items():\n            if lemma not in corpus_freq:\n                oov_total += count\n            scores[lemma] = (count \/ doc_len) * idf(lemma)\n        for lemma, score in scores.items():\n            global_scores[lemma] += score\n        top_terms = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:6]\n        per_page.append((url_by_page[page_id], top_terms))\n\n    print(f\"Dokumente: {len(tf_by_page)}\")\n    print(f\"Unique Lemmas (Seite):   {len({lemma for tf in tf_by_page.values() for lemma in tf})}\")\n    print(f\"Corpus-Terms (Leipzig):  {len(corpus_freq)}\")\n    print(f\"Corpus-Tokens (Leipzig): {total_corpus}\")\n    print(f\"OOV-Token-Vorkommen:     {oov_total}  (nicht im Corpus → max IDF)\")\n    print()\n    print(f\"=== Top {top_n} Lemmas (Σ TF·IDF über alle Seiten) ===\")\n    for lemma, score in global_scores.most_common(top_n):\n        cf = corpus_freq.get(lemma, 0)\n        marker = \" [OOV]\" if cf == 0 else \"\"\n        print(f\"  {score:8.3f}  corpus_freq={cf:>7}  {lemma}{marker}\")\n    print()\n    print(f\"=== Top-6-Lemmas pro Seite (Auswahl, nach Top-Score, erste {per_page_limit}) ===\")\n    per_page.sort(key=lambda x: x[1][0][1] if x[1] else 0, reverse=True)\n    for url, terms in per_page[:per_page_limit]:\n        parts = [f\"{t}({s:.2f})\" for t, s in terms]\n        print(f\"  {url}\")\n        print(f\"      {', '.join(parts)}\")\n    return 0\n\n\nif __name__ == \"__main__\":\n    sys.exit(main())\n",
        "structuredPatch": [],
        "originalFile": null,
        "userModified": false
    }
}
← Vorheriger | Zur Liste | Nächster →