Log Entry #34014

ID: 34014
Timestamp: 2026-04-21 04:59:30.844972
Client: root
IP: 62.246.34.195
Model: claude-sonnet-4-20250514
Status: completed
Tokens: 1,696 (Input: 843, Output: 853)
Duration: 67 ms
Request time: 2026-04-21 04:59:30.844972
Response time: 2026-04-21 04:59:30.911841
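
The logged duration is consistent with the timestamps: 04:59:30.911841 minus 04:59:30.844972 is about 66.9 ms, which rounds to the reported 67 ms.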

Request

{
    "event": "PreToolUse",
    "tool_name": "Write",
    "tool_input": {
        "file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/scripts\/tfidf_from_tokens.py",
        "content": "#!\/usr\/bin\/env python3\n\"\"\"TF-IDF analysis from persisted page_tokens (full body text).\n\nUsage: python scripts\/tfidf_from_tokens.py <crawl_id> [top_n] [per_page_limit]\n\"\"\"\n\nfrom __future__ import annotations\n\nimport math\nimport sys\nfrom collections import Counter, defaultdict\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\nimport mysql.connector  # noqa: E402\nfrom config import DB_CONFIG  # noqa: E402\n\n\ndef main() -> int:\n    if len(sys.argv) < 2:\n        print(\"usage: tfidf_from_tokens.py <crawl_id> [top_n] [per_page_limit]\", file=sys.stderr)\n        return 2\n    crawl_id = int(sys.argv[1])\n    top_n = int(sys.argv[2]) if len(sys.argv) > 2 else 30\n    per_page_limit = int(sys.argv[3]) if len(sys.argv) > 3 else 15\n\n    conn = mysql.connector.connect(\n        host=DB_CONFIG[\"host\"], user=DB_CONFIG[\"user\"],\n        password=DB_CONFIG[\"password\"], database=\"crawler\",\n    )\n    cur = conn.cursor(dictionary=True)\n    cur.execute(\n        \"SELECT p.id AS page_id, u.url, pt.token, pt.tf\"\n        \" FROM page_tokens pt\"\n        \" JOIN pages p ON p.id = pt.page_id\"\n        \" JOIN urls u ON u.id = p.url_id\"\n        \" WHERE p.crawl_id=%s AND p.http_status=200\",\n        (crawl_id,),\n    )\n    rows = cur.fetchall()\n    conn.close()\n    if not rows:\n        print(\"no tokens found\")\n        return 1\n\n    url_by_page: dict[int, str] = {}\n    tf_by_page: dict[int, dict[str, int]] = defaultdict(dict)\n    for r in rows:\n        pid = int(r[\"page_id\"])\n        url_by_page[pid] = str(r[\"url\"])\n        tf_by_page[pid][str(r[\"token\"])] = int(r[\"tf\"])\n\n    num_docs = len(tf_by_page)\n    df: Counter[str] = Counter()\n    for tokens in tf_by_page.values():\n        df.update(tokens.keys())\n\n    global_scores: Counter[str] = Counter()\n    per_page: list[tuple[str, list[tuple[str, float]]]] = []\n    for page_id, tokens in tf_by_page.items():\n        doc_len = sum(tokens.values())\n        scores: dict[str, float] = {}\n        for term, tf in tokens.items():\n            idf = math.log((num_docs + 1) \/ (df[term] + 1)) + 1\n            scores[term] = (tf \/ doc_len) * idf\n        for term, score in scores.items():\n            global_scores[term] += score\n        top_terms = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:5]\n        per_page.append((url_by_page[page_id], top_terms))\n\n    print(f\"Dokumente: {num_docs}\")\n    print(f\"Unique Terms: {len(df)}\")\n    print(f\"Token-Vorkommen gesamt: {sum(sum(t.values()) for t in tf_by_page.values())}\")\n    print()\n    print(f\"=== Top {top_n} Terms (globale TF-IDF-Summe) ===\")\n    for term, score in global_scores.most_common(top_n):\n        print(f\"  {score:8.3f}  df={df[term]:3d}  {term}\")\n    print()\n    print(f\"=== Top-5-Terms pro Seite (erste {per_page_limit} nach Top-Score) ===\")\n    per_page.sort(key=lambda x: x[1][0][1] if x[1] else 0, reverse=True)\n    for url, terms in per_page[:per_page_limit]:\n        joined = \", \".join(f\"{t}({s:.3f})\" for t, s in terms)\n        print(f\"  {url}\")\n        print(f\"      {joined}\")\n    return 0\n\n\nif __name__ == \"__main__\":\n    sys.exit(main())\n"
    }
}
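
The script being written scores terms with the smoothed TF-IDF variant idf = log((N + 1) / (df + 1)) + 1, with term frequency normalized by document length; this is the same smoothing scikit-learn's TfidfTransformer applies with smooth_idf=True. A minimal self-contained sketch of that formula on a toy in-memory corpus (the page IDs and tokens below are made up for illustration, standing in for rows from the page_tokens table):

import math
from collections import Counter

# Toy stand-in for the per-page token counts the script loads
# from the page_tokens table: page_id -> {token: raw frequency}.
tf_by_page = {
    1: {"crawler": 4, "token": 2, "seite": 1},
    2: {"crawler": 1, "analyse": 3},
    3: {"analyse": 2, "seite": 2, "token": 1},
}

num_docs = len(tf_by_page)
df = Counter()                      # document frequency per term
for tokens in tf_by_page.values():
    df.update(tokens.keys())

for page_id, tokens in tf_by_page.items():
    doc_len = sum(tokens.values())  # total token occurrences on the page
    for term, tf in tokens.items():
        # The +1 smoothing keeps the IDF positive even for terms that
        # appear in every document, so nothing scores exactly zero.
        idf = math.log((num_docs + 1) / (df[term] + 1)) + 1
        score = (tf / doc_len) * idf
        print(f"page {page_id}  {term:10s} tf-idf={score:.3f}")

The length normalization (tf / doc_len) keeps long pages from dominating the global ranking, which the logged script builds by summing each term's per-page scores into a Counter.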

Response

{
    "tool_response": {
        "type": "create",
        "filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/scripts\/tfidf_from_tokens.py",
        "content": "#!\/usr\/bin\/env python3\n\"\"\"TF-IDF analysis from persisted page_tokens (full body text).\n\nUsage: python scripts\/tfidf_from_tokens.py <crawl_id> [top_n] [per_page_limit]\n\"\"\"\n\nfrom __future__ import annotations\n\nimport math\nimport sys\nfrom collections import Counter, defaultdict\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\nimport mysql.connector  # noqa: E402\nfrom config import DB_CONFIG  # noqa: E402\n\n\ndef main() -> int:\n    if len(sys.argv) < 2:\n        print(\"usage: tfidf_from_tokens.py <crawl_id> [top_n] [per_page_limit]\", file=sys.stderr)\n        return 2\n    crawl_id = int(sys.argv[1])\n    top_n = int(sys.argv[2]) if len(sys.argv) > 2 else 30\n    per_page_limit = int(sys.argv[3]) if len(sys.argv) > 3 else 15\n\n    conn = mysql.connector.connect(\n        host=DB_CONFIG[\"host\"], user=DB_CONFIG[\"user\"],\n        password=DB_CONFIG[\"password\"], database=\"crawler\",\n    )\n    cur = conn.cursor(dictionary=True)\n    cur.execute(\n        \"SELECT p.id AS page_id, u.url, pt.token, pt.tf\"\n        \" FROM page_tokens pt\"\n        \" JOIN pages p ON p.id = pt.page_id\"\n        \" JOIN urls u ON u.id = p.url_id\"\n        \" WHERE p.crawl_id=%s AND p.http_status=200\",\n        (crawl_id,),\n    )\n    rows = cur.fetchall()\n    conn.close()\n    if not rows:\n        print(\"no tokens found\")\n        return 1\n\n    url_by_page: dict[int, str] = {}\n    tf_by_page: dict[int, dict[str, int]] = defaultdict(dict)\n    for r in rows:\n        pid = int(r[\"page_id\"])\n        url_by_page[pid] = str(r[\"url\"])\n        tf_by_page[pid][str(r[\"token\"])] = int(r[\"tf\"])\n\n    num_docs = len(tf_by_page)\n    df: Counter[str] = Counter()\n    for tokens in tf_by_page.values():\n        df.update(tokens.keys())\n\n    global_scores: Counter[str] = Counter()\n    per_page: list[tuple[str, list[tuple[str, float]]]] = []\n    for page_id, tokens in tf_by_page.items():\n        doc_len = sum(tokens.values())\n        scores: dict[str, float] = {}\n        for term, tf in tokens.items():\n            idf = math.log((num_docs + 1) \/ (df[term] + 1)) + 1\n            scores[term] = (tf \/ doc_len) * idf\n        for term, score in scores.items():\n            global_scores[term] += score\n        top_terms = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:5]\n        per_page.append((url_by_page[page_id], top_terms))\n\n    print(f\"Dokumente: {num_docs}\")\n    print(f\"Unique Terms: {len(df)}\")\n    print(f\"Token-Vorkommen gesamt: {sum(sum(t.values()) for t in tf_by_page.values())}\")\n    print()\n    print(f\"=== Top {top_n} Terms (globale TF-IDF-Summe) ===\")\n    for term, score in global_scores.most_common(top_n):\n        print(f\"  {score:8.3f}  df={df[term]:3d}  {term}\")\n    print()\n    print(f\"=== Top-5-Terms pro Seite (erste {per_page_limit} nach Top-Score) ===\")\n    per_page.sort(key=lambda x: x[1][0][1] if x[1] else 0, reverse=True)\n    for url, terms in per_page[:per_page_limit]:\n        joined = \", \".join(f\"{t}({s:.3f})\" for t, s in terms)\n        print(f\"  {url}\")\n        print(f\"      {joined}\")\n    return 0\n\n\nif __name__ == \"__main__\":\n    sys.exit(main())\n",
        "structuredPatch": [],
        "originalFile": null,
        "userModified": false
    }
}
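
The request itself is a PreToolUse hook event, so it can be inspected generically before the Write executes. A minimal sketch of parsing such a payload (the file path and content below are placeholders, not taken from this log entry; field names follow the payload logged above):

import json

# Parse a PreToolUse hook event and pull out the file the Write tool
# is about to create.
raw = r'''
{
    "event": "PreToolUse",
    "tool_name": "Write",
    "tool_input": {
        "file_path": "/tmp/example.py",
        "content": "print('hi')\n"
    }
}
'''

event = json.loads(raw)
if event["event"] == "PreToolUse" and event["tool_name"] == "Write":
    path = event["tool_input"]["file_path"]
    body = event["tool_input"]["content"]
    print(f"Write requested: {path} ({len(body)} characters)")

In the logged response, "type": "create" together with an empty "structuredPatch" and "originalFile": null suggests the file did not previously exist, so the tool wrote it fresh rather than patching an existing version.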