Log #33972

ID: 33972
Timestamp: 2026-04-21 04:39:38.489970
Client: root
IP: 62.246.34.195
Model: claude-sonnet-4-20250514
Status: completed
Tokens: 2,423 (Input: 1,206, Output: 1,217)
Duration: 62 ms
Request time: 2026-04-21 04:39:38.489970
Response time: 2026-04-21 04:39:38.552001

Request

{
    "event": "PreToolUse",
    "tool_name": "Write",
    "tool_input": {
        "file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/scripts\/tfidf_analysis.py",
        "content": "#!\/usr\/bin\/env python3\n\"\"\"Quick TF-IDF analysis from crawler DB.\n\nCorpus pro Seite: title + meta_description + h1_texts + h2_texts.\nKein Body-Text vorhanden → TF-IDF basiert auf SEO-gewichteten Feldern.\n\nUsage: python scripts\/tfidf_analysis.py <crawl_id> [top_n]\n\"\"\"\nfrom __future__ import annotations\n\nimport json\nimport math\nimport re\nimport sys\nfrom collections import Counter\nfrom typing import Any\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\nimport mysql.connector  # noqa: E402\nfrom config import DB_CONFIG  # noqa: E402\n\n_STOP_DE = {\n    \"der\", \"die\", \"das\", \"den\", \"dem\", \"des\", \"und\", \"oder\", \"aber\", \"ist\",\n    \"sind\", \"war\", \"waren\", \"wird\", \"werden\", \"wurde\", \"ein\", \"eine\", \"einen\",\n    \"einem\", \"eines\", \"einer\", \"im\", \"in\", \"an\", \"auf\", \"zu\", \"zum\", \"zur\",\n    \"bei\", \"mit\", \"von\", \"vom\", \"für\", \"fuer\", \"ueber\", \"über\", \"auch\", \"als\",\n    \"nicht\", \"kein\", \"keine\", \"es\", \"er\", \"sie\", \"wir\", \"ihr\", \"du\", \"ich\",\n    \"man\", \"sich\", \"mein\", \"dein\", \"sein\", \"unser\", \"euer\", \"das\", \"dies\",\n    \"so\", \"wie\", \"wenn\", \"dann\", \"noch\", \"schon\", \"nur\", \"sehr\", \"mehr\",\n    \"gibt\", \"gibts\", \"haben\", \"hat\", \"wird\", \"kann\", \"kannst\", \"koennen\",\n    \"können\", \"dass\", \"weil\", \"durch\", \"ueber\", \"aus\", \"um\", \"nach\", \"vor\",\n    \"bis\", \"ohne\", \"gegen\", \"zwischen\", \"unter\", \"am\", \"hier\", \"dort\", \"wer\",\n    \"was\", \"welche\", \"welcher\", \"welches\", \"dich\", \"dir\", \"mich\", \"mir\",\n    \"uns\", \"euch\", \"ihre\", \"ihm\", \"ihn\", \"die\", \"the\", \"and\", \"or\", \"of\",\n    \"to\", \"a\", \"in\", \"for\", \"with\", \"on\", \"at\", \"by\", \"is\", \"are\",\n}\n_TOKEN_RE = re.compile(r\"[a-zäöüß]{3,}\", re.IGNORECASE)\n\n\ndef tokenize(text: str) -> list[str]:\n    return [t for t in _TOKEN_RE.findall(text.lower()) if t not in _STOP_DE]\n\n\ndef _corpus_for_page(row: dict[str, Any]) -> str:\n    parts: list[str] = []\n    for key in (\"title\", \"meta_description\"):\n        if row.get(key):\n            parts.append(str(row[key]))\n    for key in (\"h1_texts\", \"h2_texts\"):\n        raw = row.get(key)\n        if raw:\n            parts.extend(json.loads(raw))\n    return \" \".join(parts)\n\n\ndef main() -> int:\n    if len(sys.argv) < 2:\n        print(\"usage: tfidf_analysis.py <crawl_id> [top_n]\", file=sys.stderr)\n        return 2\n    crawl_id = int(sys.argv[1])\n    top_n = int(sys.argv[2]) if len(sys.argv) > 2 else 20\n\n    conn = mysql.connector.connect(\n        host=DB_CONFIG[\"host\"], user=DB_CONFIG[\"user\"],\n        password=DB_CONFIG[\"password\"], database=\"crawler\",\n    )\n    cur = conn.cursor(dictionary=True)\n    cur.execute(\n        \"SELECT u.url, p.title, p.meta_description, p.h1_texts, p.h2_texts\"\n        \" FROM pages p JOIN urls u ON u.id=p.url_id\"\n        \" WHERE p.crawl_id=%s AND p.http_status=200\",\n        (crawl_id,),\n    )\n    rows = cur.fetchall()\n    conn.close()\n\n    docs = [(r[\"url\"], tokenize(_corpus_for_page(r))) for r in rows]\n    docs = [(u, tokens) for u, tokens in docs if tokens]\n    num_docs = len(docs)\n\n    df: Counter[str] = Counter()\n    for _, tokens in docs:\n        df.update(set(tokens))\n\n    global_scores: Counter[str] = Counter()\n    per_page_top: list[tuple[str, list[tuple[str, float]]]] = []\n    for url, tokens in docs:\n        tf = Counter(tokens)\n        
doc_len = len(tokens)\n        scores: dict[str, float] = {}\n        for term, count in tf.items():\n            idf = math.log((num_docs + 1) \/ (df[term] + 1)) + 1\n            scores[term] = (count \/ doc_len) * idf\n        for term, score in scores.items():\n            global_scores[term] += score\n        top_terms = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:3]\n        per_page_top.append((url, top_terms))\n\n    print(f\"Dokumente: {num_docs}\")\n    print(f\"Unique Terms: {len(df)}\")\n    print()\n    print(f\"=== Top {top_n} Terms (globale TF-IDF-Summe) ===\")\n    for term, score in global_scores.most_common(top_n):\n        print(f\"  {score:7.3f}  df={df[term]:3d}  {term}\")\n    print()\n    print(\"=== Top-3-Terme pro Seite (Auswahl, sortiert nach Top-Score) ===\")\n    per_page_top.sort(key=lambda x: x[1][0][1] if x[1] else 0, reverse=True)\n    for url, terms in per_page_top[:15]:\n        joined = \", \".join(f\"{t}({s:.2f})\" for t, s in terms)\n        print(f\"  {url}\")\n        print(f\"      {joined}\")\n    return 0\n\n\nif __name__ == \"__main__\":\n    sys.exit(main())\n"
    }
}
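
The scoring step of the written script is easiest to check in isolation. Below is a minimal, self-contained sketch of the same smoothed TF-IDF scheme (relative term frequency times idf = log((N + 1) / (df + 1)) + 1); the three toy documents are invented stand-ins for the per-page token lists the script builds from title, meta description, and headings.

import math
from collections import Counter

# Invented toy corpus standing in for the per-page token lists.
docs = [
    ("page-a", ["coaching", "systemisch", "ausbildung", "coaching"]),
    ("page-b", ["coaching", "supervision", "team"]),
    ("page-c", ["ausbildung", "seminar", "team", "team"]),
]
num_docs = len(docs)

# Document frequency: number of documents containing each term at least once.
df = Counter()
for _, tokens in docs:
    df.update(set(tokens))

# Smoothed TF-IDF as in the script: the add-one smoothing plus the "+ 1"
# offset keeps idf positive even for terms occurring in every document.
for url, tokens in docs:
    tf = Counter(tokens)
    doc_len = len(tokens)
    scores = {
        term: (count / doc_len) * (math.log((num_docs + 1) / (df[term] + 1)) + 1)
        for term, count in tf.items()
    }
    print(url, sorted(scores.items(), key=lambda x: x[1], reverse=True)[:3])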

Response

{
    "tool_response": {
        "type": "create",
        "filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/scripts\/tfidf_analysis.py",
        "content": "#!\/usr\/bin\/env python3\n\"\"\"Quick TF-IDF analysis from crawler DB.\n\nCorpus pro Seite: title + meta_description + h1_texts + h2_texts.\nKein Body-Text vorhanden → TF-IDF basiert auf SEO-gewichteten Feldern.\n\nUsage: python scripts\/tfidf_analysis.py <crawl_id> [top_n]\n\"\"\"\nfrom __future__ import annotations\n\nimport json\nimport math\nimport re\nimport sys\nfrom collections import Counter\nfrom typing import Any\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\nimport mysql.connector  # noqa: E402\nfrom config import DB_CONFIG  # noqa: E402\n\n_STOP_DE = {\n    \"der\", \"die\", \"das\", \"den\", \"dem\", \"des\", \"und\", \"oder\", \"aber\", \"ist\",\n    \"sind\", \"war\", \"waren\", \"wird\", \"werden\", \"wurde\", \"ein\", \"eine\", \"einen\",\n    \"einem\", \"eines\", \"einer\", \"im\", \"in\", \"an\", \"auf\", \"zu\", \"zum\", \"zur\",\n    \"bei\", \"mit\", \"von\", \"vom\", \"für\", \"fuer\", \"ueber\", \"über\", \"auch\", \"als\",\n    \"nicht\", \"kein\", \"keine\", \"es\", \"er\", \"sie\", \"wir\", \"ihr\", \"du\", \"ich\",\n    \"man\", \"sich\", \"mein\", \"dein\", \"sein\", \"unser\", \"euer\", \"das\", \"dies\",\n    \"so\", \"wie\", \"wenn\", \"dann\", \"noch\", \"schon\", \"nur\", \"sehr\", \"mehr\",\n    \"gibt\", \"gibts\", \"haben\", \"hat\", \"wird\", \"kann\", \"kannst\", \"koennen\",\n    \"können\", \"dass\", \"weil\", \"durch\", \"ueber\", \"aus\", \"um\", \"nach\", \"vor\",\n    \"bis\", \"ohne\", \"gegen\", \"zwischen\", \"unter\", \"am\", \"hier\", \"dort\", \"wer\",\n    \"was\", \"welche\", \"welcher\", \"welches\", \"dich\", \"dir\", \"mich\", \"mir\",\n    \"uns\", \"euch\", \"ihre\", \"ihm\", \"ihn\", \"die\", \"the\", \"and\", \"or\", \"of\",\n    \"to\", \"a\", \"in\", \"for\", \"with\", \"on\", \"at\", \"by\", \"is\", \"are\",\n}\n_TOKEN_RE = re.compile(r\"[a-zäöüß]{3,}\", re.IGNORECASE)\n\n\ndef tokenize(text: str) -> list[str]:\n    return [t for t in _TOKEN_RE.findall(text.lower()) if t not in _STOP_DE]\n\n\ndef _corpus_for_page(row: dict[str, Any]) -> str:\n    parts: list[str] = []\n    for key in (\"title\", \"meta_description\"):\n        if row.get(key):\n            parts.append(str(row[key]))\n    for key in (\"h1_texts\", \"h2_texts\"):\n        raw = row.get(key)\n        if raw:\n            parts.extend(json.loads(raw))\n    return \" \".join(parts)\n\n\ndef main() -> int:\n    if len(sys.argv) < 2:\n        print(\"usage: tfidf_analysis.py <crawl_id> [top_n]\", file=sys.stderr)\n        return 2\n    crawl_id = int(sys.argv[1])\n    top_n = int(sys.argv[2]) if len(sys.argv) > 2 else 20\n\n    conn = mysql.connector.connect(\n        host=DB_CONFIG[\"host\"], user=DB_CONFIG[\"user\"],\n        password=DB_CONFIG[\"password\"], database=\"crawler\",\n    )\n    cur = conn.cursor(dictionary=True)\n    cur.execute(\n        \"SELECT u.url, p.title, p.meta_description, p.h1_texts, p.h2_texts\"\n        \" FROM pages p JOIN urls u ON u.id=p.url_id\"\n        \" WHERE p.crawl_id=%s AND p.http_status=200\",\n        (crawl_id,),\n    )\n    rows = cur.fetchall()\n    conn.close()\n\n    docs = [(r[\"url\"], tokenize(_corpus_for_page(r))) for r in rows]\n    docs = [(u, tokens) for u, tokens in docs if tokens]\n    num_docs = len(docs)\n\n    df: Counter[str] = Counter()\n    for _, tokens in docs:\n        df.update(set(tokens))\n\n    global_scores: Counter[str] = Counter()\n    per_page_top: list[tuple[str, list[tuple[str, float]]]] = []\n    for url, tokens in docs:\n        tf = Counter(tokens)\n        
doc_len = len(tokens)\n        scores: dict[str, float] = {}\n        for term, count in tf.items():\n            idf = math.log((num_docs + 1) \/ (df[term] + 1)) + 1\n            scores[term] = (count \/ doc_len) * idf\n        for term, score in scores.items():\n            global_scores[term] += score\n        top_terms = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:3]\n        per_page_top.append((url, top_terms))\n\n    print(f\"Dokumente: {num_docs}\")\n    print(f\"Unique Terms: {len(df)}\")\n    print()\n    print(f\"=== Top {top_n} Terms (globale TF-IDF-Summe) ===\")\n    for term, score in global_scores.most_common(top_n):\n        print(f\"  {score:7.3f}  df={df[term]:3d}  {term}\")\n    print()\n    print(\"=== Top-3-Terme pro Seite (Auswahl, sortiert nach Top-Score) ===\")\n    per_page_top.sort(key=lambda x: x[1][0][1] if x[1] else 0, reverse=True)\n    for url, terms in per_page_top[:15]:\n        joined = \", \".join(f\"{t}({s:.2f})\" for t, s in terms)\n        print(f\"  {url}\")\n        print(f\"      {joined}\")\n    return 0\n\n\nif __name__ == \"__main__\":\n    sys.exit(main())\n",
        "structuredPatch": [],
        "originalFile": null,
        "userModified": false
    }
}
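
The response records a plain file creation: type "create", an empty structuredPatch, originalFile null, and no user modification. For reference, the per-page corpus assembly that feeds tokenize() can be replayed standalone; the sample row below is invented and mirrors the columns the script's SQL query selects, with h1_texts and h2_texts stored as JSON-encoded arrays.

import json

# Invented sample row; mirrors SELECT u.url, p.title, p.meta_description,
# p.h1_texts, p.h2_texts from the script's query.
row = {
    "title": "Systemische Ausbildung",
    "meta_description": "Seminare und Supervision",
    "h1_texts": json.dumps(["Systemisches Coaching"]),
    "h2_texts": json.dumps(["Termine", "Kosten"]),
}

# Same merge logic as _corpus_for_page: scalar fields appended as-is,
# heading fields decoded from JSON and flattened into the list.
parts = []
for key in ("title", "meta_description"):
    if row.get(key):
        parts.append(str(row[key]))
for key in ("h1_texts", "h2_texts"):
    raw = row.get(key)
    if raw:
        parts.extend(json.loads(raw))
print(" ".join(parts))
# -> Systemische Ausbildung Seminare und Supervision Systemisches Coaching Termine Kosten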