Protokoll #6768

ID: 6768
Zeitstempel: 2025-12-23 07:03:29.029519
Client: root
IP: 145.224.96.190
Modell: claude-sonnet-4-20250514
Status: completed
Tokens: 4,748 (Input: 676, Output: 4,072)
Dauer: 109 ms
Request-Zeit: 2025-12-23 07:03:29.029519
Response-Zeit: 2025-12-23 07:03:29.138862

Request

{
    "event": "PreToolUse",
    "tool_name": "Edit",
    "tool_input": {
        "file_path": "\/opt\/scripts\/pipeline\/analyze.py",
        "old_string": "def analyze_chunk_semantics(chunk_id, content, client=None):",
        "new_string": "def link_chunk_entities(document_id):\n    \"\"\"\n    Link entities to their source chunks.\n    Scans each chunk for entity mentions and populates chunk_entities.\n    \"\"\"\n    # Get all entities for this document\n    cursor = db.execute(\n        \"\"\"SELECT DISTINCT e.id, e.name, e.canonical_name\n           FROM entities e\n           JOIN document_entities de ON e.id = de.entity_id\n           WHERE de.document_id = %s\"\"\",\n        (document_id,)\n    )\n    entities = cursor.fetchall()\n    cursor.close()\n\n    if not entities:\n        db.log(\"INFO\", f\"No entities to link for document {document_id}\")\n        return 0\n\n    # Get all chunks for this document\n    cursor = db.execute(\n        \"SELECT id, content FROM chunks WHERE document_id = %s\",\n        (document_id,)\n    )\n    chunks = cursor.fetchall()\n    cursor.close()\n\n    linked = 0\n    for chunk in chunks:\n        chunk_text = chunk[\"content\"].lower()\n\n        for entity in entities:\n            # Check if entity name appears in chunk\n            name_lower = entity[\"name\"].lower()\n            canonical = (entity[\"canonical_name\"] or \"\").lower()\n\n            mention_count = chunk_text.count(name_lower)\n            if canonical and canonical != name_lower:\n                mention_count += chunk_text.count(canonical)\n\n            if mention_count > 0:\n                # Calculate relevance score (0-1 based on mentions)\n                relevance = min(1.0, mention_count * 0.2)\n\n                try:\n                    cursor = db.execute(\n                        \"\"\"INSERT INTO chunk_entities\n                           (chunk_id, entity_id, relevance_score, mention_count)\n                           VALUES (%s, %s, %s, %s)\n                           ON DUPLICATE KEY UPDATE\n                           relevance_score = VALUES(relevance_score),\n                           mention_count = VALUES(mention_count)\"\"\",\n            
            (chunk[\"id\"], entity[\"id\"], relevance, mention_count)\n                    )\n                    db.commit()\n                    cursor.close()\n                    linked += 1\n                except Exception as e:\n                    db.log(\"WARNING\", f\"Failed to link chunk {chunk['id']} to entity {entity['id']}: {e}\")\n\n    db.log(\"INFO\", f\"Linked {linked} chunk-entity pairs for document {document_id}\")\n    return linked\n\n\ndef analyze_chunk_semantics(chunk_id, content, client=None):"
    }
}

Response

{
    "tool_response": {
        "filePath": "\/opt\/scripts\/pipeline\/analyze.py",
        "oldString": "def analyze_chunk_semantics(chunk_id, content, client=None):",
        "newString": "def link_chunk_entities(document_id):\n    \"\"\"\n    Link entities to their source chunks.\n    Scans each chunk for entity mentions and populates chunk_entities.\n    \"\"\"\n    # Get all entities for this document\n    cursor = db.execute(\n        \"\"\"SELECT DISTINCT e.id, e.name, e.canonical_name\n           FROM entities e\n           JOIN document_entities de ON e.id = de.entity_id\n           WHERE de.document_id = %s\"\"\",\n        (document_id,)\n    )\n    entities = cursor.fetchall()\n    cursor.close()\n\n    if not entities:\n        db.log(\"INFO\", f\"No entities to link for document {document_id}\")\n        return 0\n\n    # Get all chunks for this document\n    cursor = db.execute(\n        \"SELECT id, content FROM chunks WHERE document_id = %s\",\n        (document_id,)\n    )\n    chunks = cursor.fetchall()\n    cursor.close()\n\n    linked = 0\n    for chunk in chunks:\n        chunk_text = chunk[\"content\"].lower()\n\n        for entity in entities:\n            # Check if entity name appears in chunk\n            name_lower = entity[\"name\"].lower()\n            canonical = (entity[\"canonical_name\"] or \"\").lower()\n\n            mention_count = chunk_text.count(name_lower)\n            if canonical and canonical != name_lower:\n                mention_count += chunk_text.count(canonical)\n\n            if mention_count > 0:\n                # Calculate relevance score (0-1 based on mentions)\n                relevance = min(1.0, mention_count * 0.2)\n\n                try:\n                    cursor = db.execute(\n                        \"\"\"INSERT INTO chunk_entities\n                           (chunk_id, entity_id, relevance_score, mention_count)\n                           VALUES (%s, %s, %s, %s)\n                           ON DUPLICATE KEY UPDATE\n                           relevance_score = VALUES(relevance_score),\n                           mention_count = VALUES(mention_count)\"\"\",\n             
           (chunk[\"id\"], entity[\"id\"], relevance, mention_count)\n                    )\n                    db.commit()\n                    cursor.close()\n                    linked += 1\n                except Exception as e:\n                    db.log(\"WARNING\", f\"Failed to link chunk {chunk['id']} to entity {entity['id']}: {e}\")\n\n    db.log(\"INFO\", f\"Linked {linked} chunk-entity pairs for document {document_id}\")\n    return linked\n\n\ndef analyze_chunk_semantics(chunk_id, content, client=None):",
        "originalFile": "\"\"\"\nSemantic analysis for KI-System Pipeline\nEntity extraction, taxonomy classification, ontology mapping.\n\"\"\"\n\nimport json\nimport os\nimport time\n\nimport requests\n\nfrom config import ANTHROPIC_API_KEY, ANTHROPIC_MODEL, OLLAMA_CHAT_MODEL, OLLAMA_HOST\nfrom db import db\nfrom protokoll import protokoll\n\n\ndef get_anthropic_client():\n    \"\"\"Get Anthropic API client.\"\"\"\n    try:\n        import anthropic\n\n        if ANTHROPIC_API_KEY:\n            db.log(\"INFO\", \"Using Anthropic API (Claude)\")\n            return anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)\n        else:\n            db.log(\"WARNING\", \"No Anthropic API key found, falling back to Ollama\")\n    except ImportError:\n        db.log(\"WARNING\", \"anthropic package not installed, falling back to Ollama\")\n    return None\n\n\ndef extract_entities_ollama(text, model=\"gemma3:27b-it-qat\"):\n    \"\"\"Extract entities using Ollama.\"\"\"\n    # Load prompt from database\n    prompt_template = db.get_prompt(\"entity_extraction\")\n\n    if not prompt_template:\n        db.log(\"WARNING\", \"entity_extraction prompt not found in DB, using fallback\")\n        prompt_template = \"\"\"Analysiere den Text und extrahiere wichtige Entitäten.\nKategorisiere als: PERSON, ORGANIZATION, CONCEPT, LOCATION\nAntworte NUR im JSON-Format:\n{\"entities\": [{\"name\": \"...\", \"type\": \"...\", \"description\": \"...\"}]}\n\nText:\n{{TEXT}}\"\"\"\n\n    prompt = prompt_template.replace(\"{{TEXT}}\", text[:3000])\n\n    try:\n        start_time = time.time()\n        response = requests.post(\n            f\"{OLLAMA_HOST}\/api\/generate\",\n            json={\"model\": model, \"prompt\": prompt, \"stream\": False, \"format\": \"json\"},\n            timeout=120,\n        )\n        response.raise_for_status()\n        data = response.json()\n        duration_ms = int((time.time() - start_time) * 1000)\n\n        # Parse JSON from response\n        response_text 
= data.get(\"response\", \"{}\")\n\n        # Log to ki-protokoll\n        protokoll.log_llm_call(\n            request=f\"[entity_extraction] {prompt[:500]}...\",\n            response=response_text[:2000],\n            model_name=f\"ollama:{model}\",\n            tokens_input=data.get(\"prompt_eval_count\", 0),\n            tokens_output=data.get(\"eval_count\", 0),\n            duration_ms=duration_ms,\n            status=\"completed\",\n        )\n\n        try:\n            entities = json.loads(response_text)\n            return entities.get(\"entities\", [])\n        except json.JSONDecodeError:\n            db.log(\"WARNING\", \"Failed to parse entity JSON from Ollama\")\n            return []\n    except Exception as e:\n        db.log(\"ERROR\", f\"Ollama entity extraction failed: {e}\")\n        protokoll.log_llm_call(\n            request=f\"[entity_extraction] {prompt[:500]}...\",\n            model_name=f\"ollama:{model}\",\n            status=\"error\",\n            error_message=str(e),\n        )\n        return []\n\n\ndef extract_entities_anthropic(text, client):\n    \"\"\"Extract entities using Anthropic Claude.\"\"\"\n    # Get prompt from database\n    prompt_template = db.get_prompt(\"entity_extraction\")\n\n    if not prompt_template:\n        prompt_template = \"\"\"Analysiere den folgenden deutschen Text und extrahiere alle wichtigen Entitäten.\n\nKategorisiere jede Entität als:\n- PERSON (Namen von Personen)\n- ORGANIZATION (Firmen, Institutionen, Gruppen)\n- CONCEPT (Fachbegriffe, Methoden, Theorien)\n- LOCATION (Orte, Länder)\n- DATE (Zeitangaben)\n- OTHER (Sonstiges)\n\nAntworte NUR im JSON-Format:\n{\"entities\": [{\"name\": \"...\", \"type\": \"...\", \"context\": \"kurzer Kontext der Erwähnung\"}]}\n\nText:\n{{TEXT}}\"\"\"\n\n    prompt = prompt_template.replace(\"{{TEXT}}\", text[:4000])\n\n    try:\n        start_time = time.time()\n        message = client.messages.create(\n            model=ANTHROPIC_MODEL, max_tokens=2000, 
messages=[{\"role\": \"user\", \"content\": prompt}]\n        )\n        duration_ms = int((time.time() - start_time) * 1000)\n\n        response_text = message.content[0].text\n\n        # Log to ki-protokoll\n        protokoll.log_llm_call(\n            request=f\"[entity_extraction] {prompt[:500]}...\",\n            response=response_text[:2000],\n            model_name=ANTHROPIC_MODEL,\n            tokens_input=message.usage.input_tokens,\n            tokens_output=message.usage.output_tokens,\n            duration_ms=duration_ms,\n            status=\"completed\",\n        )\n\n        # Extract JSON from response\n        import re\n\n        json_match = re.search(r\"\\{[\\s\\S]*\\}\", response_text)\n        if json_match:\n            entities = json.loads(json_match.group())\n            return entities.get(\"entities\", [])\n        return []\n    except Exception as e:\n        db.log(\"ERROR\", f\"Anthropic entity extraction failed: {e}\")\n        protokoll.log_llm_call(\n            request=f\"[entity_extraction] {prompt[:500]}...\",\n            model_name=ANTHROPIC_MODEL,\n            status=\"error\",\n            error_message=str(e),\n        )\n        return []\n\n\ndef extract_relations(text, entities, client=None):\n    \"\"\"Extract relations between entities.\"\"\"\n    if not entities or len(entities) < 2:\n        return []\n\n    entity_names = [e[\"name\"] for e in entities[:20]]\n\n    # Load prompt from database\n    prompt_template = db.get_prompt(\"relation_extraction\")\n\n    if not prompt_template:\n        db.log(\"WARNING\", \"relation_extraction prompt not found in DB, using fallback\")\n        prompt_template = \"\"\"Identifiziere Beziehungen zwischen Entitäten.\nEntitäten: {{ENTITIES}}\nBeziehungstypen: DEVELOPED_BY, RELATED_TO, PART_OF, USED_IN, BASED_ON\nAntworte NUR im JSON-Format:\n{\"relations\": [{\"source\": \"...\", \"relation\": \"...\", \"target\": \"...\"}]}\n\nText:\n{{TEXT}}\"\"\"\n\n    prompt = 
prompt_template.replace(\"{{ENTITIES}}\", \", \".join(entity_names))\n    prompt = prompt.replace(\"{{TEXT}}\", text[:3000])\n\n    try:\n        start_time = time.time()\n        tokens_in, tokens_out = 0, 0\n        model_name = \"\"\n\n        if client:\n            message = client.messages.create(\n                model=ANTHROPIC_MODEL, max_tokens=1000, messages=[{\"role\": \"user\", \"content\": prompt}]\n            )\n            response_text = message.content[0].text\n            tokens_in = message.usage.input_tokens\n            tokens_out = message.usage.output_tokens\n            model_name = ANTHROPIC_MODEL\n        else:\n            response = requests.post(\n                f\"{OLLAMA_HOST}\/api\/generate\",\n                json={\"model\": OLLAMA_CHAT_MODEL, \"prompt\": prompt, \"stream\": False, \"format\": \"json\"},\n                timeout=120,\n            )\n            response.raise_for_status()\n            data = response.json()\n            response_text = data.get(\"response\", \"{}\")\n            tokens_in = data.get(\"prompt_eval_count\", 0)\n            tokens_out = data.get(\"eval_count\", 0)\n            model_name = f\"ollama:{OLLAMA_CHAT_MODEL}\"\n\n        duration_ms = int((time.time() - start_time) * 1000)\n\n        # Log to ki-protokoll\n        protokoll.log_llm_call(\n            request=f\"[relation_extraction] {prompt[:500]}...\",\n            response=response_text[:2000],\n            model_name=model_name,\n            tokens_input=tokens_in,\n            tokens_output=tokens_out,\n            duration_ms=duration_ms,\n            status=\"completed\",\n        )\n\n        import re\n\n        json_match = re.search(r\"\\{[\\s\\S]*\\}\", response_text)\n        if json_match:\n            data = json.loads(json_match.group())\n            return data.get(\"relations\", [])\n        return []\n    except Exception as e:\n        db.log(\"ERROR\", f\"Relation extraction failed: {e}\")\n        
protokoll.log_llm_call(\n            request=f\"[relation_extraction] {prompt[:500]}...\",\n            model_name=ANTHROPIC_MODEL if client else f\"ollama:{OLLAMA_CHAT_MODEL}\",\n            status=\"error\",\n            error_message=str(e),\n        )\n        return []\n\n\ndef classify_taxonomy(text, client=None):\n    \"\"\"Classify text into taxonomy categories.\"\"\"\n    prompt_template = db.get_prompt(\"taxonomy_classification\")\n\n    if not prompt_template:\n        prompt_template = \"\"\"Klassifiziere den folgenden Text in passende Kategorien.\n\nWähle aus diesen Hauptkategorien:\n- Methoden (Therapiemethoden, Techniken)\n- Theorie (Konzepte, Modelle, Grundlagen)\n- Praxis (Anwendung, Fallbeispiele)\n- Organisation (Strukturen, Prozesse)\n- Kommunikation (Gesprächsführung, Interaktion)\n- Entwicklung (Persönliche Entwicklung, Veränderung)\n\nAntworte NUR im JSON-Format:\n{\"categories\": [\"...\", \"...\"], \"confidence\": 0.0-1.0}\n\nText:\n{{TEXT}}\"\"\"\n\n    prompt = prompt_template.replace(\"{{TEXT}}\", text[:2000])\n\n    try:\n        start_time = time.time()\n        tokens_in, tokens_out = 0, 0\n        model_name = \"\"\n\n        if client:\n            message = client.messages.create(\n                model=ANTHROPIC_MODEL, max_tokens=500, messages=[{\"role\": \"user\", \"content\": prompt}]\n            )\n            response_text = message.content[0].text\n            tokens_in = message.usage.input_tokens\n            tokens_out = message.usage.output_tokens\n            model_name = ANTHROPIC_MODEL\n        else:\n            response = requests.post(\n                f\"{OLLAMA_HOST}\/api\/generate\",\n                json={\"model\": OLLAMA_CHAT_MODEL, \"prompt\": prompt, \"stream\": False, \"format\": \"json\"},\n                timeout=60,\n            )\n            response.raise_for_status()\n            data = response.json()\n            response_text = data.get(\"response\", \"{}\")\n            tokens_in = 
data.get(\"prompt_eval_count\", 0)\n            tokens_out = data.get(\"eval_count\", 0)\n            model_name = f\"ollama:{OLLAMA_CHAT_MODEL}\"\n\n        duration_ms = int((time.time() - start_time) * 1000)\n\n        # Log to ki-protokoll\n        protokoll.log_llm_call(\n            request=f\"[taxonomy_classification] {prompt[:500]}...\",\n            response=response_text[:2000],\n            model_name=model_name,\n            tokens_input=tokens_in,\n            tokens_output=tokens_out,\n            duration_ms=duration_ms,\n            status=\"completed\",\n        )\n\n        import re\n\n        json_match = re.search(r\"\\{[\\s\\S]*\\}\", response_text)\n        if json_m... [TRUNCATED-ea7ebeba9a44a434]",
        "structuredPatch": [
            {
                "oldStart": 461,
                "oldLines": 6,
                "newStart": 461,
                "newLines": 71,
                "lines": [
                    "     }",
                    " ",
                    " ",
                    "+def link_chunk_entities(document_id):",
                    "+    \"\"\"",
                    "+    Link entities to their source chunks.",
                    "+    Scans each chunk for entity mentions and populates chunk_entities.",
                    "+    \"\"\"",
                    "+    # Get all entities for this document",
                    "+    cursor = db.execute(",
                    "+        \"\"\"SELECT DISTINCT e.id, e.name, e.canonical_name",
                    "+           FROM entities e",
                    "+           JOIN document_entities de ON e.id = de.entity_id",
                    "+           WHERE de.document_id = %s\"\"\",",
                    "+        (document_id,)",
                    "+    )",
                    "+    entities = cursor.fetchall()",
                    "+    cursor.close()",
                    "+",
                    "+    if not entities:",
                    "+        db.log(\"INFO\", f\"No entities to link for document {document_id}\")",
                    "+        return 0",
                    "+",
                    "+    # Get all chunks for this document",
                    "+    cursor = db.execute(",
                    "+        \"SELECT id, content FROM chunks WHERE document_id = %s\",",
                    "+        (document_id,)",
                    "+    )",
                    "+    chunks = cursor.fetchall()",
                    "+    cursor.close()",
                    "+",
                    "+    linked = 0",
                    "+    for chunk in chunks:",
                    "+        chunk_text = chunk[\"content\"].lower()",
                    "+",
                    "+        for entity in entities:",
                    "+            # Check if entity name appears in chunk",
                    "+            name_lower = entity[\"name\"].lower()",
                    "+            canonical = (entity[\"canonical_name\"] or \"\").lower()",
                    "+",
                    "+            mention_count = chunk_text.count(name_lower)",
                    "+            if canonical and canonical != name_lower:",
                    "+                mention_count += chunk_text.count(canonical)",
                    "+",
                    "+            if mention_count > 0:",
                    "+                # Calculate relevance score (0-1 based on mentions)",
                    "+                relevance = min(1.0, mention_count * 0.2)",
                    "+",
                    "+                try:",
                    "+                    cursor = db.execute(",
                    "+                        \"\"\"INSERT INTO chunk_entities",
                    "+                           (chunk_id, entity_id, relevance_score, mention_count)",
                    "+                           VALUES (%s, %s, %s, %s)",
                    "+                           ON DUPLICATE KEY UPDATE",
                    "+                           relevance_score = VALUES(relevance_score),",
                    "+                           mention_count = VALUES(mention_count)\"\"\",",
                    "+                        (chunk[\"id\"], entity[\"id\"], relevance, mention_count)",
                    "+                    )",
                    "+                    db.commit()",
                    "+                    cursor.close()",
                    "+                    linked += 1",
                    "+                except Exception as e:",
                    "+                    db.log(\"WARNING\", f\"Failed to link chunk {chunk['id']} to entity {entity['id']}: {e}\")",
                    "+",
                    "+    db.log(\"INFO\", f\"Linked {linked} chunk-entity pairs for document {document_id}\")",
                    "+    return linked",
                    "+",
                    "+",
                    " def analyze_chunk_semantics(chunk_id, content, client=None):",
                    "     \"\"\"",
                    "     Analyze a single chunk for semantics (summary, keywords, sentiment, topics)."
                ]
            }
        ],
        "userModified": false,
        "replaceAll": false
    }
}
← Vorheriger Zur Liste Nächster →