Protokoll #13030

ID: 13030
Zeitstempel: 2025-12-24 15:13:29.603688
Client: root
IP: 145.224.72.140
Modell: claude-sonnet-4-20250514
Status: completed
Tokens: 10,276 (Input: 2,546, Output: 7,730)
Dauer: 2780980 ms
Request-Zeit: 2025-12-24 15:13:29.603688
Response-Zeit: 2025-12-24 15:59:50.584019

Request

{
    "event": "PreToolUse",
    "tool_name": "Edit",
    "tool_input": {
        "file_path": "\/var\/www\/scripts\/pipeline\/analyze.py",
        "old_string": "if __name__ == \"__main__\":\n    test_text = \"\"\"\n    Carl Rogers entwickelte die klientenzentrierte Gesprächstherapie.\n    Diese Methode basiert auf Empathie, Kongruenz und bedingungsloser Wertschätzung.\n    Sie wird heute weltweit in der Psychotherapie eingesetzt.\n    \"\"\"\n\n    print(\"Testing entity extraction with Ollama...\")\n    entities = extract_entities_ollama(test_text)\n    print(f\"Found {len(entities)} entities:\")\n    for e in entities:\n        print(f\"  - {e['name']} ({e['type']})\")",
        "new_string": "def extract_statements(chunk_id: int, text: str, client=None) -> list[dict]:\n    \"\"\"\n    Extract SPO-triplets (Subject-Predicate-Object statements) from text.\n\n    Args:\n        chunk_id: ID of the chunk being analyzed\n        text: Text content to extract statements from\n        client: Optional Anthropic client (falls back to Ollama if None)\n\n    Returns:\n        List of extracted statements with entity linking\n    \"\"\"\n    prompt_template = db.get_prompt(\"statement_extraction\")\n\n    if not prompt_template:\n        db.log(\"WARNING\", \"statement_extraction prompt not found in DB, using fallback\")\n        prompt_template = \"\"\"Extrahiere alle faktischen Aussagen aus dem Text als SPO-Tripel.\n\nRegeln:\n- Subject: Eine benannte Entität (Person, Organisation, Konzept, Methode)\n- Predicate: Die Beziehung oder Eigenschaft (z.B. \"entwickelte\", \"basiert auf\", \"ist Teil von\")\n- Object: Eine Entität oder ein Literal-Wert\n\nAntworte NUR im JSON-Format:\n{\"statements\": [\n  {\"subject\": \"Name der Subject-Entität\", \"predicate\": \"Beziehung\", \"object\": \"Name oder Wert\", \"confidence\": 0.0-1.0}\n]}\n\nText:\n{{TEXT}}\"\"\"\n\n    prompt = prompt_template.replace(\"{{TEXT}}\", text[:3000])\n\n    try:\n        start_time = time.time()\n        tokens_in, tokens_out = 0, 0\n        model_name = \"\"\n\n        if client:\n            message = client.messages.create(\n                model=ANTHROPIC_MODEL,\n                max_tokens=1500,\n                messages=[{\"role\": \"user\", \"content\": prompt}],\n            )\n            response_text = message.content[0].text\n            tokens_in = message.usage.input_tokens\n            tokens_out = message.usage.output_tokens\n            model_name = ANTHROPIC_MODEL\n        else:\n            response = requests.post(\n                f\"{OLLAMA_HOST}\/api\/generate\",\n                json={\n                    \"model\": OLLAMA_CHAT_MODEL,\n         
           \"prompt\": prompt,\n                    \"stream\": False,\n                    \"format\": \"json\",\n                },\n                timeout=120,\n            )\n            response.raise_for_status()\n            data = response.json()\n            response_text = data.get(\"response\", \"{}\")\n            tokens_in = data.get(\"prompt_eval_count\", 0)\n            tokens_out = data.get(\"eval_count\", 0)\n            model_name = f\"ollama:{OLLAMA_CHAT_MODEL}\"\n\n        duration_ms = int((time.time() - start_time) * 1000)\n\n        # Log to ki-protokoll\n        protokoll.log_llm_call(\n            request=f\"[statement_extraction] chunk_id={chunk_id}\",\n            response=response_text[:2000],\n            model_name=model_name,\n            tokens_input=tokens_in,\n            tokens_output=tokens_out,\n            duration_ms=duration_ms,\n            status=\"completed\",\n        )\n\n        # Parse JSON\n        import re\n\n        json_match = re.search(r\"\\{[\\s\\S]*\\}\", response_text)\n        if json_match:\n            result = json.loads(json_match.group())\n            return result.get(\"statements\", [])\n        return []\n\n    except Exception as e:\n        db.log(\"ERROR\", f\"Statement extraction failed for chunk {chunk_id}: {e}\")\n        protokoll.log_llm_call(\n            request=f\"[statement_extraction] chunk_id={chunk_id}\",\n            model_name=ANTHROPIC_MODEL if client else f\"ollama:{OLLAMA_CHAT_MODEL}\",\n            status=\"error\",\n            error_message=str(e),\n        )\n        return []\n\n\ndef find_entity_by_name(name: str) -> dict | None:\n    \"\"\"\n    Find entity by name or canonical_name.\n\n    Args:\n        name: Entity name to search for\n\n    Returns:\n        Entity dict with id, name, canonical_name or None\n    \"\"\"\n    name_lower = name.lower().strip()\n    canonical = name_lower.replace(\" \", \"\").replace(\"-\", \"\")\n\n    # Try exact canonical match first\n   
 cursor = db.execute(\n        \"SELECT id, name, canonical_name FROM entities WHERE canonical_name = %s LIMIT 1\",\n        (canonical,),\n    )\n    result = cursor.fetchone()\n    cursor.close()\n    if result:\n        return result\n\n    # Try name match (case-insensitive)\n    cursor = db.execute(\n        \"SELECT id, name, canonical_name FROM entities WHERE LOWER(name) = %s LIMIT 1\",\n        (name_lower,),\n    )\n    result = cursor.fetchone()\n    cursor.close()\n    if result:\n        return result\n\n    # Try partial canonical match\n    cursor = db.execute(\n        \"SELECT id, name, canonical_name FROM entities WHERE canonical_name LIKE %s LIMIT 1\",\n        (f\"%{canonical}%\",),\n    )\n    result = cursor.fetchone()\n    cursor.close()\n    return result\n\n\ndef store_statements(chunk_id: int, statements: list[dict]) -> int:\n    \"\"\"\n    Store extracted statements in the database with entity linking.\n\n    Args:\n        chunk_id: ID of the source chunk\n        statements: List of statement dicts with subject, predicate, object, confidence\n\n    Returns:\n        Number of successfully stored statements\n    \"\"\"\n    stored = 0\n\n    for stmt in statements:\n        try:\n            subject_name = stmt.get(\"subject\", \"\").strip()\n            predicate = stmt.get(\"predicate\", \"\").strip()\n            object_value = stmt.get(\"object\", \"\").strip()\n            confidence = float(stmt.get(\"confidence\", 0.8))\n\n            if not subject_name or not predicate:\n                continue\n\n            # Find subject entity\n            subject_entity = find_entity_by_name(subject_name)\n            if not subject_entity:\n                db.log(\"DEBUG\", f\"Subject entity not found: {subject_name}\")\n                continue  # Subject must be a known entity\n\n            subject_entity_id = subject_entity[\"id\"]\n\n            # Try to find object entity, otherwise use as literal\n            object_entity_id = 
None\n            object_literal = None\n\n            if object_value:\n                object_entity = find_entity_by_name(object_value)\n                if object_entity:\n                    object_entity_id = object_entity[\"id\"]\n                else:\n                    object_literal = object_value\n\n            # Insert statement\n            cursor = db.execute(\n                \"\"\"INSERT INTO statements\n                   (subject_entity_id, predicate, object_entity_id, object_literal,\n                    chunk_id, confidence, status, created_at)\n                   VALUES (%s, %s, %s, %s, %s, %s, 'extracted', NOW())\"\"\",\n                (\n                    subject_entity_id,\n                    predicate[:255],  # Truncate to VARCHAR(255)\n                    object_entity_id,\n                    object_literal,\n                    chunk_id,\n                    confidence,\n                ),\n            )\n            db.commit()\n            cursor.close()\n            stored += 1\n\n        except Exception as e:\n            db.log(\"WARNING\", f\"Failed to store statement: {e}\")\n\n    return stored\n\n\ndef analyze_chunk_statements(chunk_id: int, content: str, client=None) -> int:\n    \"\"\"\n    Extract and store statements for a single chunk.\n\n    Args:\n        chunk_id: ID of the chunk\n        content: Text content of the chunk\n        client: Optional Anthropic client\n\n    Returns:\n        Number of statements stored\n    \"\"\"\n    statements = extract_statements(chunk_id, content, client)\n    if statements:\n        stored = store_statements(chunk_id, statements)\n        db.log(\"INFO\", f\"Chunk {chunk_id}: {stored}\/{len(statements)} statements stored\")\n        return stored\n    return 0\n\n\ndef analyze_document_statements(document_id: int, client=None, progress=None) -> int:\n    \"\"\"\n    Extract statements from all chunks of a document.\n\n    Args:\n        document_id: ID of the document\n        
client: Optional Anthropic client\n        progress: Optional PipelineProgress for logging\n\n    Returns:\n        Total number of statements stored\n    \"\"\"\n    cursor = db.execute(\n        \"SELECT id, content FROM chunks WHERE document_id = %s ORDER BY chunk_index\",\n        (document_id,),\n    )\n    chunks = cursor.fetchall()\n    cursor.close()\n\n    total = len(chunks)\n    db.log(\"INFO\", f\"Extracting statements from {total} chunks\")\n    if progress:\n        progress.add_log(f\"Statements: Extrahiere aus {total} Chunks...\")\n\n    total_stored = 0\n    for i, chunk in enumerate(chunks, 1):\n        if progress:\n            progress.add_log(f\"Statements: Chunk {i}\/{total}...\")\n\n        stored = analyze_chunk_statements(chunk[\"id\"], chunk[\"content\"], client)\n        total_stored += stored\n\n    db.log(\"INFO\", f\"Total statements stored: {total_stored}\")\n    if progress:\n        progress.add_log(f\"Statements: {total_stored} Aussagen extrahiert\")\n\n    return total_stored\n\n\nif __name__ == \"__main__\":\n    test_text = \"\"\"\n    Carl Rogers entwickelte die klientenzentrierte Gesprächstherapie.\n    Diese Methode basiert auf Empathie, Kongruenz und bedingungsloser Wertschätzung.\n    Sie wird heute weltweit in der Psychotherapie eingesetzt.\n    \"\"\"\n\n    print(\"Testing entity extraction with Ollama...\")\n    entities = extract_entities_ollama(test_text)\n    print(f\"Found {len(entities)} entities:\")\n    for e in entities:\n        print(f\"  - {e['name']} ({e['type']})\")"
    }
}

Response

{
    "tool_response": {
        "filePath": "\/var\/www\/scripts\/pipeline\/analyze.py",
        "oldString": "if __name__ == \"__main__\":\n    test_text = \"\"\"\n    Carl Rogers entwickelte die klientenzentrierte Gesprächstherapie.\n    Diese Methode basiert auf Empathie, Kongruenz und bedingungsloser Wertschätzung.\n    Sie wird heute weltweit in der Psychotherapie eingesetzt.\n    \"\"\"\n\n    print(\"Testing entity extraction with Ollama...\")\n    entities = extract_entities_ollama(test_text)\n    print(f\"Found {len(entities)} entities:\")\n    for e in entities:\n        print(f\"  - {e['name']} ({e['type']})\")",
        "newString": "def extract_statements(chunk_id: int, text: str, client=None) -> list[dict]:\n    \"\"\"\n    Extract SPO-triplets (Subject-Predicate-Object statements) from text.\n\n    Args:\n        chunk_id: ID of the chunk being analyzed\n        text: Text content to extract statements from\n        client: Optional Anthropic client (falls back to Ollama if None)\n\n    Returns:\n        List of extracted statements with entity linking\n    \"\"\"\n    prompt_template = db.get_prompt(\"statement_extraction\")\n\n    if not prompt_template:\n        db.log(\"WARNING\", \"statement_extraction prompt not found in DB, using fallback\")\n        prompt_template = \"\"\"Extrahiere alle faktischen Aussagen aus dem Text als SPO-Tripel.\n\nRegeln:\n- Subject: Eine benannte Entität (Person, Organisation, Konzept, Methode)\n- Predicate: Die Beziehung oder Eigenschaft (z.B. \"entwickelte\", \"basiert auf\", \"ist Teil von\")\n- Object: Eine Entität oder ein Literal-Wert\n\nAntworte NUR im JSON-Format:\n{\"statements\": [\n  {\"subject\": \"Name der Subject-Entität\", \"predicate\": \"Beziehung\", \"object\": \"Name oder Wert\", \"confidence\": 0.0-1.0}\n]}\n\nText:\n{{TEXT}}\"\"\"\n\n    prompt = prompt_template.replace(\"{{TEXT}}\", text[:3000])\n\n    try:\n        start_time = time.time()\n        tokens_in, tokens_out = 0, 0\n        model_name = \"\"\n\n        if client:\n            message = client.messages.create(\n                model=ANTHROPIC_MODEL,\n                max_tokens=1500,\n                messages=[{\"role\": \"user\", \"content\": prompt}],\n            )\n            response_text = message.content[0].text\n            tokens_in = message.usage.input_tokens\n            tokens_out = message.usage.output_tokens\n            model_name = ANTHROPIC_MODEL\n        else:\n            response = requests.post(\n                f\"{OLLAMA_HOST}\/api\/generate\",\n                json={\n                    \"model\": OLLAMA_CHAT_MODEL,\n          
          \"prompt\": prompt,\n                    \"stream\": False,\n                    \"format\": \"json\",\n                },\n                timeout=120,\n            )\n            response.raise_for_status()\n            data = response.json()\n            response_text = data.get(\"response\", \"{}\")\n            tokens_in = data.get(\"prompt_eval_count\", 0)\n            tokens_out = data.get(\"eval_count\", 0)\n            model_name = f\"ollama:{OLLAMA_CHAT_MODEL}\"\n\n        duration_ms = int((time.time() - start_time) * 1000)\n\n        # Log to ki-protokoll\n        protokoll.log_llm_call(\n            request=f\"[statement_extraction] chunk_id={chunk_id}\",\n            response=response_text[:2000],\n            model_name=model_name,\n            tokens_input=tokens_in,\n            tokens_output=tokens_out,\n            duration_ms=duration_ms,\n            status=\"completed\",\n        )\n\n        # Parse JSON\n        import re\n\n        json_match = re.search(r\"\\{[\\s\\S]*\\}\", response_text)\n        if json_match:\n            result = json.loads(json_match.group())\n            return result.get(\"statements\", [])\n        return []\n\n    except Exception as e:\n        db.log(\"ERROR\", f\"Statement extraction failed for chunk {chunk_id}: {e}\")\n        protokoll.log_llm_call(\n            request=f\"[statement_extraction] chunk_id={chunk_id}\",\n            model_name=ANTHROPIC_MODEL if client else f\"ollama:{OLLAMA_CHAT_MODEL}\",\n            status=\"error\",\n            error_message=str(e),\n        )\n        return []\n\n\ndef find_entity_by_name(name: str) -> dict | None:\n    \"\"\"\n    Find entity by name or canonical_name.\n\n    Args:\n        name: Entity name to search for\n\n    Returns:\n        Entity dict with id, name, canonical_name or None\n    \"\"\"\n    name_lower = name.lower().strip()\n    canonical = name_lower.replace(\" \", \"\").replace(\"-\", \"\")\n\n    # Try exact canonical match first\n    
cursor = db.execute(\n        \"SELECT id, name, canonical_name FROM entities WHERE canonical_name = %s LIMIT 1\",\n        (canonical,),\n    )\n    result = cursor.fetchone()\n    cursor.close()\n    if result:\n        return result\n\n    # Try name match (case-insensitive)\n    cursor = db.execute(\n        \"SELECT id, name, canonical_name FROM entities WHERE LOWER(name) = %s LIMIT 1\",\n        (name_lower,),\n    )\n    result = cursor.fetchone()\n    cursor.close()\n    if result:\n        return result\n\n    # Try partial canonical match\n    cursor = db.execute(\n        \"SELECT id, name, canonical_name FROM entities WHERE canonical_name LIKE %s LIMIT 1\",\n        (f\"%{canonical}%\",),\n    )\n    result = cursor.fetchone()\n    cursor.close()\n    return result\n\n\ndef store_statements(chunk_id: int, statements: list[dict]) -> int:\n    \"\"\"\n    Store extracted statements in the database with entity linking.\n\n    Args:\n        chunk_id: ID of the source chunk\n        statements: List of statement dicts with subject, predicate, object, confidence\n\n    Returns:\n        Number of successfully stored statements\n    \"\"\"\n    stored = 0\n\n    for stmt in statements:\n        try:\n            subject_name = stmt.get(\"subject\", \"\").strip()\n            predicate = stmt.get(\"predicate\", \"\").strip()\n            object_value = stmt.get(\"object\", \"\").strip()\n            confidence = float(stmt.get(\"confidence\", 0.8))\n\n            if not subject_name or not predicate:\n                continue\n\n            # Find subject entity\n            subject_entity = find_entity_by_name(subject_name)\n            if not subject_entity:\n                db.log(\"DEBUG\", f\"Subject entity not found: {subject_name}\")\n                continue  # Subject must be a known entity\n\n            subject_entity_id = subject_entity[\"id\"]\n\n            # Try to find object entity, otherwise use as literal\n            object_entity_id = 
None\n            object_literal = None\n\n            if object_value:\n                object_entity = find_entity_by_name(object_value)\n                if object_entity:\n                    object_entity_id = object_entity[\"id\"]\n                else:\n                    object_literal = object_value\n\n            # Insert statement\n            cursor = db.execute(\n                \"\"\"INSERT INTO statements\n                   (subject_entity_id, predicate, object_entity_id, object_literal,\n                    chunk_id, confidence, status, created_at)\n                   VALUES (%s, %s, %s, %s, %s, %s, 'extracted', NOW())\"\"\",\n                (\n                    subject_entity_id,\n                    predicate[:255],  # Truncate to VARCHAR(255)\n                    object_entity_id,\n                    object_literal,\n                    chunk_id,\n                    confidence,\n                ),\n            )\n            db.commit()\n            cursor.close()\n            stored += 1\n\n        except Exception as e:\n            db.log(\"WARNING\", f\"Failed to store statement: {e}\")\n\n    return stored\n\n\ndef analyze_chunk_statements(chunk_id: int, content: str, client=None) -> int:\n    \"\"\"\n    Extract and store statements for a single chunk.\n\n    Args:\n        chunk_id: ID of the chunk\n        content: Text content of the chunk\n        client: Optional Anthropic client\n\n    Returns:\n        Number of statements stored\n    \"\"\"\n    statements = extract_statements(chunk_id, content, client)\n    if statements:\n        stored = store_statements(chunk_id, statements)\n        db.log(\"INFO\", f\"Chunk {chunk_id}: {stored}\/{len(statements)} statements stored\")\n        return stored\n    return 0\n\n\ndef analyze_document_statements(document_id: int, client=None, progress=None) -> int:\n    \"\"\"\n    Extract statements from all chunks of a document.\n\n    Args:\n        document_id: ID of the document\n        
client: Optional Anthropic client\n        progress: Optional PipelineProgress for logging\n\n    Returns:\n        Total number of statements stored\n    \"\"\"\n    cursor = db.execute(\n        \"SELECT id, content FROM chunks WHERE document_id = %s ORDER BY chunk_index\",\n        (document_id,),\n    )\n    chunks = cursor.fetchall()\n    cursor.close()\n\n    total = len(chunks)\n    db.log(\"INFO\", f\"Extracting statements from {total} chunks\")\n    if progress:\n        progress.add_log(f\"Statements: Extrahiere aus {total} Chunks...\")\n\n    total_stored = 0\n    for i, chunk in enumerate(chunks, 1):\n        if progress:\n            progress.add_log(f\"Statements: Chunk {i}\/{total}...\")\n\n        stored = analyze_chunk_statements(chunk[\"id\"], chunk[\"content\"], client)\n        total_stored += stored\n\n    db.log(\"INFO\", f\"Total statements stored: {total_stored}\")\n    if progress:\n        progress.add_log(f\"Statements: {total_stored} Aussagen extrahiert\")\n\n    return total_stored\n\n\nif __name__ == \"__main__\":\n    test_text = \"\"\"\n    Carl Rogers entwickelte die klientenzentrierte Gesprächstherapie.\n    Diese Methode basiert auf Empathie, Kongruenz und bedingungsloser Wertschätzung.\n    Sie wird heute weltweit in der Psychotherapie eingesetzt.\n    \"\"\"\n\n    print(\"Testing entity extraction with Ollama...\")\n    entities = extract_entities_ollama(test_text)\n    print(f\"Found {len(entities)} entities:\")\n    for e in entities:\n        print(f\"  - {e['name']} ({e['type']})\")",
        "originalFile": "\"\"\"\nSemantic analysis for KI-System Pipeline\nEntity extraction, taxonomy classification, ontology mapping.\n\"\"\"\n\nimport json\nimport os\nimport time\n\nimport requests\n\nfrom config import ANTHROPIC_API_KEY, ANTHROPIC_MODEL, OLLAMA_CHAT_MODEL, OLLAMA_HOST\nfrom db import db\nfrom protokoll import protokoll\n\n\ndef get_anthropic_client():\n    \"\"\"Get Anthropic API client.\"\"\"\n    try:\n        import anthropic\n\n        if ANTHROPIC_API_KEY:\n            db.log(\"INFO\", \"Using Anthropic API (Claude)\")\n            return anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)\n        else:\n            db.log(\"WARNING\", \"No Anthropic API key found, falling back to Ollama\")\n    except ImportError:\n        db.log(\"WARNING\", \"anthropic package not installed, falling back to Ollama\")\n    return None\n\n\ndef extract_entities_ollama(text, model=\"gemma3:27b-it-qat\"):\n    \"\"\"Extract entities using Ollama.\"\"\"\n    # Load prompt from database\n    prompt_template = db.get_prompt(\"entity_extraction\")\n\n    if not prompt_template:\n        db.log(\"WARNING\", \"entity_extraction prompt not found in DB, using fallback\")\n        prompt_template = \"\"\"Analysiere den Text und extrahiere wichtige Entitäten.\nKategorisiere als: PERSON, ORGANIZATION, CONCEPT, LOCATION\nAntworte NUR im JSON-Format:\n{\"entities\": [{\"name\": \"...\", \"type\": \"...\", \"description\": \"...\"}]}\n\nText:\n{{TEXT}}\"\"\"\n\n    prompt = prompt_template.replace(\"{{TEXT}}\", text[:3000])\n\n    try:\n        start_time = time.time()\n        response = requests.post(\n            f\"{OLLAMA_HOST}\/api\/generate\",\n            json={\"model\": model, \"prompt\": prompt, \"stream\": False, \"format\": \"json\"},\n            timeout=120,\n        )\n        response.raise_for_status()\n        data = response.json()\n        duration_ms = int((time.time() - start_time) * 1000)\n\n        # Parse JSON from response\n        response_text 
= data.get(\"response\", \"{}\")\n\n        # Log to ki-protokoll\n        protokoll.log_llm_call(\n            request=f\"[entity_extraction] {prompt[:500]}...\",\n            response=response_text[:2000],\n            model_name=f\"ollama:{model}\",\n            tokens_input=data.get(\"prompt_eval_count\", 0),\n            tokens_output=data.get(\"eval_count\", 0),\n            duration_ms=duration_ms,\n            status=\"completed\",\n        )\n\n        try:\n            entities = json.loads(response_text)\n            return entities.get(\"entities\", [])\n        except json.JSONDecodeError:\n            db.log(\"WARNING\", \"Failed to parse entity JSON from Ollama\")\n            return []\n    except Exception as e:\n        db.log(\"ERROR\", f\"Ollama entity extraction failed: {e}\")\n        protokoll.log_llm_call(\n            request=f\"[entity_extraction] {prompt[:500]}...\",\n            model_name=f\"ollama:{model}\",\n            status=\"error\",\n            error_message=str(e),\n        )\n        return []\n\n\ndef extract_entities_anthropic(text, client):\n    \"\"\"Extract entities using Anthropic Claude.\"\"\"\n    # Get prompt from database\n    prompt_template = db.get_prompt(\"entity_extraction\")\n\n    if not prompt_template:\n        prompt_template = \"\"\"Analysiere den folgenden deutschen Text und extrahiere alle wichtigen Entitäten.\n\nKategorisiere jede Entität als:\n- PERSON (Namen von Personen)\n- ORGANIZATION (Firmen, Institutionen, Gruppen)\n- CONCEPT (Fachbegriffe, Methoden, Theorien)\n- LOCATION (Orte, Länder)\n- DATE (Zeitangaben)\n- OTHER (Sonstiges)\n\nAntworte NUR im JSON-Format:\n{\"entities\": [{\"name\": \"...\", \"type\": \"...\", \"context\": \"kurzer Kontext der Erwähnung\"}]}\n\nText:\n{{TEXT}}\"\"\"\n\n    prompt = prompt_template.replace(\"{{TEXT}}\", text[:4000])\n\n    try:\n        start_time = time.time()\n        message = client.messages.create(\n            model=ANTHROPIC_MODEL, max_tokens=2000, 
messages=[{\"role\": \"user\", \"content\": prompt}]\n        )\n        duration_ms = int((time.time() - start_time) * 1000)\n\n        response_text = message.content[0].text\n\n        # Log to ki-protokoll\n        protokoll.log_llm_call(\n            request=f\"[entity_extraction] {prompt[:500]}...\",\n            response=response_text[:2000],\n            model_name=ANTHROPIC_MODEL,\n            tokens_input=message.usage.input_tokens,\n            tokens_output=message.usage.output_tokens,\n            duration_ms=duration_ms,\n            status=\"completed\",\n        )\n\n        # Extract JSON from response\n        import re\n\n        json_match = re.search(r\"\\{[\\s\\S]*\\}\", response_text)\n        if json_match:\n            entities = json.loads(json_match.group())\n            return entities.get(\"entities\", [])\n        return []\n    except Exception as e:\n        db.log(\"ERROR\", f\"Anthropic entity extraction failed: {e}\")\n        protokoll.log_llm_call(\n            request=f\"[entity_extraction] {prompt[:500]}...\",\n            model_name=ANTHROPIC_MODEL,\n            status=\"error\",\n            error_message=str(e),\n        )\n        return []\n\n\ndef extract_relations(text, entities, client=None):\n    \"\"\"Extract relations between entities.\"\"\"\n    if not entities or len(entities) < 2:\n        return []\n\n    entity_names = [e[\"name\"] for e in entities[:20]]\n\n    # Load prompt from database\n    prompt_template = db.get_prompt(\"relation_extraction\")\n\n    if not prompt_template:\n        db.log(\"WARNING\", \"relation_extraction prompt not found in DB, using fallback\")\n        prompt_template = \"\"\"Identifiziere Beziehungen zwischen Entitäten.\nEntitäten: {{ENTITIES}}\nBeziehungstypen: DEVELOPED_BY, RELATED_TO, PART_OF, USED_IN, BASED_ON\nAntworte NUR im JSON-Format:\n{\"relations\": [{\"source\": \"...\", \"relation\": \"...\", \"target\": \"...\"}]}\n\nText:\n{{TEXT}}\"\"\"\n\n    prompt = 
prompt_template.replace(\"{{ENTITIES}}\", \", \".join(entity_names))\n    prompt = prompt.replace(\"{{TEXT}}\", text[:3000])\n\n    try:\n        start_time = time.time()\n        tokens_in, tokens_out = 0, 0\n        model_name = \"\"\n\n        if client:\n            message = client.messages.create(\n                model=ANTHROPIC_MODEL, max_tokens=1000, messages=[{\"role\": \"user\", \"content\": prompt}]\n            )\n            response_text = message.content[0].text\n            tokens_in = message.usage.input_tokens\n            tokens_out = message.usage.output_tokens\n            model_name = ANTHROPIC_MODEL\n        else:\n            response = requests.post(\n                f\"{OLLAMA_HOST}\/api\/generate\",\n                json={\"model\": OLLAMA_CHAT_MODEL, \"prompt\": prompt, \"stream\": False, \"format\": \"json\"},\n                timeout=120,\n            )\n            response.raise_for_status()\n            data = response.json()\n            response_text = data.get(\"response\", \"{}\")\n            tokens_in = data.get(\"prompt_eval_count\", 0)\n            tokens_out = data.get(\"eval_count\", 0)\n            model_name = f\"ollama:{OLLAMA_CHAT_MODEL}\"\n\n        duration_ms = int((time.time() - start_time) * 1000)\n\n        # Log to ki-protokoll\n        protokoll.log_llm_call(\n            request=f\"[relation_extraction] {prompt[:500]}...\",\n            response=response_text[:2000],\n            model_name=model_name,\n            tokens_input=tokens_in,\n            tokens_output=tokens_out,\n            duration_ms=duration_ms,\n            status=\"completed\",\n        )\n\n        import re\n\n        json_match = re.search(r\"\\{[\\s\\S]*\\}\", response_text)\n        if json_match:\n            data = json.loads(json_match.group())\n            return data.get(\"relations\", [])\n        return []\n    except Exception as e:\n        db.log(\"ERROR\", f\"Relation extraction failed: {e}\")\n        
protokoll.log_llm_call(\n            request=f\"[relation_extraction] {prompt[:500]}...\",\n            model_name=ANTHROPIC_MODEL if client else f\"ollama:{OLLAMA_CHAT_MODEL}\",\n            status=\"error\",\n            error_message=str(e),\n        )\n        return []\n\n\ndef classify_taxonomy(text, client=None):\n    \"\"\"Classify text into taxonomy categories.\"\"\"\n    prompt_template = db.get_prompt(\"taxonomy_classification\")\n\n    if not prompt_template:\n        prompt_template = \"\"\"Klassifiziere den folgenden Text in passende Kategorien.\n\nWähle aus diesen Hauptkategorien:\n- Methoden (Therapiemethoden, Techniken)\n- Theorie (Konzepte, Modelle, Grundlagen)\n- Praxis (Anwendung, Fallbeispiele)\n- Organisation (Strukturen, Prozesse)\n- Kommunikation (Gesprächsführung, Interaktion)\n- Entwicklung (Persönliche Entwicklung, Veränderung)\n\nAntworte NUR im JSON-Format:\n{\"categories\": [\"...\", \"...\"], \"confidence\": 0.0-1.0}\n\nText:\n{{TEXT}}\"\"\"\n\n    prompt = prompt_template.replace(\"{{TEXT}}\", text[:2000])\n\n    try:\n        start_time = time.time()\n        tokens_in, tokens_out = 0, 0\n        model_name = \"\"\n\n        if client:\n            message = client.messages.create(\n                model=ANTHROPIC_MODEL, max_tokens=500, messages=[{\"role\": \"user\", \"content\": prompt}]\n            )\n            response_text = message.content[0].text\n            tokens_in = message.usage.input_tokens\n            tokens_out = message.usage.output_tokens\n            model_name = ANTHROPIC_MODEL\n        else:\n            response = requests.post(\n                f\"{OLLAMA_HOST}\/api\/generate\",\n                json={\"model\": OLLAMA_CHAT_MODEL, \"prompt\": prompt, \"stream\": False, \"format\": \"json\"},\n                timeout=60,\n            )\n            response.raise_for_status()\n            data = response.json()\n            response_text = data.get(\"response\", \"{}\")\n            tokens_in = 
data.get(\"prompt_eval_count\", 0)\n            tokens_out = data.get(\"eval_count\", 0)\n            model_name = f\"ollama:{OLLAMA_CHAT_MODEL}\"\n\n        duration_ms = int((time.time() - start_time) * 1000)\n\n        # Log to ki-protokoll\n        protokoll.log_llm_call(\n            request=f\"[taxonomy_classification] {prompt[:500]}...\",\n            response=response_text[:2000],\n            model_name=model_name,\n            tokens_input=tokens_in,\n            tokens_output=tokens_out,\n            duration_ms=duration_ms,\n            status=\"completed\",\n        )\n\n        import re\n\n        json_match = re.search(r\"\\{[\\s\\S]*\\}\", response_text)\n        if json_m... [TRUNCATED-563c47d6afac0dae]",
        "structuredPatch": [
            {
                "oldStart": 665,
                "oldLines": 6,
                "newStart": 665,
                "newLines": 274,
                "lines": [
                    "     return analyzed",
                    " ",
                    " ",
                    "+def extract_statements(chunk_id: int, text: str, client=None) -> list[dict]:",
                    "+    \"\"\"",
                    "+    Extract SPO-triplets (Subject-Predicate-Object statements) from text.",
                    "+",
                    "+    Args:",
                    "+        chunk_id: ID of the chunk being analyzed",
                    "+        text: Text content to extract statements from",
                    "+        client: Optional Anthropic client (falls back to Ollama if None)",
                    "+",
                    "+    Returns:",
                    "+        List of extracted statements with entity linking",
                    "+    \"\"\"",
                    "+    prompt_template = db.get_prompt(\"statement_extraction\")",
                    "+",
                    "+    if not prompt_template:",
                    "+        db.log(\"WARNING\", \"statement_extraction prompt not found in DB, using fallback\")",
                    "+        prompt_template = \"\"\"Extrahiere alle faktischen Aussagen aus dem Text als SPO-Tripel.",
                    "+",
                    "+Regeln:",
                    "+- Subject: Eine benannte Entität (Person, Organisation, Konzept, Methode)",
                    "+- Predicate: Die Beziehung oder Eigenschaft (z.B. \"entwickelte\", \"basiert auf\", \"ist Teil von\")",
                    "+- Object: Eine Entität oder ein Literal-Wert",
                    "+",
                    "+Antworte NUR im JSON-Format:",
                    "+{\"statements\": [",
                    "+  {\"subject\": \"Name der Subject-Entität\", \"predicate\": \"Beziehung\", \"object\": \"Name oder Wert\", \"confidence\": 0.0-1.0}",
                    "+]}",
                    "+",
                    "+Text:",
                    "+{{TEXT}}\"\"\"",
                    "+",
                    "+    prompt = prompt_template.replace(\"{{TEXT}}\", text[:3000])",
                    "+",
                    "+    try:",
                    "+        start_time = time.time()",
                    "+        tokens_in, tokens_out = 0, 0",
                    "+        model_name = \"\"",
                    "+",
                    "+        if client:",
                    "+            message = client.messages.create(",
                    "+                model=ANTHROPIC_MODEL,",
                    "+                max_tokens=1500,",
                    "+                messages=[{\"role\": \"user\", \"content\": prompt}],",
                    "+            )",
                    "+            response_text = message.content[0].text",
                    "+            tokens_in = message.usage.input_tokens",
                    "+            tokens_out = message.usage.output_tokens",
                    "+            model_name = ANTHROPIC_MODEL",
                    "+        else:",
                    "+            response = requests.post(",
                    "+                f\"{OLLAMA_HOST}\/api\/generate\",",
                    "+                json={",
                    "+                    \"model\": OLLAMA_CHAT_MODEL,",
                    "+                    \"prompt\": prompt,",
                    "+                    \"stream\": False,",
                    "+                    \"format\": \"json\",",
                    "+                },",
                    "+                timeout=120,",
                    "+            )",
                    "+            response.raise_for_status()",
                    "+            data = response.json()",
                    "+            response_text = data.get(\"response\", \"{}\")",
                    "+            tokens_in = data.get(\"prompt_eval_count\", 0)",
                    "+            tokens_out = data.get(\"eval_count\", 0)",
                    "+            model_name = f\"ollama:{OLLAMA_CHAT_MODEL}\"",
                    "+",
                    "+        duration_ms = int((time.time() - start_time) * 1000)",
                    "+",
                    "+        # Log to ki-protokoll",
                    "+        protokoll.log_llm_call(",
                    "+            request=f\"[statement_extraction] chunk_id={chunk_id}\",",
                    "+            response=response_text[:2000],",
                    "+            model_name=model_name,",
                    "+            tokens_input=tokens_in,",
                    "+            tokens_output=tokens_out,",
                    "+            duration_ms=duration_ms,",
                    "+            status=\"completed\",",
                    "+        )",
                    "+",
                    "+        # Parse JSON",
                    "+        import re",
                    "+",
                    "+        json_match = re.search(r\"\\{[\\s\\S]*\\}\", response_text)",
                    "+        if json_match:",
                    "+            result = json.loads(json_match.group())",
                    "+            return result.get(\"statements\", [])",
                    "+        return []",
                    "+",
                    "+    except Exception as e:",
                    "+        db.log(\"ERROR\", f\"Statement extraction failed for chunk {chunk_id}: {e}\")",
                    "+        protokoll.log_llm_call(",
                    "+            request=f\"[statement_extraction] chunk_id={chunk_id}\",",
                    "+            model_name=ANTHROPIC_MODEL if client else f\"ollama:{OLLAMA_CHAT_MODEL}\",",
                    "+            status=\"error\",",
                    "+            error_message=str(e),",
                    "+        )",
                    "+        return []",
                    "+",
                    "+",
                    "+def find_entity_by_name(name: str) -> dict | None:",
                    "+    \"\"\"",
                    "+    Find entity by name or canonical_name.",
                    "+",
                    "+    Args:",
                    "+        name: Entity name to search for",
                    "+",
                    "+    Returns:",
                    "+        Entity dict with id, name, canonical_name or None",
                    "+    \"\"\"",
                    "+    name_lower = name.lower().strip()",
                    "+    canonical = name_lower.replace(\" \", \"\").replace(\"-\", \"\")",
                    "+",
                    "+    # Try exact canonical match first",
                    "+    cursor = db.execute(",
                    "+        \"SELECT id, name, canonical_name FROM entities WHERE canonical_name = %s LIMIT 1\",",
                    "+        (canonical,),",
                    "+    )",
                    "+    result = cursor.fetchone()",
                    "+    cursor.close()",
                    "+    if result:",
                    "+        return result",
                    "+",
                    "+    # Try name match (case-insensitive)",
                    "+    cursor = db.execute(",
                    "+        \"SELECT id, name, canonical_name FROM entities WHERE LOWER(name) = %s LIMIT 1\",",
                    "+        (name_lower,),",
                    "+    )",
                    "+    result = cursor.fetchone()",
                    "+    cursor.close()",
                    "+    if result:",
                    "+        return result",
                    "+",
                    "+    # Try partial canonical match",
                    "+    cursor = db.execute(",
                    "+        \"SELECT id, name, canonical_name FROM entities WHERE canonical_name LIKE %s LIMIT 1\",",
                    "+        (f\"%{canonical}%\",),",
                    "+    )",
                    "+    result = cursor.fetchone()",
                    "+    cursor.close()",
                    "+    return result",
                    "+",
                    "+",
                    "+def store_statements(chunk_id: int, statements: list[dict]) -> int:",
                    "+    \"\"\"",
                    "+    Store extracted statements in the database with entity linking.",
                    "+",
                    "+    Args:",
                    "+        chunk_id: ID of the source chunk",
                    "+        statements: List of statement dicts with subject, predicate, object, confidence",
                    "+",
                    "+    Returns:",
                    "+        Number of successfully stored statements",
                    "+    \"\"\"",
                    "+    stored = 0",
                    "+",
                    "+    for stmt in statements:",
                    "+        try:",
                    "+            subject_name = stmt.get(\"subject\", \"\").strip()",
                    "+            predicate = stmt.get(\"predicate\", \"\").strip()",
                    "+            object_value = stmt.get(\"object\", \"\").strip()",
                    "+            confidence = float(stmt.get(\"confidence\", 0.8))",
                    "+",
                    "+            if not subject_name or not predicate:",
                    "+                continue",
                    "+",
                    "+            # Find subject entity",
                    "+            subject_entity = find_entity_by_name(subject_name)",
                    "+            if not subject_entity:",
                    "+                db.log(\"DEBUG\", f\"Subject entity not found: {subject_name}\")",
                    "+                continue  # Subject must be a known entity",
                    "+",
                    "+            subject_entity_id = subject_entity[\"id\"]",
                    "+",
                    "+            # Try to find object entity, otherwise use as literal",
                    "+            object_entity_id = None",
                    "+            object_literal = None",
                    "+",
                    "+            if object_value:",
                    "+                object_entity = find_entity_by_name(object_value)",
                    "+                if object_entity:",
                    "+                    object_entity_id = object_entity[\"id\"]",
                    "+                else:",
                    "+                    object_literal = object_value",
                    "+",
                    "+            # Insert statement",
                    "+            cursor = db.execute(",
                    "+                \"\"\"INSERT INTO statements",
                    "+                   (subject_entity_id, predicate, object_entity_id, object_literal,",
                    "+                    chunk_id, confidence, status, created_at)",
                    "+                   VALUES (%s, %s, %s, %s, %s, %s, 'extracted', NOW())\"\"\",",
                    "+                (",
                    "+                    subject_entity_id,",
                    "+                    predicate[:255],  # Truncate to VARCHAR(255)",
                    "+                    object_entity_id,",
                    "+                    object_literal,",
                    "+                    chunk_id,",
                    "+                    confidence,",
                    "+                ),",
                    "+            )",
                    "+            db.commit()",
                    "+            cursor.close()",
                    "+            stored += 1",
                    "+",
                    "+        except Exception as e:",
                    "+            db.log(\"WARNING\", f\"Failed to store statement: {e}\")",
                    "+",
                    "+    return stored",
                    "+",
                    "+",
                    "+def analyze_chunk_statements(chunk_id: int, content: str, client=None) -> int:",
                    "+    \"\"\"",
                    "+    Extract and store statements for a single chunk.",
                    "+",
                    "+    Args:",
                    "+        chunk_id: ID of the chunk",
                    "+        content: Text content of the chunk",
                    "+        client: Optional Anthropic client",
                    "+",
                    "+    Returns:",
                    "+        Number of statements stored",
                    "+    \"\"\"",
                    "+    statements = extract_statements(chunk_id, content, client)",
                    "+    if statements:",
                    "+        stored = store_statements(chunk_id, statements)",
                    "+        db.log(\"INFO\", f\"Chunk {chunk_id}: {stored}\/{len(statements)} statements stored\")",
                    "+        return stored",
                    "+    return 0",
                    "+",
                    "+",
                    "+def analyze_document_statements(document_id: int, client=None, progress=None) -> int:",
                    "+    \"\"\"",
                    "+    Extract statements from all chunks of a document.",
                    "+",
                    "+    Args:",
                    "+        document_id: ID of the document",
                    "+        client: Optional Anthropic client",
                    "+        progress: Optional PipelineProgress for logging",
                    "+",
                    "+    Returns:",
                    "+        Total number of statements stored",
                    "+    \"\"\"",
                    "+    cursor = db.execute(",
                    "+        \"SELECT id, content FROM chunks WHERE document_id = %s ORDER BY chunk_index\",",
                    "+        (document_id,),",
                    "+    )",
                    "+    chunks = cursor.fetchall()",
                    "+    cursor.close()",
                    "+",
                    "+    total = len(chunks)",
                    "+    db.log(\"INFO\", f\"Extracting statements from {total} chunks\")",
                    "+    if progress:",
                    "+        progress.add_log(f\"Statements: Extrahiere aus {total} Chunks...\")",
                    "+",
                    "+    total_stored = 0",
                    "+    for i, chunk in enumerate(chunks, 1):",
                    "+        if progress:",
                    "+            progress.add_log(f\"Statements: Chunk {i}\/{total}...\")",
                    "+",
                    "+        stored = analyze_chunk_statements(chunk[\"id\"], chunk[\"content\"], client)",
                    "+        total_stored += stored",
                    "+",
                    "+    db.log(\"INFO\", f\"Total statements stored: {total_stored}\")",
                    "+    if progress:",
                    "+        progress.add_log(f\"Statements: {total_stored} Aussagen extrahiert\")",
                    "+",
                    "+    return total_stored",
                    "+",
                    "+",
                    " if __name__ == \"__main__\":",
                    "     test_text = \"\"\"",
                    "     Carl Rogers entwickelte die klientenzentrierte Gesprächstherapie."
                ]
            }
        ],
        "userModified": false,
        "replaceAll": false
    }
}
← Vorheriger Zur Liste Nächster →