Protokoll #18536

ID: 18536
Zeitstempel: 2025-12-26 21:27:21.292338
Client: root
IP: 145.224.72.140
Modell: claude-sonnet-4-20250514
Status: completed
Tokens: 982 (Input: 39, Output: 943)
Dauer: 85 ms
Request-Zeit: 2025-12-26 21:27:21.292338
Response-Zeit: 2025-12-26 21:27:21.377340

Request

{
    "event": "PreToolUse",
    "tool_name": "Read",
    "tool_input": {
        "file_path": "\/var\/www\/scripts\/pipeline\/analyzers\/entity_extractor.py",
        "offset": 1,
        "limit": 120
    }
}

Response

{
    "tool_response": {
        "type": "text",
        "file": {
            "filePath": "\/var\/www\/scripts\/pipeline\/analyzers\/entity_extractor.py",
            "content": "\"\"\"\nEntity Extraction - Extract and store entities from text.\n\"\"\"\n\nimport json\nimport re\nimport sys\nimport time\nimport unicodedata\n\nimport requests\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\n\nfrom config import ANTHROPIC_MODEL, OLLAMA_HOST\nfrom db import db\nfrom protokoll import protokoll\n\n\ndef normalize_name(name: str) -> str:\n    \"\"\"Generate canonical_name from entity name.\n\n    Rules:\n    - Lowercase\n    - German umlauts: ä→ae, ö→oe, ü→ue, ß→ss\n    - Replace spaces with underscores\n    - Remove special characters except underscores\n    - Collapse multiple underscores\n    \"\"\"\n    if not name:\n        return \"\"\n\n    result = name.lower()\n\n    replacements = {\n        \"ä\": \"ae\", \"ö\": \"oe\", \"ü\": \"ue\", \"ß\": \"ss\",\n        \"Ä\": \"ae\", \"Ö\": \"oe\", \"Ü\": \"ue\",\n    }\n    for old, new in replacements.items():\n        result = result.replace(old, new)\n\n    result = unicodedata.normalize(\"NFKD\", result)\n    result = result.encode(\"ascii\", \"ignore\").decode(\"ascii\")\n    result = re.sub(r\"[\\s\\-]+\", \"_\", result)\n    result = re.sub(r\"[^a-z0-9_]\", \"\", result)\n    result = re.sub(r\"_+\", \"_\", result)\n    result = result.strip(\"_\")\n\n    return result\n\n\n# Category to type mapping for new format\nCATEGORY_TYPE_MAP = {\n    \"persons\": \"PERSON\",\n    \"roles\": \"ROLE\",\n    \"organizations\": \"ORGANIZATION\",\n    \"theories\": \"THEORY\",\n    \"models\": \"MODEL\",\n    \"concepts\": \"CONCEPT\",\n    \"artifacts\": \"ARTIFACT\",\n    \"metaphors\": \"METAPHOR\",\n    \"locations\": \"LOCATION\",\n}\n\n\ndef _validate_entity_in_text(entity_name: str, source_text: str) -> bool:\n    \"\"\"Strictly validate that entity appears EXACTLY in source text.\"\"\"\n    if not entity_name or len(entity_name) < 3:\n        return False\n    # Exact match required\n    return entity_name in source_text\n\n\ndef 
_normalize_entity_response(result: dict, source_text: str) -> list[dict]:\n    \"\"\"Normalize entity response to standard format with validation.\n\n    Handles two formats:\n    1. New: {\"persons\":[], \"roles\":[], ...}\n    2. Legacy: {\"entities\": [...]}\n\n    Also validates entities against source text to filter hallucinations.\n    \"\"\"\n    entities = []\n\n    # Check for legacy format\n    if \"entities\" in result:\n        legacy_entities = result.get(\"entities\", [])\n        # Validate legacy entities too\n        for e in legacy_entities:\n            if isinstance(e, dict) and \"name\" in e:\n                if _validate_entity_in_text(e[\"name\"], source_text):\n                    entities.append(e)\n        return entities\n\n    # New categorized format\n    for category, items in result.items():\n        if not isinstance(items, list):\n            continue\n\n        entity_type = CATEGORY_TYPE_MAP.get(category, category.upper())\n\n        for item in items:\n            if not item or not isinstance(item, str):\n                continue\n\n            # Strict validation: entity must appear EXACTLY in source text\n            if not _validate_entity_in_text(item, source_text):\n                continue  # Skip hallucinations\n\n            entities.append({\n                \"name\": item,\n                \"type\": entity_type,\n                \"description\": None,\n            })\n\n    return entities\n\n\ndef _call_ollama(prompt: str, model: str, timeout: int = 120) -> tuple[str, int, int, int]:\n    \"\"\"Call Ollama API and return (response_text, tokens_in, tokens_out, duration_ms).\"\"\"",
            "numLines": 120,
            "startLine": 1,
            "totalLines": 444
        }
    }
}
← Vorheriger Zur Liste Nächster →