Protokoll #30191

ID: 30191
Zeitstempel: 2025-12-30 11:59:08.383011
Client: root
IP: 145.224.72.245
Modell: claude-sonnet-4-20250514
Status: completed
Tokens: 4,299 (Input: 692, Output: 3,607)
Dauer: 78 ms
Request-Zeit: 2025-12-30 11:59:08.383011
Response-Zeit: 2025-12-30 11:59:08.460719

Request

{
    "event": "PreToolUse",
    "tool_name": "Edit",
    "tool_input": {
        "file_path": "\/var\/www\/scripts\/pipeline\/backfill_knowledge_semantics.py",
        "old_string": "def store_semantics(entity_id: int, semantics: dict) -> bool:\n    \"\"\"Store knowledge semantics to database.\"\"\"\n    try:\n        cursor = db.execute(\n            \"\"\"INSERT INTO entity_knowledge_semantics\n               (entity_id, semantic_role, properties, functional_category,\n                context_meaning, model_used)\n               VALUES (%s, %s, %s, %s, %s, %s)\n               ON DUPLICATE KEY UPDATE\n               semantic_role = VALUES(semantic_role),\n               properties = VALUES(properties),\n               functional_category = VALUES(functional_category),\n               context_meaning = VALUES(context_meaning),\n               model_used = VALUES(model_used),\n               updated_at = NOW()\"\"\",\n            (\n                entity_id,\n                semantics.get(\"semantic_role\"),\n                json.dumps(semantics.get(\"properties\", {})),\n                semantics.get(\"functional_category\"),\n                semantics.get(\"context_meaning\"),\n                semantics.get(\"model_used\"),\n            ),\n        )\n        db.commit()\n        cursor.close()\n        return True\n    except Exception as e:\n        db.log(\"ERROR\", f\"Backfill: Failed to store knowledge semantics for entity {entity_id}: {e}\")\n        return False",
        "new_string": "def store_semantics(entity_id: int, semantics: dict) -> bool:\n    \"\"\"Store knowledge semantics to database.\"\"\"\n    try:\n        cursor = db.execute(\n            \"\"\"INSERT INTO entity_semantics\n               (entity_id, semantic_role, properties, functional_category,\n                definition, model_used)\n               VALUES (%s, %s, %s, %s, %s, %s)\n               ON DUPLICATE KEY UPDATE\n               semantic_role = VALUES(semantic_role),\n               properties = VALUES(properties),\n               functional_category = VALUES(functional_category),\n               definition = VALUES(definition),\n               model_used = VALUES(model_used),\n               updated_at = NOW()\"\"\",\n            (\n                entity_id,\n                semantics.get(\"semantic_role\"),\n                json.dumps(semantics.get(\"properties\", {})),\n                semantics.get(\"functional_category\"),\n                semantics.get(\"context_meaning\"),\n                semantics.get(\"model_used\"),\n            ),\n        )\n        db.commit()\n        cursor.close()\n        return True\n    except Exception as e:\n        db.log(\"ERROR\", f\"Backfill: Failed to store knowledge semantics for entity {entity_id}: {e}\")\n        return False"
    }
}

Response

{
    "tool_response": {
        "filePath": "\/var\/www\/scripts\/pipeline\/backfill_knowledge_semantics.py",
        "oldString": "def store_semantics(entity_id: int, semantics: dict) -> bool:\n    \"\"\"Store knowledge semantics to database.\"\"\"\n    try:\n        cursor = db.execute(\n            \"\"\"INSERT INTO entity_knowledge_semantics\n               (entity_id, semantic_role, properties, functional_category,\n                context_meaning, model_used)\n               VALUES (%s, %s, %s, %s, %s, %s)\n               ON DUPLICATE KEY UPDATE\n               semantic_role = VALUES(semantic_role),\n               properties = VALUES(properties),\n               functional_category = VALUES(functional_category),\n               context_meaning = VALUES(context_meaning),\n               model_used = VALUES(model_used),\n               updated_at = NOW()\"\"\",\n            (\n                entity_id,\n                semantics.get(\"semantic_role\"),\n                json.dumps(semantics.get(\"properties\", {})),\n                semantics.get(\"functional_category\"),\n                semantics.get(\"context_meaning\"),\n                semantics.get(\"model_used\"),\n            ),\n        )\n        db.commit()\n        cursor.close()\n        return True\n    except Exception as e:\n        db.log(\"ERROR\", f\"Backfill: Failed to store knowledge semantics for entity {entity_id}: {e}\")\n        return False",
        "newString": "def store_semantics(entity_id: int, semantics: dict) -> bool:\n    \"\"\"Store knowledge semantics to database.\"\"\"\n    try:\n        cursor = db.execute(\n            \"\"\"INSERT INTO entity_semantics\n               (entity_id, semantic_role, properties, functional_category,\n                definition, model_used)\n               VALUES (%s, %s, %s, %s, %s, %s)\n               ON DUPLICATE KEY UPDATE\n               semantic_role = VALUES(semantic_role),\n               properties = VALUES(properties),\n               functional_category = VALUES(functional_category),\n               definition = VALUES(definition),\n               model_used = VALUES(model_used),\n               updated_at = NOW()\"\"\",\n            (\n                entity_id,\n                semantics.get(\"semantic_role\"),\n                json.dumps(semantics.get(\"properties\", {})),\n                semantics.get(\"functional_category\"),\n                semantics.get(\"context_meaning\"),\n                semantics.get(\"model_used\"),\n            ),\n        )\n        db.commit()\n        cursor.close()\n        return True\n    except Exception as e:\n        db.log(\"ERROR\", f\"Backfill: Failed to store knowledge semantics for entity {entity_id}: {e}\")\n        return False",
        "originalFile": "#!\/usr\/bin\/env python3\n\"\"\"\nBackfill Knowledge Semantics for existing entities.\n\nAnalyzes all entities that don't have knowledge_semantics yet.\nResume-capable: skips already analyzed entities.\n\nUsage:\n    python backfill_knowledge_semantics.py              # Process all pending\n    python backfill_knowledge_semantics.py --limit 100  # Process max 100\n    python backfill_knowledge_semantics.py --batch 25   # Batch size 25\n    python backfill_knowledge_semantics.py --dry-run    # Just count, don't process\n\"\"\"\n\nimport argparse\nimport json\nimport time\n\nimport ollama\n\nfrom db import db\nfrom json_utils import extract_json\n\n# Pipeline-ID für Wissenschaftliche Pipeline\nDEFAULT_PIPELINE_ID = 5\n\n\ndef get_pipeline_model(step_type: str, pipeline_id: int = DEFAULT_PIPELINE_ID) -> str:\n    \"\"\"Get model from pipeline_steps config - NO HARDCODED DEFAULTS.\"\"\"\n    cursor = db.execute(\n        \"\"\"SELECT config FROM pipeline_steps\n           WHERE pipeline_id = %s AND step_type = %s AND enabled = 1\n           LIMIT 1\"\"\",\n        (pipeline_id, step_type),\n    )\n    row = cursor.fetchone()\n    cursor.close()\n\n    if row and row.get(\"config\"):\n        try:\n            config = json.loads(row[\"config\"])\n            model = config.get(\"model\")\n            if model:\n                return model\n        except json.JSONDecodeError:\n            pass\n\n    raise ValueError(f\"No model configured for step_type={step_type} in pipeline {pipeline_id}\")\n\n\n# Valid ENUM values for validation\nVALID_SEMANTIC_ROLES = {\"agent\", \"patient\", \"instrument\", \"location\", \"cause\", \"effect\"}\nVALID_FUNCTIONAL_CATEGORIES = {\"method\", \"tool\", \"concept\", \"actor\", \"outcome\", \"process\"}\n\nPROMPT_TEMPLATE = \"\"\"Analysiere die Bedeutung dieser Entität im Kontext.\n\nEntität: {name}\nTyp: {entity_type}\nKontext: {context}\n\nBestimme:\n1. semantic_role: Welche Rolle spielt die Entität? 
agent, patient, instrument, location, cause, effect\n2. properties: Welche Eigenschaften hat sie? (als JSON-Objekt)\n3. functional_category: Welche Funktion? method, tool, concept, actor, outcome, process\n4. context_meaning: Was bedeutet die Entität in diesem Kontext? (1 Satz)\n\nAntworte NUR mit gültigem JSON:\n{{\n  \"semantic_role\": \"agent|patient|instrument|location|cause|effect\",\n  \"properties\": {{\"key\": \"value\"}},\n  \"functional_category\": \"method|tool|concept|actor|outcome|process\",\n  \"context_meaning\": \"Kurze Erklärung\"\n}}\"\"\"\n\n\ndef validate_and_fix(data: dict) -> dict:\n    \"\"\"Validate and fix ENUM values from LLM response.\"\"\"\n    # semantic_role - handle list\/string\n    role = data.get(\"semantic_role\", \"\")\n    if isinstance(role, list):\n        role = role[0] if role else \"\"\n    role = str(role).lower().strip()\n    if role not in VALID_SEMANTIC_ROLES:\n        if \"agent\" in role or \"akteur\" in role or \"handelnde\" in role:\n            role = \"agent\"\n        elif \"patient\" in role or \"betroffene\" in role:\n            role = \"patient\"\n        elif \"instrument\" in role or \"werkzeug\" in role or \"mittel\" in role:\n            role = \"instrument\"\n        elif \"ort\" in role or \"location\" in role or \"ort\" in role:\n            role = \"location\"\n        elif \"ursache\" in role or \"cause\" in role:\n            role = \"cause\"\n        elif \"wirkung\" in role or \"effect\" in role or \"ergebnis\" in role:\n            role = \"effect\"\n        else:\n            role = \"instrument\"  # Default\n    data[\"semantic_role\"] = role\n\n    # functional_category - handle list\/string\n    cat = data.get(\"functional_category\", \"\")\n    if isinstance(cat, list):\n        cat = cat[0] if cat else \"\"\n    cat = str(cat).lower().strip()\n    if cat not in VALID_FUNCTIONAL_CATEGORIES:\n        if \"method\" in cat or \"methode\" in cat or \"verfahren\" in cat:\n            cat = 
\"method\"\n        elif \"tool\" in cat or \"werkzeug\" in cat:\n            cat = \"tool\"\n        elif \"concept\" in cat or \"konzept\" in cat or \"begriff\" in cat:\n            cat = \"concept\"\n        elif \"actor\" in cat or \"akteur\" in cat:\n            cat = \"actor\"\n        elif \"outcome\" in cat or \"ergebnis\" in cat or \"resultat\" in cat:\n            cat = \"outcome\"\n        elif \"process\" in cat or \"prozess\" in cat or \"ablauf\" in cat:\n            cat = \"process\"\n        else:\n            cat = \"concept\"  # Default\n    data[\"functional_category\"] = cat\n\n    # properties - ensure it's a dict\n    props = data.get(\"properties\", {})\n    if not isinstance(props, dict):\n        props = {}\n    data[\"properties\"] = props\n\n    # context_meaning - ensure it's a string\n    meaning = data.get(\"context_meaning\", \"\")\n    if not isinstance(meaning, str):\n        meaning = str(meaning) if meaning else \"\"\n    # Truncate if too long\n    if len(meaning) > 500:\n        meaning = meaning[:497] + \"...\"\n    data[\"context_meaning\"] = meaning\n\n    return data\n\n\ndef get_pending_entities(limit: int = 0) -> list:\n    \"\"\"Get entities without knowledge semantics, with context from chunks.\"\"\"\n    sql = \"\"\"\n        SELECT e.id, e.name, e.type,\n               GROUP_CONCAT(SUBSTRING(c.content, 1, 500) SEPARATOR ' ... 
') as context\n        FROM entities e\n        LEFT JOIN chunk_entities ce ON e.id = ce.entity_id\n        LEFT JOIN chunks c ON ce.chunk_id = c.id\n        LEFT JOIN entity_semantics es ON e.id = es.entity_id\n        WHERE es.id IS NULL\n        GROUP BY e.id, e.name, e.type\n        ORDER BY e.id\n    \"\"\"\n    if limit > 0:\n        sql = sql.replace(\"ORDER BY e.id\", f\"ORDER BY e.id LIMIT {limit}\")\n\n    cursor = db.execute(sql)\n    entities = cursor.fetchall()\n    cursor.close()\n    return list(entities)\n\n\ndef analyze_entity(entity: dict, model: str) -> dict | None:\n    \"\"\"Analyze a single entity with Ollama.\"\"\"\n    try:\n        context = entity.get(\"context\") or \"\"\n        if len(context) > 1500:\n            context = context[:1500]\n\n        prompt = PROMPT_TEMPLATE.format(\n            name=entity.get(\"name\", \"\"),\n            entity_type=entity.get(\"type\", \"unknown\"),\n            context=context,\n        )\n\n        response = ollama.generate(\n            model=model,\n            prompt=prompt,\n            options={\"num_predict\": 300},\n        )\n\n        response_text = response[\"response\"].strip()\n\n        # Robuste JSON-Extraktion\n        data = extract_json(response_text)\n        if data:\n            data = validate_and_fix(data)\n            data[\"model_used\"] = model\n            return data\n\n    except Exception as e:\n        db.log(\"WARNING\", f\"Backfill: Knowledge semantic analysis failed for entity {entity['id']}: {e}\")\n\n    return None\n\n\ndef store_semantics(entity_id: int, semantics: dict) -> bool:\n    \"\"\"Store knowledge semantics to database.\"\"\"\n    try:\n        cursor = db.execute(\n            \"\"\"INSERT INTO entity_knowledge_semantics\n               (entity_id, semantic_role, properties, functional_category,\n                context_meaning, model_used)\n               VALUES (%s, %s, %s, %s, %s, %s)\n               ON DUPLICATE KEY UPDATE\n               
semantic_role = VALUES(semantic_role),\n               properties = VALUES(properties),\n               functional_category = VALUES(functional_category),\n               context_meaning = VALUES(context_meaning),\n               model_used = VALUES(model_used),\n               updated_at = NOW()\"\"\",\n            (\n                entity_id,\n                semantics.get(\"semantic_role\"),\n                json.dumps(semantics.get(\"properties\", {})),\n                semantics.get(\"functional_category\"),\n                semantics.get(\"context_meaning\"),\n                semantics.get(\"model_used\"),\n            ),\n        )\n        db.commit()\n        cursor.close()\n        return True\n    except Exception as e:\n        db.log(\"ERROR\", f\"Backfill: Failed to store knowledge semantics for entity {entity_id}: {e}\")\n        return False\n\n\ndef main():\n    \"\"\"Main entry point.\"\"\"\n    parser = argparse.ArgumentParser(description=\"Backfill Knowledge Semantics\")\n    parser.add_argument(\"--limit\", type=int, default=0, help=\"Max entities to process (0=all)\")\n    parser.add_argument(\"--batch\", type=int, default=25, help=\"Batch size for progress output\")\n    parser.add_argument(\"--model\", default=None, help=\"Override pipeline model (reads from pipeline_steps if not set)\")\n    parser.add_argument(\"--pipeline-id\", type=int, default=DEFAULT_PIPELINE_ID, help=\"Pipeline ID to read config from\")\n    parser.add_argument(\"--dry-run\", action=\"store_true\", help=\"Just count, don't process\")\n\n    args = parser.parse_args()\n\n    db.connect()\n\n    try:\n        # Get model from pipeline config if not overridden\n        if args.model is None:\n            args.model = get_pipeline_model(\"knowledge_semantic_analyze\", args.pipeline_id)\n            print(f\"[Config] Model from pipeline {args.pipeline_id}: {args.model}\")\n\n        # Get pending entities\n        entities = get_pending_entities(args.limit)\n        total 
= len(entities)\n\n        print(\"Knowledge Semantics Backfill\")\n        print(\"=\" * 50)\n        print(f\"Pending entities: {total}\")\n        print(f\"Model: {args.model} (from pipeline_steps)\")\n        print(f\"Batch size: {args.batch}\")\n\n        if args.dry_run:\n            print(\"\\nDry run - no processing\")\n            return\n\n        if total == 0:\n            print(\"\\nNo pending entities - all done!\")\n            return\n\n        print(\"\\nStarting analysis...\")\n        print(\"-\" * 50)\n\n        success = 0\n        errors = 0\n        start_time = time.time()\n\n        for i, entity in enumerate(entities, 1):\n            # Analyze\n            semantics = analyze_entity(entity, args.model)\n\n            if semantics:\n                if store_semantics(entity[\"id\"], semantics):\n                    success += 1\n                else:\n                    errors += 1\n            else:\n                errors += 1\n\n            # Progress output\n            if i % args.batch == 0 or i == total:\n                elapsed = time.time() - start_time\n                rate = i \/ elapsed if elapsed > 0 else 0\n                eta = (total - i) \/ rate if rate > 0 else 0\n                print(f\"  [{i}\/{total}] Success: {s... [TRUNCATED-94e80264c71b39ba]",
        "structuredPatch": [
            {
                "oldStart": 197,
                "oldLines": 15,
                "newStart": 197,
                "newLines": 15,
                "lines": [
                    "     \"\"\"Store knowledge semantics to database.\"\"\"",
                    "     try:",
                    "         cursor = db.execute(",
                    "-            \"\"\"INSERT INTO entity_knowledge_semantics",
                    "+            \"\"\"INSERT INTO entity_semantics",
                    "                (entity_id, semantic_role, properties, functional_category,",
                    "-                context_meaning, model_used)",
                    "+                definition, model_used)",
                    "                VALUES (%s, %s, %s, %s, %s, %s)",
                    "                ON DUPLICATE KEY UPDATE",
                    "                semantic_role = VALUES(semantic_role),",
                    "                properties = VALUES(properties),",
                    "                functional_category = VALUES(functional_category),",
                    "-               context_meaning = VALUES(context_meaning),",
                    "+               definition = VALUES(definition),",
                    "                model_used = VALUES(model_used),",
                    "                updated_at = NOW()\"\"\",",
                    "             ("
                ]
            }
        ],
        "userModified": false,
        "replaceAll": false
    }
}
← Vorheriger Zur Liste Nächster →