{
"event": "PreToolUse",
"tool_name": "Edit",
"tool_input": {
"file_path": "\/var\/www\/scripts\/pipeline\/backfill_knowledge_semantics.py",
"old_string": "def store_semantics(entity_id: int, semantics: dict) -> bool:\n \"\"\"Store knowledge semantics to database.\"\"\"\n try:\n cursor = db.execute(\n \"\"\"INSERT INTO entity_knowledge_semantics\n (entity_id, semantic_role, properties, functional_category,\n context_meaning, model_used)\n VALUES (%s, %s, %s, %s, %s, %s)\n ON DUPLICATE KEY UPDATE\n semantic_role = VALUES(semantic_role),\n properties = VALUES(properties),\n functional_category = VALUES(functional_category),\n context_meaning = VALUES(context_meaning),\n model_used = VALUES(model_used),\n updated_at = NOW()\"\"\",\n (\n entity_id,\n semantics.get(\"semantic_role\"),\n json.dumps(semantics.get(\"properties\", {})),\n semantics.get(\"functional_category\"),\n semantics.get(\"context_meaning\"),\n semantics.get(\"model_used\"),\n ),\n )\n db.commit()\n cursor.close()\n return True\n except Exception as e:\n db.log(\"ERROR\", f\"Backfill: Failed to store knowledge semantics for entity {entity_id}: {e}\")\n return False",
"new_string": "def store_semantics(entity_id: int, semantics: dict) -> bool:\n \"\"\"Store knowledge semantics to database.\"\"\"\n try:\n cursor = db.execute(\n \"\"\"INSERT INTO entity_semantics\n (entity_id, semantic_role, properties, functional_category,\n definition, model_used)\n VALUES (%s, %s, %s, %s, %s, %s)\n ON DUPLICATE KEY UPDATE\n semantic_role = VALUES(semantic_role),\n properties = VALUES(properties),\n functional_category = VALUES(functional_category),\n definition = VALUES(definition),\n model_used = VALUES(model_used),\n updated_at = NOW()\"\"\",\n (\n entity_id,\n semantics.get(\"semantic_role\"),\n json.dumps(semantics.get(\"properties\", {})),\n semantics.get(\"functional_category\"),\n semantics.get(\"context_meaning\"),\n semantics.get(\"model_used\"),\n ),\n )\n db.commit()\n cursor.close()\n return True\n except Exception as e:\n db.log(\"ERROR\", f\"Backfill: Failed to store knowledge semantics for entity {entity_id}: {e}\")\n return False"
}
}
{
"tool_response": {
"filePath": "\/var\/www\/scripts\/pipeline\/backfill_knowledge_semantics.py",
"oldString": "def store_semantics(entity_id: int, semantics: dict) -> bool:\n \"\"\"Store knowledge semantics to database.\"\"\"\n try:\n cursor = db.execute(\n \"\"\"INSERT INTO entity_knowledge_semantics\n (entity_id, semantic_role, properties, functional_category,\n context_meaning, model_used)\n VALUES (%s, %s, %s, %s, %s, %s)\n ON DUPLICATE KEY UPDATE\n semantic_role = VALUES(semantic_role),\n properties = VALUES(properties),\n functional_category = VALUES(functional_category),\n context_meaning = VALUES(context_meaning),\n model_used = VALUES(model_used),\n updated_at = NOW()\"\"\",\n (\n entity_id,\n semantics.get(\"semantic_role\"),\n json.dumps(semantics.get(\"properties\", {})),\n semantics.get(\"functional_category\"),\n semantics.get(\"context_meaning\"),\n semantics.get(\"model_used\"),\n ),\n )\n db.commit()\n cursor.close()\n return True\n except Exception as e:\n db.log(\"ERROR\", f\"Backfill: Failed to store knowledge semantics for entity {entity_id}: {e}\")\n return False",
"newString": "def store_semantics(entity_id: int, semantics: dict) -> bool:\n \"\"\"Store knowledge semantics to database.\"\"\"\n try:\n cursor = db.execute(\n \"\"\"INSERT INTO entity_semantics\n (entity_id, semantic_role, properties, functional_category,\n definition, model_used)\n VALUES (%s, %s, %s, %s, %s, %s)\n ON DUPLICATE KEY UPDATE\n semantic_role = VALUES(semantic_role),\n properties = VALUES(properties),\n functional_category = VALUES(functional_category),\n definition = VALUES(definition),\n model_used = VALUES(model_used),\n updated_at = NOW()\"\"\",\n (\n entity_id,\n semantics.get(\"semantic_role\"),\n json.dumps(semantics.get(\"properties\", {})),\n semantics.get(\"functional_category\"),\n semantics.get(\"context_meaning\"),\n semantics.get(\"model_used\"),\n ),\n )\n db.commit()\n cursor.close()\n return True\n except Exception as e:\n db.log(\"ERROR\", f\"Backfill: Failed to store knowledge semantics for entity {entity_id}: {e}\")\n return False",
"originalFile": "#!\/usr\/bin\/env python3\n\"\"\"\nBackfill Knowledge Semantics for existing entities.\n\nAnalyzes all entities that don't have knowledge_semantics yet.\nResume-capable: skips already analyzed entities.\n\nUsage:\n python backfill_knowledge_semantics.py # Process all pending\n python backfill_knowledge_semantics.py --limit 100 # Process max 100\n python backfill_knowledge_semantics.py --batch 25 # Batch size 25\n python backfill_knowledge_semantics.py --dry-run # Just count, don't process\n\"\"\"\n\nimport argparse\nimport json\nimport time\n\nimport ollama\n\nfrom db import db\nfrom json_utils import extract_json\n\n# Pipeline-ID für Wissenschaftliche Pipeline\nDEFAULT_PIPELINE_ID = 5\n\n\ndef get_pipeline_model(step_type: str, pipeline_id: int = DEFAULT_PIPELINE_ID) -> str:\n \"\"\"Get model from pipeline_steps config - NO HARDCODED DEFAULTS.\"\"\"\n cursor = db.execute(\n \"\"\"SELECT config FROM pipeline_steps\n WHERE pipeline_id = %s AND step_type = %s AND enabled = 1\n LIMIT 1\"\"\",\n (pipeline_id, step_type),\n )\n row = cursor.fetchone()\n cursor.close()\n\n if row and row.get(\"config\"):\n try:\n config = json.loads(row[\"config\"])\n model = config.get(\"model\")\n if model:\n return model\n except json.JSONDecodeError:\n pass\n\n raise ValueError(f\"No model configured for step_type={step_type} in pipeline {pipeline_id}\")\n\n\n# Valid ENUM values for validation\nVALID_SEMANTIC_ROLES = {\"agent\", \"patient\", \"instrument\", \"location\", \"cause\", \"effect\"}\nVALID_FUNCTIONAL_CATEGORIES = {\"method\", \"tool\", \"concept\", \"actor\", \"outcome\", \"process\"}\n\nPROMPT_TEMPLATE = \"\"\"Analysiere die Bedeutung dieser Entität im Kontext.\n\nEntität: {name}\nTyp: {entity_type}\nKontext: {context}\n\nBestimme:\n1. semantic_role: Welche Rolle spielt die Entität? agent, patient, instrument, location, cause, effect\n2. properties: Welche Eigenschaften hat sie? (als JSON-Objekt)\n3. functional_category: Welche Funktion? method, tool, concept, actor, outcome, process\n4. context_meaning: Was bedeutet die Entität in diesem Kontext? (1 Satz)\n\nAntworte NUR mit gültigem JSON:\n{{\n \"semantic_role\": \"agent|patient|instrument|location|cause|effect\",\n \"properties\": {{\"key\": \"value\"}},\n \"functional_category\": \"method|tool|concept|actor|outcome|process\",\n \"context_meaning\": \"Kurze Erklärung\"\n}}\"\"\"\n\n\ndef validate_and_fix(data: dict) -> dict:\n \"\"\"Validate and fix ENUM values from LLM response.\"\"\"\n # semantic_role - handle list\/string\n role = data.get(\"semantic_role\", \"\")\n if isinstance(role, list):\n role = role[0] if role else \"\"\n role = str(role).lower().strip()\n if role not in VALID_SEMANTIC_ROLES:\n if \"agent\" in role or \"akteur\" in role or \"handelnde\" in role:\n role = \"agent\"\n elif \"patient\" in role or \"betroffene\" in role:\n role = \"patient\"\n elif \"instrument\" in role or \"werkzeug\" in role or \"mittel\" in role:\n role = \"instrument\"\n elif \"ort\" in role or \"location\" in role or \"ort\" in role:\n role = \"location\"\n elif \"ursache\" in role or \"cause\" in role:\n role = \"cause\"\n elif \"wirkung\" in role or \"effect\" in role or \"ergebnis\" in role:\n role = \"effect\"\n else:\n role = \"instrument\" # Default\n data[\"semantic_role\"] = role\n\n # functional_category - handle list\/string\n cat = data.get(\"functional_category\", \"\")\n if isinstance(cat, list):\n cat = cat[0] if cat else \"\"\n cat = str(cat).lower().strip()\n if cat not in VALID_FUNCTIONAL_CATEGORIES:\n if \"method\" in cat or \"methode\" in cat or \"verfahren\" in cat:\n cat = \"method\"\n elif \"tool\" in cat or \"werkzeug\" in cat:\n cat = \"tool\"\n elif \"concept\" in cat or \"konzept\" in cat or \"begriff\" in cat:\n cat = \"concept\"\n elif \"actor\" in cat or \"akteur\" in cat:\n cat = \"actor\"\n elif \"outcome\" in cat or \"ergebnis\" in cat or \"resultat\" in cat:\n cat = \"outcome\"\n elif \"process\" in cat or \"prozess\" in cat or \"ablauf\" in cat:\n cat = \"process\"\n else:\n cat = \"concept\" # Default\n data[\"functional_category\"] = cat\n\n # properties - ensure it's a dict\n props = data.get(\"properties\", {})\n if not isinstance(props, dict):\n props = {}\n data[\"properties\"] = props\n\n # context_meaning - ensure it's a string\n meaning = data.get(\"context_meaning\", \"\")\n if not isinstance(meaning, str):\n meaning = str(meaning) if meaning else \"\"\n # Truncate if too long\n if len(meaning) > 500:\n meaning = meaning[:497] + \"...\"\n data[\"context_meaning\"] = meaning\n\n return data\n\n\ndef get_pending_entities(limit: int = 0) -> list:\n \"\"\"Get entities without knowledge semantics, with context from chunks.\"\"\"\n sql = \"\"\"\n SELECT e.id, e.name, e.type,\n GROUP_CONCAT(SUBSTRING(c.content, 1, 500) SEPARATOR ' ... ') as context\n FROM entities e\n LEFT JOIN chunk_entities ce ON e.id = ce.entity_id\n LEFT JOIN chunks c ON ce.chunk_id = c.id\n LEFT JOIN entity_semantics es ON e.id = es.entity_id\n WHERE es.id IS NULL\n GROUP BY e.id, e.name, e.type\n ORDER BY e.id\n \"\"\"\n if limit > 0:\n sql = sql.replace(\"ORDER BY e.id\", f\"ORDER BY e.id LIMIT {limit}\")\n\n cursor = db.execute(sql)\n entities = cursor.fetchall()\n cursor.close()\n return list(entities)\n\n\ndef analyze_entity(entity: dict, model: str) -> dict | None:\n \"\"\"Analyze a single entity with Ollama.\"\"\"\n try:\n context = entity.get(\"context\") or \"\"\n if len(context) > 1500:\n context = context[:1500]\n\n prompt = PROMPT_TEMPLATE.format(\n name=entity.get(\"name\", \"\"),\n entity_type=entity.get(\"type\", \"unknown\"),\n context=context,\n )\n\n response = ollama.generate(\n model=model,\n prompt=prompt,\n options={\"num_predict\": 300},\n )\n\n response_text = response[\"response\"].strip()\n\n # Robuste JSON-Extraktion\n data = extract_json(response_text)\n if data:\n data = validate_and_fix(data)\n data[\"model_used\"] = model\n return data\n\n except Exception as e:\n db.log(\"WARNING\", f\"Backfill: Knowledge semantic analysis failed for entity {entity['id']}: {e}\")\n\n return None\n\n\ndef store_semantics(entity_id: int, semantics: dict) -> bool:\n \"\"\"Store knowledge semantics to database.\"\"\"\n try:\n cursor = db.execute(\n \"\"\"INSERT INTO entity_knowledge_semantics\n (entity_id, semantic_role, properties, functional_category,\n context_meaning, model_used)\n VALUES (%s, %s, %s, %s, %s, %s)\n ON DUPLICATE KEY UPDATE\n semantic_role = VALUES(semantic_role),\n properties = VALUES(properties),\n functional_category = VALUES(functional_category),\n context_meaning = VALUES(context_meaning),\n model_used = VALUES(model_used),\n updated_at = NOW()\"\"\",\n (\n entity_id,\n semantics.get(\"semantic_role\"),\n json.dumps(semantics.get(\"properties\", {})),\n semantics.get(\"functional_category\"),\n semantics.get(\"context_meaning\"),\n semantics.get(\"model_used\"),\n ),\n )\n db.commit()\n cursor.close()\n return True\n except Exception as e:\n db.log(\"ERROR\", f\"Backfill: Failed to store knowledge semantics for entity {entity_id}: {e}\")\n return False\n\n\ndef main():\n \"\"\"Main entry point.\"\"\"\n parser = argparse.ArgumentParser(description=\"Backfill Knowledge Semantics\")\n parser.add_argument(\"--limit\", type=int, default=0, help=\"Max entities to process (0=all)\")\n parser.add_argument(\"--batch\", type=int, default=25, help=\"Batch size for progress output\")\n parser.add_argument(\"--model\", default=None, help=\"Override pipeline model (reads from pipeline_steps if not set)\")\n parser.add_argument(\"--pipeline-id\", type=int, default=DEFAULT_PIPELINE_ID, help=\"Pipeline ID to read config from\")\n parser.add_argument(\"--dry-run\", action=\"store_true\", help=\"Just count, don't process\")\n\n args = parser.parse_args()\n\n db.connect()\n\n try:\n # Get model from pipeline config if not overridden\n if args.model is None:\n args.model = get_pipeline_model(\"knowledge_semantic_analyze\", args.pipeline_id)\n print(f\"[Config] Model from pipeline {args.pipeline_id}: {args.model}\")\n\n # Get pending entities\n entities = get_pending_entities(args.limit)\n total = len(entities)\n\n print(\"Knowledge Semantics Backfill\")\n print(\"=\" * 50)\n print(f\"Pending entities: {total}\")\n print(f\"Model: {args.model} (from pipeline_steps)\")\n print(f\"Batch size: {args.batch}\")\n\n if args.dry_run:\n print(\"\\nDry run - no processing\")\n return\n\n if total == 0:\n print(\"\\nNo pending entities - all done!\")\n return\n\n print(\"\\nStarting analysis...\")\n print(\"-\" * 50)\n\n success = 0\n errors = 0\n start_time = time.time()\n\n for i, entity in enumerate(entities, 1):\n # Analyze\n semantics = analyze_entity(entity, args.model)\n\n if semantics:\n if store_semantics(entity[\"id\"], semantics):\n success += 1\n else:\n errors += 1\n else:\n errors += 1\n\n # Progress output\n if i % args.batch == 0 or i == total:\n elapsed = time.time() - start_time\n rate = i \/ elapsed if elapsed > 0 else 0\n eta = (total - i) \/ rate if rate > 0 else 0\n print(f\" [{i}\/{total}] Success: {s... [TRUNCATED-94e80264c71b39ba]",
"structuredPatch": [
{
"oldStart": 197,
"oldLines": 15,
"newStart": 197,
"newLines": 15,
"lines": [
" \"\"\"Store knowledge semantics to database.\"\"\"",
" try:",
" cursor = db.execute(",
"- \"\"\"INSERT INTO entity_knowledge_semantics",
"+ \"\"\"INSERT INTO entity_semantics",
" (entity_id, semantic_role, properties, functional_category,",
"- context_meaning, model_used)",
"+ definition, model_used)",
" VALUES (%s, %s, %s, %s, %s, %s)",
" ON DUPLICATE KEY UPDATE",
" semantic_role = VALUES(semantic_role),",
" properties = VALUES(properties),",
" functional_category = VALUES(functional_category),",
"- context_meaning = VALUES(context_meaning),",
"+ definition = VALUES(definition),",
" model_used = VALUES(model_used),",
" updated_at = NOW()\"\"\",",
" ("
]
}
],
"userModified": false,
"replaceAll": false
}
}