{
"tool_response": {
"type": "text",
"file": {
"filePath": "\/opt\/scripts\/pipeline\/generate_semantics.py",
"content": "#!\/usr\/bin\/env python3\n\"\"\"\nGenerate semantic definitions for entities using Ollama.\nUses document context to create meaningful definitions.\n\"\"\"\n\nimport json\nimport time\n\nimport requests\n\n# Configuration\nOLLAMA_HOST = \"http:\/\/localhost:11434\"\nMODEL = \"gpt-oss:20b\"\nDB_CONFIG = {\n \"host\": \"localhost\",\n \"database\": \"ki_system\",\n \"user\": \"root\",\n}\n\n\ndef get_db_password():\n \"\"\"Read database password from credentials file.\"\"\"\n cred_file = \"\/var\/www\/docs\/credentials\/credentials.md\"\n with open(cred_file) as f:\n for line in f:\n if \"MariaDB\" in line and \"root\" in line:\n parts = line.split(\"|\")\n if len(parts) >= 4:\n return parts[3].strip()\n return \"\"\n\n\ndef get_db_connection():\n \"\"\"Create database connection.\"\"\"\n import mysql.connector\n\n return mysql.connector.connect(\n host=DB_CONFIG[\"host\"],\n database=DB_CONFIG[\"database\"],\n user=DB_CONFIG[\"user\"],\n password=get_db_password(),\n )\n\n\ndef get_entities(conn):\n \"\"\"Get all entities from database.\"\"\"\n cursor = conn.cursor(dictionary=True)\n cursor.execute(\"\"\"\n SELECT e.id, e.name, e.type, e.description\n FROM entities e\n LEFT JOIN entity_semantics es ON e.id = es.entity_id\n WHERE es.id IS NULL\n ORDER BY e.id\n \"\"\")\n return cursor.fetchall()\n\n\ndef get_document_context(conn, limit=5):\n \"\"\"Get document chunks for context.\"\"\"\n cursor = conn.cursor(dictionary=True)\n cursor.execute(\n \"\"\"\n SELECT content FROM chunks\n ORDER BY chunk_index\n LIMIT %s\n \"\"\",\n (limit,),\n )\n chunks = cursor.fetchall()\n return \"\\n\\n\".join([c[\"content\"] for c in chunks])\n\n\ndef extract_json_from_text(text):\n \"\"\"Extract JSON object from text response.\"\"\"\n import re\n\n # Try to find JSON object in the text\n # Look for { ... 
} pattern\n match = re.search(r\"\\{[^{}]*(?:\\{[^{}]*\\}[^{}]*)*\\}\", text, re.DOTALL)\n if match:\n try:\n return json.loads(match.group())\n except json.JSONDecodeError:\n pass\n\n # Try parsing the whole text as JSON\n try:\n return json.loads(text)\n except json.JSONDecodeError:\n pass\n\n return None\n\n\ndef generate_semantic(entity, context, model=MODEL):\n \"\"\"Generate semantic definition using Ollama.\"\"\"\n prompt = f\"\"\"Analysiere diese Entity und erstelle eine semantische Definition.\n\nEntity: {entity[\"name\"]} (Typ: {entity[\"type\"]})\n\nKontext aus Dokument:\n{context[:3000]}\n\nAntworte mit einem JSON-Objekt:\n{{\"definition\": \"Bedeutung in 1-2 Sätzen\", \"domain\": \"Wissensdomäne\", \"context\": \"Verwendungskontext\", \"attributes\": {{}}, \"usage_notes\": \"\", \"confidence\": 0.8}}\n\nNur das JSON-Objekt ausgeben, nichts anderes.\"\"\"\n\n try:\n response = requests.post(\n f\"{OLLAMA_HOST}\/api\/generate\",\n json={\n \"model\": model,\n \"prompt\": prompt,\n \"stream\": False,\n \"options\": {\"temperature\": 0.3, \"num_predict\": 800},\n },\n timeout=180,\n )\n response.raise_for_status()\n data = response.json()\n\n response_text = data.get(\"response\", \"\")\n\n # Debug output\n if not response_text:\n print(\" Empty response from Ollama\")\n return None\n\n # Try to extract JSON from the response\n result = extract_json_from_text(response_text)\n\n if not result:\n # If JSON extraction failed, try to create a basic semantic from the text\n print(f\" Could not parse JSON, raw: {response_text[:100]}...\")\n return None\n\n return result\n except requests.exceptions.Timeout:\n print(\" Timeout after 180s\")\n return None\n except Exception as e:\n print(f\" Error: {e}\")\n return None\n\n\ndef save_semantic(conn, entity_id, semantic):\n \"\"\"Save semantic to database.\"\"\"\n cursor = conn.cursor()\n cursor.execute(\n \"\"\"\n INSERT INTO entity_semantics\n (entity_id, definition, domain, context, attributes, usage_notes, confidence, source)\n VALUES\n (%s, %s, %s, %s, %s, %s, %s, %s)\n ON DUPLICATE KEY UPDATE\n definition = VALUES(definition),\n domain = VALUES(domain),\n context = VALUES(context),\n attributes = VALUES(attributes),\n usage_notes = VALUES(usage_notes),\n confidence = VALUES(confidence),\n source = VALUES(source),\n updated_at = NOW()\n \"\"\",\n (\n entity_id,\n semantic.get(\"definition\"),\n semantic.get(\"domain\"),\n semantic.get(\"context\"),\n json.dumps(semantic.get(\"attributes\", {}), ensure_ascii=False),\n semantic.get(\"usage_notes\"),\n float(semantic.get(\"confidence\", 0.5)),\n \"ollama\",\n ),\n )\n conn.commit()\n\n\ndef main():\n \"\"\"Main function.\"\"\"\n print(f\"Semantic Generation with {MODEL}\")\n print(\"=\" * 50)\n\n conn = get_db_connection()\n\n # Get entities without semantics\n entities = get_entities(conn)\n print(f\"Entities without semantics: {len(entities)}\")\n\n if not entities:\n print(\"All entities already have semantics.\")\n return\n\n # Get document context\n context = get_document_context(conn, limit=5)\n print(f\"Context loaded: {len(context)} chars\")\n print()\n\n # Process each entity\n success = 0\n failed = 0\n\n for i, entity in enumerate(entities, 1):\n print(f\"[{i}\/{len(entities)}] {entity['name']} ({entity['type']})\")\n\n semantic = generate_semantic(entity, context)\n\n if semantic and \"definition\" in semantic:\n save_semantic(conn, entity[\"id\"], semantic)\n print(f\" -> {semantic.get('domain', 'N\/A')}: {semantic.get('definition', '')[:60]}...\")\n success += 1\n else:\n 
print(\" -> FAILED\")\n failed += 1\n\n # Small delay to avoid overwhelming Ollama\n time.sleep(0.5)\n\n print()\n print(\"=\" * 50)\n print(f\"Done! Success: {success}, Failed: {failed}\")\n\n conn.close()\n\n\nif __name__ == \"__main__\":\n main()\n",
"numLines": 230,
"startLine": 1,
"totalLines": 230
}
}
}
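
Note on the schema the script assumes: the anti-join in get_entities expects entity_semantics to expose an id column, and the INSERT ... ON DUPLICATE KEY UPDATE in save_semantic only acts as an upsert if entity_semantics carries a unique key on entity_id; the update clause also presumes an updated_at column. Below is a minimal sketch of a compatible table; every column type is a guess, since the script only implies the column names, the unique key, and updated_at.

# Sketch only: a MariaDB table consistent with what generate_semantics.py
# writes. All column types/sizes are assumptions, not taken from the script.
from generate_semantics import get_db_connection  # reuse the script's helper

DDL = """
CREATE TABLE IF NOT EXISTS entity_semantics (
    id          INT AUTO_INCREMENT PRIMARY KEY,
    entity_id   INT NOT NULL,
    definition  TEXT,
    domain      VARCHAR(255),
    context     TEXT,
    attributes  LONGTEXT,       -- JSON string produced by json.dumps
    usage_notes TEXT,
    confidence  FLOAT,
    source      VARCHAR(64),
    updated_at  DATETIME DEFAULT CURRENT_TIMESTAMP,
    UNIQUE KEY uq_entity_semantics_entity (entity_id)  -- makes the upsert fire
)
"""

conn = get_db_connection()
conn.cursor().execute(DDL)
conn.commit()
conn.close()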