generate_semantics.py
- Path: /var/www/scripts/pipeline/generate_semantics.py
- Namespace: pipeline
- Lines: 239 | Size: 6,442 bytes
- Modified: 2025-12-28 12:47:24 | Scanned: 2025-12-31 10:22:15
Code Hygiene Score: 88
- Dependencies: 70 (weight 25%)
- LOC: 87 (weight 20%)
- Methods: 100 (weight 20%)
- Secrets: 100 (weight 15%)
- Classes: 100 (weight 10%)
- Magic Numbers: 80 (weight 10%)
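The overall score is consistent with a weighted sum of the sub-scores, assuming the percentages are weights (a sketch of the presumed formula, not confirmed by the scanner's documentation):

scores  = {"deps": 70, "loc": 87, "methods": 100, "secrets": 100, "classes": 100, "magic": 80}
weights = {"deps": 0.25, "loc": 0.20, "methods": 0.20, "secrets": 0.15, "classes": 0.10, "magic": 0.10}
total = sum(scores[k] * weights[k] for k in scores)  # 87.9, which rounds to 88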
Issues 2
| Line | Type | Description |
|------|------|-------------|
| 142 | magic_number | Magic number found: 100 |
| 221 | magic_number | Magic number found: 60 |
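The two flagged literals line up with the output-truncation lengths used in generate_semantic() and main() below. A minimal sketch of lifting them into named constants (the constant names are illustrative, not part of the script; the two print calls are the rewritten originals):

RAW_PREVIEW_CHARS = 100  # chars of raw model output logged when JSON parsing fails
DEF_PREVIEW_CHARS = 60   # chars of the definition shown in the progress log

print(f" Could not parse JSON, raw: {response_text[:RAW_PREVIEW_CHARS]}...")
print(f" -> {semantic.get('domain', 'N/A')}: {semantic.get('definition', '')[:DEF_PREVIEW_CHARS]}...")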
Dependencies 8
- use json
- use os
- use sys
- use time
- use requests
- use pipeline_config.get_step_model
- use mysql.connector
- use re
Functions 8
- get_db_password() (line 28)
- get_db_connection() (line 40)
- get_entities() (line 52)
- get_document_context() (line 65)
- extract_json_from_text() (line 80)
- generate_semantic() (line 102)
- save_semantic() (line 154)
- main() (line 187)
Code
#!/usr/bin/env python3
"""
Generate semantic definitions for entities using Ollama.
Uses document context to create meaningful definitions.
"""
import json
import os
import sys
import time
import requests
# Make pipeline_config importable by adding the script's own directory to sys.path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from pipeline_config import get_step_model
# Configuration
OLLAMA_HOST = "http://localhost:11434"
DB_CONFIG = {
"host": "localhost",
"database": "ki_system",
"user": "root",
}
def get_db_password():
"""Read database password from credentials file."""
cred_file = "/var/www/docs/credentials/credentials.md"
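# The credentials file is a pipe-delimited table; the row mentioning both
# "MariaDB" and "root" carries the password in its fourth field (parts[3]).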
with open(cred_file) as f:
for line in f:
if "MariaDB" in line and "root" in line:
parts = line.split("|")
if len(parts) >= 4:
return parts[3].strip()
return ""
def get_db_connection():
"""Create database connection."""
import mysql.connector
return mysql.connector.connect(
host=DB_CONFIG["host"],
database=DB_CONFIG["database"],
user=DB_CONFIG["user"],
password=get_db_password(),
)
def get_entities(conn):
"""Get all entities from database."""
cursor = conn.cursor(dictionary=True)
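# Anti-join: LEFT JOIN plus "es.id IS NULL" keeps only entities
# that do not yet have an entity_semantics row.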
cursor.execute("""
SELECT e.id, e.name, e.type, e.description
FROM entities e
LEFT JOIN entity_semantics es ON e.id = es.entity_id
WHERE es.id IS NULL
ORDER BY e.id
""")
return cursor.fetchall()
def get_document_context(conn, limit=5):
"""Get document chunks for context."""
cursor = conn.cursor(dictionary=True)
cursor.execute(
"""
SELECT content FROM chunks
ORDER BY chunk_index
LIMIT %s
""",
(limit,),
)
chunks = cursor.fetchall()
return "\n\n".join([c["content"] for c in chunks])
def extract_json_from_text(text):
"""Extract JSON object from text response."""
import re
# Find the first brace-delimited object; the pattern tolerates one level
# of nested braces, which covers the expected mostly-flat response
match = re.search(r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}", text, re.DOTALL)
if match:
try:
return json.loads(match.group())
except json.JSONDecodeError:
pass
# Try parsing the whole text as JSON
try:
return json.loads(text)
except json.JSONDecodeError:
pass
return None
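# Example (illustrative): extract_json_from_text('noise {"a": 1} noise')
# returns {'a': 1}; a response that is pure JSON but nested deeper than
# one level is rescued by the whole-text fallback above.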
def generate_semantic(entity, context, model):
"""Generate semantic definition using Ollama."""
prompt = f"""Analysiere diese Entity und erstelle eine semantische Definition.
Entity: {entity["name"]} (Typ: {entity["type"]})
Kontext aus Dokument:
{context[:3000]}
Antworte mit einem JSON-Objekt:
{{"definition": "Bedeutung in 1-2 Sätzen", "domain": "Wissensdomäne", "context": "Verwendungskontext", "attributes": {{}}, "usage_notes": "", "confidence": 0.8}}
Nur das JSON-Objekt ausgeben, nichts anderes."""
try:
response = requests.post(
f"{OLLAMA_HOST}/api/generate",
json={
"model": model,
"prompt": prompt,
"stream": False,
"options": {"temperature": 0.3, "num_predict": 800},
},
timeout=600, # 10 min
)
response.raise_for_status()
data = response.json()
response_text = data.get("response", "")
# Guard: some failures surface as an empty completion
if not response_text:
print(" Empty response from Ollama")
return None
# Try to extract JSON from the response
result = extract_json_from_text(response_text)
if not result:
# JSON extraction failed; log a preview of the raw output and give up
print(f" Could not parse JSON, raw: {response_text[:100]}...")
return None
return result
except requests.exceptions.Timeout:
print(" Timeout after 180s")
return None
except Exception as e:
print(f" Error: {e}")
return None
def save_semantic(conn, entity_id, semantic):
"""Save semantic to database."""
cursor = conn.cursor()
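# Upsert: relies on entity_semantics having a UNIQUE (or PRIMARY) key on
# entity_id; without it, ON DUPLICATE KEY UPDATE never fires and rows pile up.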
cursor.execute(
"""
INSERT INTO entity_semantics
(entity_id, definition, domain, context, attributes, usage_notes, confidence, source)
VALUES
(%s, %s, %s, %s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE
definition = VALUES(definition),
domain = VALUES(domain),
context = VALUES(context),
attributes = VALUES(attributes),
usage_notes = VALUES(usage_notes),
confidence = VALUES(confidence),
source = VALUES(source),
updated_at = NOW()
""",
(
entity_id,
semantic.get("definition"),
semantic.get("domain"),
semantic.get("context"),
json.dumps(semantic.get("attributes", {}), ensure_ascii=False),
semantic.get("usage_notes"),
float(semantic.get("confidence", 0.5)),
"ollama",
),
)
conn.commit()
def main():
"""Main function."""
# Load the step's model name from the DB config
model = get_step_model("knowledge_semantic_analyze")
print(f"Semantic Generation with {model}")
print("=" * 50)
conn = get_db_connection()
# Get entities without semantics
entities = get_entities(conn)
print(f"Entities without semantics: {len(entities)}")
if not entities:
print("All entities already have semantics.")
return
# Get document context
context = get_document_context(conn, limit=5)
print(f"Context loaded: {len(context)} chars")
print()
# Process each entity
success = 0
failed = 0
for i, entity in enumerate(entities, 1):
print(f"[{i}/{len(entities)}] {entity['name']} ({entity['type']})")
semantic = generate_semantic(entity, context, model)
if semantic and "definition" in semantic:
save_semantic(conn, entity["id"], semantic)
print(f" -> {semantic.get('domain', 'N/A')}: {semantic.get('definition', '')[:60]}...")
success += 1
else:
print(" -> FAILED")
failed += 1
# Small delay to avoid overwhelming Ollama
time.sleep(0.5)
print()
print("=" * 50)
print(f"Done! Success: {success}, Failed: {failed}")
conn.close()
if __name__ == "__main__":
main()
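The script is meant to be run directly, assuming the ki_system database and a local Ollama instance are reachable:

python3 /var/www/scripts/pipeline/generate_semantics.py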