generate_semantics.py

Pfad: /var/www/scripts/pipeline/generate_semantics.py
Namespace: pipeline
Zeilen: 239 | Größe: 6,442 Bytes
Geändert: 2025-12-28 12:47:24 | Gescannt: 2025-12-31 10:22:15

Code Hygiene Score: 88

Dependencies: 70 (25%)
LOC: 87 (20%)
Methods: 100 (20%)
Secrets: 100 (15%)
Classes: 100 (10%)
Magic Numbers: 80 (10%)

Issues 2

Zeile	Typ	Beschreibung
142	magic_number	Magic Number gefunden: 100
221	magic_number	Magic Number gefunden: 60

Dependencies 8

use json
use os
use sys
use time
use requests
use pipeline_config.get_step_model
use mysql.connector
use re

Funktionen 8

get_db_password() Zeile 28
get_db_connection() Zeile 40
get_entities() Zeile 52
get_document_context() Zeile 65
extract_json_from_text() Zeile 80
generate_semantic() Zeile 102
save_semantic() Zeile 154
main() Zeile 187

Code

#!/usr/bin/env python3
"""
Generate semantic definitions for entities using Ollama.
Uses document context to create meaningful definitions.
"""

import json
import os
import sys
import time

import requests

# Add parent directory for imports
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from pipeline_config import get_step_model

# Configuration
OLLAMA_HOST = "http://localhost:11434"
DB_CONFIG = {
    "host": "localhost",
    "database": "ki_system",
    "user": "root",
}


def get_db_password():
    """Read database password from credentials file."""
    cred_file = "/var/www/docs/credentials/credentials.md"
    with open(cred_file) as f:
        for line in f:
            if "MariaDB" in line and "root" in line:
                parts = line.split("|")
                if len(parts) >= 4:
                    return parts[3].strip()
    return ""


def get_db_connection():
    """Create database connection."""
    import mysql.connector

    return mysql.connector.connect(
        host=DB_CONFIG["host"],
        database=DB_CONFIG["database"],
        user=DB_CONFIG["user"],
        password=get_db_password(),
    )


def get_entities(conn):
    """Get all entities from database."""
    cursor = conn.cursor(dictionary=True)
    cursor.execute("""
        SELECT e.id, e.name, e.type, e.description
        FROM entities e
        LEFT JOIN entity_semantics es ON e.id = es.entity_id
        WHERE es.id IS NULL
        ORDER BY e.id
    """)
    return cursor.fetchall()


def get_document_context(conn, limit=5):
    """Get document chunks for context."""
    cursor = conn.cursor(dictionary=True)
    cursor.execute(
        """
        SELECT content FROM chunks
        ORDER BY chunk_index
        LIMIT %s
    """,
        (limit,),
    )
    chunks = cursor.fetchall()
    return "\n\n".join([c["content"] for c in chunks])


def extract_json_from_text(text):
    """Extract JSON object from text response."""
    import re

    # Try to find JSON object in the text
    # Look for { ... } pattern
    match = re.search(r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}", text, re.DOTALL)
    if match:
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            pass

    # Try parsing the whole text as JSON
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    return None


def generate_semantic(entity, context, model):
    """Generate semantic definition using Ollama."""
    prompt = f"""Analysiere diese Entity und erstelle eine semantische Definition.

Entity: {entity["name"]} (Typ: {entity["type"]})

Kontext aus Dokument:
{context[:3000]}

Antworte mit einem JSON-Objekt:
{{"definition": "Bedeutung in 1-2 Sätzen", "domain": "Wissensdomäne", "context": "Verwendungskontext", "attributes": {{}}, "usage_notes": "", "confidence": 0.8}}

Nur das JSON-Objekt ausgeben, nichts anderes."""

    try:
        response = requests.post(
            f"{OLLAMA_HOST}/api/generate",
            json={
                "model": model,
                "prompt": prompt,
                "stream": False,
                "options": {"temperature": 0.3, "num_predict": 800},
            },
            timeout=600,  # 10 min
        )
        response.raise_for_status()
        data = response.json()

        response_text = data.get("response", "")

        # Debug output
        if not response_text:
            print("  Empty response from Ollama")
            return None

        # Try to extract JSON from the response
        result = extract_json_from_text(response_text)

        if not result:
            # If JSON extraction failed, try to create a basic semantic from the text
            print(f"  Could not parse JSON, raw: {response_text[:100]}...")
            return None

        return result
    except requests.exceptions.Timeout:
        print("  Timeout after 180s")
        return None
    except Exception as e:
        print(f"  Error: {e}")
        return None


def save_semantic(conn, entity_id, semantic):
    """Save semantic to database."""
    cursor = conn.cursor()
    cursor.execute(
        """
        INSERT INTO entity_semantics
            (entity_id, definition, domain, context, attributes, usage_notes, confidence, source)
        VALUES
            (%s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE
            definition = VALUES(definition),
            domain = VALUES(domain),
            context = VALUES(context),
            attributes = VALUES(attributes),
            usage_notes = VALUES(usage_notes),
            confidence = VALUES(confidence),
            source = VALUES(source),
            updated_at = NOW()
    """,
        (
            entity_id,
            semantic.get("definition"),
            semantic.get("domain"),
            semantic.get("context"),
            json.dumps(semantic.get("attributes", {}), ensure_ascii=False),
            semantic.get("usage_notes"),
            float(semantic.get("confidence", 0.5)),
            "ollama",
        ),
    )
    conn.commit()


def main():
    """Main function."""
    # Model aus DB-Config laden
    model = get_step_model("knowledge_semantic_analyze")

    print(f"Semantic Generation with {model}")
    print("=" * 50)

    conn = get_db_connection()

    # Get entities without semantics
    entities = get_entities(conn)
    print(f"Entities without semantics: {len(entities)}")

    if not entities:
        print("All entities already have semantics.")
        return

    # Get document context
    context = get_document_context(conn, limit=5)
    print(f"Context loaded: {len(context)} chars")
    print()

    # Process each entity
    success = 0
    failed = 0

    for i, entity in enumerate(entities, 1):
        print(f"[{i}/{len(entities)}] {entity['name']} ({entity['type']})")

        semantic = generate_semantic(entity, context, model)

        if semantic and "definition" in semantic:
            save_semantic(conn, entity["id"], semantic)
            print(f"  -> {semantic.get('domain', 'N/A')}: {semantic.get('definition', '')[:60]}...")
            success += 1
        else:
            print("  -> FAILED")
            failed += 1

        # Small delay to avoid overwhelming Ollama
        time.sleep(0.5)

    print()
    print("=" * 50)
    print(f"Done! Success: {success}, Failed: {failed}")

    conn.close()


if __name__ == "__main__":
    main()

← Übersicht