generate_entity_descriptions.py

Pfad: /var/www/scripts/pipeline/generate_entity_descriptions.py
Namespace: pipeline
Zeilen: 221 | Größe: 6,329 Bytes
Geändert: 2025-12-31 03:01:09 | Gescannt: 2025-12-31 10:22:15

Code Hygiene Score: 88

Dependencies: 60 (25%)
LOC: 93 (20%)
Methods: 100 (20%)
Secrets: 100 (15%)
Classes: 100 (10%)
Magic Numbers: 90 (10%)

Issues 1

Zeile	Typ	Beschreibung
196	magic_number	Magic Number gefunden: 100

Dependencies 9

use argparse
use os
use sys
use time
use requests
use constants.DEFAULT_LIMIT
use constants.OLLAMA_TIMEOUT
use db.Database
use pipeline_config.get_step_model

Funktionen 6

get_prompt_template() Zeile 34
get_entities_to_enrich() Zeile 46
get_entity_context() Zeile 64
call_ollama() Zeile 85
update_entity_description() Zeile 109
main() Zeile 125

Code

#!/usr/bin/env python3
"""
Entity Description Enrichment Script
=====================================
Generates detailed descriptions for entities via Ollama.

Prerequisites:
- Ollama is running locally (http://localhost:11434)
- An active prompt exists in ki_content.prompts (use_case='entity_description');
  it is filled in via str.format with the placeholders entity_name,
  entity_type and context

Usage:
    python generate_entity_descriptions.py [--model MODEL] [--limit N] [--dry-run] [--verbose]
"""

import argparse
import os
import sys
import time

import requests

# Add parent directory for imports
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from constants import DEFAULT_LIMIT, OLLAMA_TIMEOUT
from db import Database
from pipeline_config import get_step_model

# Configuration
# Endpoint that call_ollama() POSTs its generation requests to.
OLLAMA_URL = "http://localhost:11434/api/generate"
# Threshold used by get_entities_to_enrich(): rows whose description is NULL
# or has fewer characters than this are selected.
MIN_DESCRIPTION_LENGTH = 50  # Entities with description < this get enriched


def get_prompt_template(db: Database) -> str | None:
    """Fetch the active 'entity_description' prompt with the highest id.

    Returns:
        The ``content`` value of the matching row, or None when the query
        yields no row.
    """
    query = """
        SELECT content FROM prompts
        WHERE use_case = 'entity_description' AND is_active = 1
        ORDER BY id DESC LIMIT 1
    """
    cur = db.execute(query)
    record = cur.fetchone()
    cur.close()

    if not record:
        return None
    return record["content"]


def get_entities_to_enrich(db: Database, limit: int = DEFAULT_LIMIT) -> list:
    """Select entities whose description is missing or too short.

    A row qualifies when its description is NULL or has fewer than
    MIN_DESCRIPTION_LENGTH characters. Rows are ordered by id and capped
    at ``limit``.

    Returns:
        Whatever ``fetchall()`` yields for the id, name, type and
        description columns.
    """
    sql = """
        SELECT id, name, type, description
        FROM entities
        WHERE description IS NULL
           OR CHAR_LENGTH(description) < %s
        ORDER BY id
        LIMIT %s
    """
    params = (MIN_DESCRIPTION_LENGTH, limit)

    cur = db.execute(sql, params)
    rows = cur.fetchall()
    cur.close()
    return rows


def get_entity_context(
    db: Database,
    entity_id: int,
    max_chunks: int = 3,
    max_chunk_chars: int = 500,
) -> str:
    """Build a context string from chunks that reference the entity.

    Args:
        db: Connected database wrapper.
        entity_id: Id of the entity whose chunks are looked up via
            ``chunk_entities``.
        max_chunks: Maximum number of chunks to fetch.
        max_chunk_chars: Each chunk's content is truncated to this many
            characters before joining.

    Returns:
        The truncated chunk texts joined by a ``---`` separator, or the
        placeholder "(Kein Kontext verfügbar)" when no usable chunk exists.
    """
    cursor = db.execute(
        """
        SELECT c.content
        FROM chunk_entities ce
        JOIN chunks c ON ce.chunk_id = c.id
        WHERE ce.entity_id = %s
        LIMIT %s
    """,
        (entity_id, max_chunks),
    )
    try:
        chunks = cursor.fetchall()
    finally:
        # Release the cursor even if fetching fails.
        cursor.close()

    # Skip chunks with NULL/empty content: slicing None would raise, and an
    # empty snippet adds nothing but a dangling separator.
    # NOTE(review): whether chunks.content is nullable is not visible here.
    snippets = [
        chunk["content"][:max_chunk_chars] for chunk in chunks if chunk["content"]
    ]

    if not snippets:
        return "(Kein Kontext verfügbar)"

    return "\n\n---\n\n".join(snippets)


def call_ollama(
    prompt: str,
    model: str,
    *,
    temperature: float = 0.7,
    num_predict: int = 300,
) -> str | None:
    """Send a non-streaming generation request to Ollama and return the text.

    Args:
        prompt: Full prompt text to send.
        model: Name of the Ollama model to use.
        temperature: Value passed as the ``temperature`` option.
        num_predict: Value passed as the ``num_predict`` option.

    Returns:
        The stripped ``response`` field of the JSON reply (may be an empty
        string), or None when the request fails, the body is not valid JSON,
        or the reply has no string ``response`` field. Failures are printed,
        not raised.
    """
    try:
        response = requests.post(
            OLLAMA_URL,
            json={
                "model": model,
                "prompt": prompt,
                "stream": False,
                "options": {
                    "temperature": temperature,
                    "num_predict": num_predict,
                },
            },
            timeout=OLLAMA_TIMEOUT,
        )
        response.raise_for_status()
        result = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decode failures on requests versions whose
        # decode error is not a RequestException subclass.
        print(f"  ✗ Ollama error: {e}")
        return None

    text = result.get("response") if isinstance(result, dict) else None
    if not isinstance(text, str):
        print("  ✗ Ollama error: unexpected response format")
        return None
    return text.strip()


def update_entity_description(db: Database, entity_id: int, description: str) -> bool:
    """Write a new description for one entity and commit it.

    Args:
        db: Connected database wrapper.
        entity_id: Id of the row in ``entities`` to update.
        description: New description text.

    Returns:
        True when the UPDATE was executed and committed; False when any
        exception occurred (the error is printed, not raised).
    """
    try:
        cursor = db.execute(
            """
            UPDATE entities SET description = %s WHERE id = %s
        """,
            (description, entity_id),
        )
        # The read helpers in this file close the cursor returned by
        # db.execute(); do the same here so it is not leaked on every update.
        if cursor is not None:
            cursor.close()
        db.commit()
        return True
    except Exception as e:
        # Deliberate best-effort: a failed update is reported and counted by
        # the caller instead of aborting the whole run.
        print(f"  ✗ DB error: {e}")
        return False


def main():
    """Generate entity descriptions using Ollama LLM.

    Parses the command line, loads the prompt template and the entities with
    missing or short descriptions, asks Ollama for a description of each, and
    writes the result back to the database. With ``--dry-run`` nothing is
    generated or written.

    Returns:
        Process exit code: 0 when no error was counted, 1 when the database
        connection failed, no prompt was found, the prompt template could not
        be formatted, or at least one entity failed.
    """
    parser = argparse.ArgumentParser(description="Generate entity descriptions via Ollama")
    parser.add_argument("--model", default=None, help="Ollama model (default: from DB config)")
    parser.add_argument("--limit", type=int, default=50, help="Max entities to process")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be done")
    parser.add_argument("--verbose", "-v", action="store_true", help="Show generated descriptions")
    args = parser.parse_args()

    # Load the model from the DB config unless one was passed explicitly.
    model = args.model if args.model else get_step_model("enrich")

    print("Entity Description Enrichment")
    print(f"Model: {model}")
    print(f"Limit: {args.limit}")
    print("-" * 50)

    # Connect to database
    db = Database()
    if not db.connect():
        print("✗ Could not connect to database")
        return 1

    max_context_chars = 1500  # cap on the context text inserted into the prompt
    preview_chars = 100  # length of the description preview shown with --verbose

    # try/finally guarantees the connection is released on every exit path,
    # including unexpected exceptions raised while processing.
    try:
        # Get prompt template
        prompt_template = get_prompt_template(db)
        if not prompt_template:
            print("✗ No active prompt found for use_case='entity_description'")
            return 1

        print("✓ Prompt loaded")

        # Get entities to enrich
        entities = get_entities_to_enrich(db, args.limit)
        print(f"✓ Found {len(entities)} entities with short descriptions")

        if not entities:
            print("Nothing to do.")
            return 0

        # Process entities
        success_count = 0
        error_count = 0

        for i, entity in enumerate(entities, 1):
            print(f"\n[{i}/{len(entities)}] {entity['name']} ({entity['type']})")

            # Get context
            context = get_entity_context(db, entity["id"])

            # Build prompt. The template comes from the database, so stray
            # braces or unknown placeholders are possible; the same template
            # is used for every entity, so abort instead of failing repeatedly.
            try:
                prompt = prompt_template.format(
                    entity_name=entity["name"],
                    entity_type=entity["type"],
                    context=context[:max_context_chars],  # Limit context size
                )
            except (KeyError, IndexError, ValueError, AttributeError) as e:
                print(f"  ✗ Invalid prompt template: {e!r}")
                return 1

            if args.dry_run:
                print("  Would generate description...")
                print(f"  Context: {len(context)} chars")
                continue

            # Call Ollama
            description = call_ollama(prompt, model)

            if not description:
                error_count += 1
                continue

            if args.verbose:
                print(f"  Generated: {description[:preview_chars]}...")

            # Update database
            if update_entity_description(db, entity["id"], description):
                success_count += 1
                print(f"  ✓ Updated ({len(description)} chars)")
            else:
                error_count += 1

            # Small delay to not overload Ollama
            time.sleep(0.5)

        # Summary
        print("\n" + "=" * 50)
        print("Summary:")
        print(f"  Processed: {len(entities)}")
        print(f"  Success: {success_count}")
        print(f"  Errors: {error_count}")

        return 0 if error_count == 0 else 1
    finally:
        db.disconnect()


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())

← Übersicht