generate_entity_descriptions.py
- Pfad: /var/www/scripts/pipeline/generate_entity_descriptions.py
- Namespace: pipeline
- Zeilen: 221 | Größe: 6,329 Bytes
- Geändert: 2025-12-31 03:01:09 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 88
- Dependencies: 60 (25%)
- LOC: 93 (20%)
- Methods: 100 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 90 (10%)
Issues 1
| Zeile | Typ | Beschreibung |
|-------|-----|--------------|
| 196 | magic_number | Magic Number gefunden: 100 |
Dependencies 9
- use argparse
- use os
- use sys
- use time
- use requests
- use constants.DEFAULT_LIMIT
- use constants.OLLAMA_TIMEOUT
- use db.Database
- use pipeline_config.get_step_model
Funktionen 6
- get_prompt_template() (Zeile 34)
- get_entities_to_enrich() (Zeile 46)
- get_entity_context() (Zeile 64)
- call_ollama() (Zeile 85)
- update_entity_description() (Zeile 109)
- main() (Zeile 125)
Code
#!/usr/bin/env python3
"""
Entity Description Enrichment Script
=====================================
Generates detailed descriptions for entities via Ollama.
Prerequisites:
- Ollama is running locally (http://localhost:11434)
- Prompt in ki_content.prompts (use_case='entity_description')
Usage:
python generate_entity_descriptions.py [--model MODEL] [--limit N] [--dry-run] [--verbose]
"""
import argparse
import os
import sys
import time
import requests
# Add parent directory for imports
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from constants import DEFAULT_LIMIT, OLLAMA_TIMEOUT
from db import Database
from pipeline_config import get_step_model
# Configuration
# Endpoint that call_ollama() sends its generation requests to.
OLLAMA_URL = "http://localhost:11434/api/generate"
# Entities whose description is NULL or shorter than this many characters
# are selected for enrichment by get_entities_to_enrich().
MIN_DESCRIPTION_LENGTH = 50
def get_prompt_template(db: Database) -> str | None:
    """Fetch the active ``entity_description`` prompt text from the database.

    Of all active prompts with that use case, the one with the highest id
    wins.

    Args:
        db: Connected database handle whose ``execute`` returns a cursor.

    Returns:
        The ``content`` column of the selected prompt row, or ``None`` when
        no matching row exists.
    """
    query = """
        SELECT content FROM prompts
        WHERE use_case = 'entity_description' AND is_active = 1
        ORDER BY id DESC LIMIT 1
    """
    cur = db.execute(query)
    record = cur.fetchone()
    cur.close()

    if not record:
        return None
    return record["content"]
def get_entities_to_enrich(db: Database, limit: int = DEFAULT_LIMIT) -> list:
    """Select entities whose description is missing or too short.

    An entity qualifies when its description is NULL or has fewer than
    ``MIN_DESCRIPTION_LENGTH`` characters. Rows are returned in ascending
    id order.

    Args:
        db: Connected database handle whose ``execute`` returns a cursor.
        limit: Maximum number of rows to return.

    Returns:
        The fetched rows, each carrying ``id``, ``name``, ``type`` and
        ``description``.
    """
    query = """
        SELECT id, name, type, description
        FROM entities
        WHERE description IS NULL
        OR CHAR_LENGTH(description) < %s
        ORDER BY id
        LIMIT %s
    """
    params = (MIN_DESCRIPTION_LENGTH, limit)

    cur = db.execute(query, params)
    rows = cur.fetchall()
    cur.close()
    return rows
def get_entity_context(
    db: Database,
    entity_id: int,
    max_chunks: int = 3,
    max_chunk_chars: int = 500,
) -> str:
    """Build a context string from the chunks that mention an entity.

    Args:
        db: Connected database handle whose ``execute`` returns a cursor
            yielding rows that support ``row["content"]`` lookups.
        entity_id: ID of the entity whose linked chunks are looked up.
        max_chunks: Maximum number of chunks to fetch.
        max_chunk_chars: Maximum number of characters taken from each chunk.

    Returns:
        The chunk excerpts joined by a ``---`` separator line, or the
        placeholder ``"(Kein Kontext verfügbar)"`` when no chunk with
        non-empty content is linked to the entity.
    """
    cursor = db.execute(
        """
        SELECT c.content
        FROM chunk_entities ce
        JOIN chunks c ON ce.chunk_id = c.id
        WHERE ce.entity_id = %s
        ORDER BY c.id
        LIMIT %s
        """,
        (entity_id, max_chunks),
    )
    # ORDER BY makes the LIMIT pick the same chunks on every run; without it
    # the database is free to return any matching rows.
    try:
        chunks = cursor.fetchall()
    finally:
        # Release the cursor even if fetching fails.
        cursor.close()

    # Skip NULL/empty contents: slicing None would raise, and an empty
    # excerpt would only add a dangling separator to the prompt.
    excerpts = [
        chunk["content"][:max_chunk_chars]
        for chunk in chunks
        if chunk["content"]
    ]
    if not excerpts:
        return "(Kein Kontext verfügbar)"
    return "\n\n---\n\n".join(excerpts)
def call_ollama(
    prompt: str,
    model: str,
    temperature: float = 0.7,
    num_predict: int = 300,
) -> str | None:
    """Send a non-streaming generation request to Ollama and return its text.

    Args:
        prompt: Full prompt text to send.
        model: Name of the Ollama model to use.
        temperature: Value passed as the ``temperature`` generation option.
        num_predict: Value passed as the ``num_predict`` generation option.

    Returns:
        The generated text with surrounding whitespace stripped (an empty
        string when the reply has no ``response`` field), or ``None`` when
        the request fails or the reply cannot be interpreted. Failures are
        reported on stdout rather than raised.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": temperature,
            "num_predict": num_predict,
        },
    }
    try:
        response = requests.post(OLLAMA_URL, json=payload, timeout=OLLAMA_TIMEOUT)
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.RequestException as e:
        print(f" ✗ Ollama error: {e}")
        return None
    except ValueError as e:
        # Older requests releases raise a plain ValueError (not a
        # RequestException) when the body is not valid JSON.
        print(f" ✗ Ollama error: invalid JSON response: {e}")
        return None

    if not isinstance(result, dict):
        print(" ✗ Ollama error: unexpected response payload")
        return None

    text = result.get("response", "")
    if text is None:
        # A JSON null would otherwise crash on .strip(); treat it like a
        # missing field.
        text = ""
    if not isinstance(text, str):
        print(" ✗ Ollama error: response field is not text")
        return None
    return text.strip()
def update_entity_description(db: Database, entity_id: int, description: str) -> bool:
    """Store a new description for one entity and commit the change.

    Args:
        db: Connected database handle providing ``execute`` and ``commit``.
        entity_id: ID of the entity row to update.
        description: New description text.

    Returns:
        ``True`` when the update and commit succeeded, ``False`` when either
        raised; the error is printed instead of propagated.
    """
    statement = """
        UPDATE entities SET description = %s WHERE id = %s
    """
    params = (description, entity_id)

    try:
        db.execute(statement, params)
        db.commit()
    except Exception as exc:
        print(f" ✗ DB error: {exc}")
        return False
    return True
# Maximum number of context characters interpolated into one prompt.
MAX_PROMPT_CONTEXT_CHARS = 1500
# Number of characters of a generated description echoed by --verbose.
VERBOSE_PREVIEW_CHARS = 100
# Pause in seconds between consecutive entities so Ollama is not overloaded.
OLLAMA_REQUEST_DELAY_SECONDS = 0.5


def _enrich_entities(db: Database, model: str, args: argparse.Namespace) -> int:
    """Run the enrichment loop on an open connection and return the exit code.

    Loads the prompt template, selects the entities to enrich, generates a
    description for each one and stores it. The caller owns the connection
    and is responsible for disconnecting.
    """
    prompt_template = get_prompt_template(db)
    if not prompt_template:
        print("✗ No active prompt found for use_case='entity_description'")
        return 1
    print("✓ Prompt loaded")

    entities = get_entities_to_enrich(db, args.limit)
    print(f"✓ Found {len(entities)} entities with short descriptions")
    if not entities:
        print("Nothing to do.")
        return 0

    total = len(entities)
    success_count = 0
    error_count = 0
    for index, entity in enumerate(entities, 1):
        print(f"\n[{index}/{total}] {entity['name']} ({entity['type']})")

        context = get_entity_context(db, entity["id"])
        try:
            prompt = prompt_template.format(
                entity_name=entity["name"],
                entity_type=entity["type"],
                context=context[:MAX_PROMPT_CONTEXT_CHARS],  # Limit context size
            )
        except (KeyError, IndexError, ValueError) as e:
            # Unknown placeholders or stray braces in the stored template
            # fail identically for every entity, so stop instead of retrying.
            print(f" ✗ Invalid prompt template: {e}")
            return 1

        if args.dry_run:
            print(" Would generate description...")
            print(f" Context: {len(context)} chars")
            continue

        description = call_ollama(prompt, model)
        if not description:
            error_count += 1
            continue

        if args.verbose:
            preview = description[:VERBOSE_PREVIEW_CHARS]
            # Only signal truncation when something was actually cut off.
            suffix = "..." if len(description) > VERBOSE_PREVIEW_CHARS else ""
            print(f" Generated: {preview}{suffix}")

        if update_entity_description(db, entity["id"], description):
            success_count += 1
            print(f" ✓ Updated ({len(description)} chars)")
        else:
            error_count += 1

        # Small delay to not overload Ollama; pointless after the last entity.
        if index < total:
            time.sleep(OLLAMA_REQUEST_DELAY_SECONDS)

    print("\n" + "=" * 50)
    print("Summary:")
    print(f" Processed: {total}")
    print(f" Success: {success_count}")
    print(f" Errors: {error_count}")
    return 0 if error_count == 0 else 1


def main():
    """Generate entity descriptions using Ollama LLM.

    Parses the command line, connects to the database and enriches up to
    ``--limit`` entities. With ``--dry-run`` nothing is generated or written.

    Returns:
        Process exit code: ``0`` when every processed entity succeeded (or
        there was nothing to do), ``1`` on connection failure, missing or
        invalid prompt template, or when at least one entity failed.
    """
    parser = argparse.ArgumentParser(description="Generate entity descriptions via Ollama")
    parser.add_argument("--model", default=None, help="Ollama model (default: from DB config)")
    parser.add_argument("--limit", type=int, default=50, help="Max entities to process")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be done")
    parser.add_argument("--verbose", "-v", action="store_true", help="Show generated descriptions")
    args = parser.parse_args()

    # A negative value would reach the SQL LIMIT clause and fail there.
    if args.limit < 0:
        parser.error("--limit must not be negative")

    # Load the model from the DB config unless one was passed explicitly.
    model = args.model if args.model else get_step_model("enrich")

    print("Entity Description Enrichment")
    print(f"Model: {model}")
    print(f"Limit: {args.limit}")
    print("-" * 50)

    db = Database()
    if not db.connect():
        print("✗ Could not connect to database")
        return 1

    try:
        return _enrich_entities(db, model, args)
    finally:
        # Always release the connection, including on unexpected exceptions.
        db.disconnect()
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())