semantic_extractor.py

Code Hygiene Score: 98

Keine Issues gefunden.

Dependencies 6

Klassen 1

Code

"""Semantik-Extraktion für Wissensextraktion."""

import json
import sys

sys.path.insert(0, "/var/www/scripts/pipeline")

from db import db

from .llm_service import LLMService
from .models import KnowledgeLevel, KnowledgeType


class SemanticExtractor:
    """Extract semantics (meaning/definition) for entities via an LLM.

    The extractor asks the LLM for a definition, usage context and references
    for a batch of entities, persists each answer that can be matched to a
    known entity in the ``entity_semantics`` table, and reports a summary
    through the injected knowledge-storage callback.
    """

    # Limits applied when building the prompt, to keep the request bounded.
    MAX_PROMPT_ENTITIES = 15
    MAX_PROMPT_TEXT_CHARS = 3000

    def __init__(self, llm_service: LLMService, store_knowledge_fn):
        """Initialise with the LLM service and the knowledge-storage callback.

        Args:
            llm_service: Service used for ``call_llm`` / ``parse_json`` and for
                reading the model identification (``model.provider`` /
                ``model.model_name``).
            store_knowledge_fn: Callable invoked as
                ``store_knowledge_fn(level, source_id, knowledge_type, payload)``.
        """
        self.llm = llm_service
        self.store_knowledge = store_knowledge_fn

    def extract_semantics(self, entities: list[dict], text: str, level: KnowledgeLevel, source_id: int) -> list[dict]:
        """Extract and persist semantics (meaning/definition) for entities.

        Args:
            entities: Extracted entities; each dict is read via its ``"name"``
                and ``"id"`` keys.
            text: Source text supplying the context for the LLM.
            level: Knowledge level; its ``value`` is stored as ``source_type``.
            source_id: ID of the source the text belongs to.

        Returns:
            The stored semantic entries (``id``, ``entity_id``, ``entity_name``,
            ``definition``). Empty if no entities were given or nothing could
            be matched/stored.
        """
        if not entities:
            return []

        entity_names = [e["name"] for e in entities[: self.MAX_PROMPT_ENTITIES]]
        entity_list = ", ".join(entity_names)
        excerpt = text[: self.MAX_PROMPT_TEXT_CHARS]

        prompt = f"""Für die folgenden Entitäten aus dem Text, extrahiere die Bedeutung/Definition.

Entitäten: {entity_list}

Für jede Entität gib an:
- definition: Kurze Definition basierend auf dem Text
- context: In welchem Kontext wird sie verwendet
- references: Bezüge zu anderen Konzepten (falls erkennbar)

Antworte NUR als JSON:
{{"semantics": [
    {{"entity": "...", "definition": "...", "context": "...", "references": ["..."]}}
]}}

Text:
{excerpt}"""

        result = self.llm.call_llm(prompt)
        data = self.llm.parse_json(result)

        # The LLM response is untrusted: tolerate a missing/malformed payload
        # instead of failing with AttributeError/TypeError further down.
        semantics = data.get("semantics") if isinstance(data, dict) else None
        if not isinstance(semantics, list):
            semantics = []

        # Case-insensitive name index, built once. setdefault keeps the first
        # entity for a given name, matching a front-to-back linear search.
        entities_by_name: dict[str, dict] = {}
        for candidate in entities:
            name = candidate.get("name")
            if isinstance(name, str):
                entities_by_name.setdefault(name.lower(), candidate)

        # Persist every semantic entry that refers to a known entity.
        stored = []
        for sem in semantics:
            if not isinstance(sem, dict):
                continue
            entity_name = sem.get("entity", "")
            if not isinstance(entity_name, str):
                # e.g. JSON null -- cannot be matched to any entity.
                continue
            entity_match = entities_by_name.get(entity_name.lower())
            if entity_match is None:
                continue
            stored_sem = self._store_semantic(
                entity_id=entity_match["id"],
                definition=sem.get("definition", ""),
                context=sem.get("context", ""),
                references=sem.get("references", []),
                level=level,
                source_id=source_id,
            )
            if stored_sem:
                stored.append(stored_sem)

        # Record a summary of this run in the knowledge table.
        self.store_knowledge(
            level,
            source_id,
            KnowledgeType.SEMANTIC,
            {"definitions": len(stored), "entities": [s["entity_name"] for s in stored]},
        )

        return stored

    @staticmethod
    def _run_query(sql: str, params: tuple, fetch_all: bool = False):
        """Execute a read query and always close its cursor.

        Returns all rows when ``fetch_all`` is true, otherwise the first row
        (or ``None``).
        """
        cursor = db.execute(sql, params)
        try:
            # NOTE(review): fetchall() is assumed to be available on the
            # cursor returned by db.execute (standard DB-API) -- confirm.
            return cursor.fetchall() if fetch_all else cursor.fetchone()
        finally:
            cursor.close()

    @staticmethod
    def _insert_semantic(params: tuple):
        """Insert one ``entity_semantics`` row, commit, and return its row id."""
        cursor = db.execute(
            """INSERT INTO entity_semantics
               (entity_id, definition, context, references_json,
                source_type, source_id, model_used, created_at)
               VALUES (%s, %s, %s, %s, %s, %s, %s, NOW())""",
            params,
        )
        try:
            db.commit()
            return cursor.lastrowid
        finally:
            cursor.close()

    def _store_semantic(
        self, entity_id: int, definition: str, context: str, references: list, level: KnowledgeLevel, source_id: int
    ) -> dict | None:
        """Store a semantic entry, reusing an existing identical one.

        If a row with the same definition already exists for this entity and
        source, its id is reused. If only differing definitions exist, the new
        one is added alongside them (and this is logged). Otherwise a first
        row is inserted.

        Returns:
            Dict with ``id``, ``entity_id``, ``entity_name`` and ``definition``,
            or ``None`` if any error occurred (the error is logged).
        """
        try:
            # Look at *all* rows for this entity/source: comparing against a
            # single arbitrary row would re-insert an already stored
            # definition on every run once two variants exist.
            rows = self._run_query(
                """SELECT id, definition FROM entity_semantics
                   WHERE entity_id = %s AND source_type = %s AND source_id = %s""",
                (entity_id, level.value, source_id),
                fetch_all=True,
            ) or []

            model_name = f"{self.llm.model.provider}:{self.llm.model.model_name}"

            duplicate = next((row for row in rows if row["definition"] == definition), None)
            if duplicate is not None:
                sem_id = duplicate["id"]
            else:
                if rows:
                    db.log("INFO", f"Abweichende Definition für Entität {entity_id}, füge hinzu")
                sem_id = self._insert_semantic(
                    (entity_id, definition, context, json.dumps(references), level.value, source_id, model_name)
                )

            # Fetch the entity name for the returned summary.
            entity = self._run_query("SELECT name FROM entities WHERE id = %s", (entity_id,))

            return {
                "id": sem_id,
                "entity_id": entity_id,
                "entity_name": entity["name"] if entity else "",
                "definition": definition,
            }

        except Exception as e:
            # Best-effort persistence: a failure for one entity must not abort
            # the whole extraction, so log and signal failure with None.
            db.log("ERROR", f"Fehler beim Speichern der Semantik: {e}")
            return None
← Übersicht Graph