taxonomy_extractor.py

Code Hygiene Score: 98

Keine Issues gefunden.

Dependencies 6

Klassen 1

Code

"""Taxonomie-Extraktion für Wissensextraktion."""

import re
import sys
import unicodedata

sys.path.insert(0, "/var/www/scripts/pipeline")

from db import db

from .llm_service import LLMService
from .models import KnowledgeLevel, KnowledgeType


class TaxonomyExtractor:
    """Extrahiert Taxonomie (hierarchische Einordnung) für Entitäten."""

    def __init__(self, llm_service: LLMService, store_knowledge_fn):
        """Initialisiere mit LLM-Service und Storage-Funktion."""
        self.llm = llm_service
        self.store_knowledge = store_knowledge_fn

    def extract_taxonomy(self, entities: list[dict], text: str, level: KnowledgeLevel, source_id: int) -> list[dict]:
        """
        Extrahiere Taxonomie (hierarchische Einordnung) für Entitäten.

        Args:
            entities: Liste der Entitäten
            text: Ursprungstext
            level: Ebene
            source_id: Quell-ID

        Returns:
            Liste von Taxonomie-Zuordnungen
        """
        if not entities:
            return []

        # Hole existierende Taxonomie-Terme
        cursor = db.execute("SELECT id, name, path, depth FROM taxonomy_terms ORDER BY depth, name")
        existing_terms = cursor.fetchall()
        cursor.close()

        term_names = [t["name"] for t in existing_terms]
        entity_names = [e["name"] for e in entities[:15]]

        prompt = f"""Ordne die folgenden Entitäten in eine hierarchische Taxonomie ein.

Entitäten: {", ".join(entity_names)}

Existierende Taxonomie-Kategorien: {", ".join(term_names) if term_names else "Keine vorhanden"}

Aufgabe:
1. Ordne jede Entität einer passenden Kategorie zu
2. Wenn keine passende Kategorie existiert, schlage eine neue vor
3. Gib die hierarchische Einordnung an

Antworte NUR als JSON:
{{"mappings": [
    {{"entity": "...", "category": "...", "parent_category": null, "confidence": 0.0-1.0, "is_new_category": false}}
]}}

Text-Kontext:
{text[:2000]}"""

        result = self.llm.call_llm(prompt)
        data = self.llm.parse_json(result)
        mappings = data.get("mappings", [])

        # Speichere Taxonomie-Zuordnungen
        stored = []
        for mapping in mappings:
            entity_match = next((e for e in entities if e["name"].lower() == mapping.get("entity", "").lower()), None)
            if entity_match:
                stored_mapping = self._store_taxonomy_mapping(
                    entity_id=entity_match["id"],
                    category_name=mapping.get("category", ""),
                    parent_category=mapping.get("parent_category"),
                    confidence=mapping.get("confidence", 0.8),
                    is_new=mapping.get("is_new_category", False),
                    existing_terms=existing_terms,
                    level=level,
                    source_id=source_id,
                )
                if stored_mapping:
                    stored.append(stored_mapping)

        # Speichere in Knowledge-Tabelle
        self.store_knowledge(
            level,
            source_id,
            KnowledgeType.TAXONOMY,
            {"mappings": len(stored), "categories": list({m["category"] for m in stored})},
        )

        return stored

    def _store_taxonomy_mapping(
        self,
        entity_id: int,
        category_name: str,
        parent_category: str | None,
        confidence: float,
        is_new: bool,
        existing_terms: list,
        level: KnowledgeLevel,
        source_id: int,
    ) -> dict | None:
        """Speichere Taxonomie-Zuordnung."""
        try:
            # Finde oder erstelle Taxonomie-Term
            term = next((t for t in existing_terms if t["name"].lower() == category_name.lower()), None)

            if term:
                term_id = term["id"]
            elif is_new:
                # Neuen Term anlegen
                parent_id = None
                depth = 0
                path = f"/{category_name}"

                if parent_category:
                    parent_term = next(
                        (t for t in existing_terms if t["name"].lower() == parent_category.lower()), None
                    )
                    if parent_term:
                        parent_id = parent_term["id"]
                        depth = parent_term["depth"] + 1
                        path = f"{parent_term['path']}/{category_name}"

                # Erstelle Slug
                slug = re.sub(r"[^a-z0-9]+", "-", category_name.lower()).strip("-")

                cursor = db.execute(
                    """INSERT INTO taxonomy_terms (name, slug, parent_id, depth, path, created_at)
                       VALUES (%s, %s, %s, %s, %s, NOW())""",
                    (category_name, slug, parent_id, depth, path),
                )
                db.commit()
                term_id = cursor.lastrowid
                cursor.close()
                db.log("INFO", f"Neuer Taxonomie-Term: '{category_name}' (ID: {term_id})")
            else:
                return None

            model_name = f"{self.llm.model.provider}:{self.llm.model.model_name}"

            # Speichere Zuordnung
            cursor = db.execute(
                """INSERT INTO entity_taxonomy_mapping
                   (entity_id, taxonomy_term_id, confidence, source_type, source_id, model_used, created_at)
                   VALUES (%s, %s, %s, %s, %s, %s, NOW())
                   ON DUPLICATE KEY UPDATE confidence = VALUES(confidence)""",
                (entity_id, term_id, confidence, level.value, source_id, model_name),
            )
            db.commit()
            cursor.close()

            return {"entity_id": entity_id, "term_id": term_id, "category": category_name, "confidence": confidence}

        except Exception as e:
            db.log("ERROR", f"Fehler beim Speichern der Taxonomie: {e}")
            return None
← Übersicht Graph