taxonomy_extractor.py
- Pfad: /var/www/scripts/pipeline/knowledge/taxonomy_extractor.py
- Namespace: pipeline
- Zeilen: 163 | Größe: 5,826 Bytes
- Geändert: 2025-12-25 15:50:37 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 98
- Dependencies: 90 (25%)
- LOC: 100 (20%)
- Methods: 100 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 100 (10%)
Keine Issues gefunden.
Dependencies 6
- use re
- use sys
- use db.db
- use llm_service.LLMService
- use models.KnowledgeLevel
- use models.KnowledgeType
Klassen 1
- TaxonomyExtractor (class), Zeile 14
Code
"""Taxonomy extraction for knowledge extraction."""
import re
import sys
# Prepend the pipeline directory to the module search path; presumably this is
# what makes the absolute `from db import db` below resolvable — confirm before
# removing.
sys.path.insert(0, "/var/www/scripts/pipeline")
from db import db
from .llm_service import LLMService
from .models import KnowledgeLevel, KnowledgeType
class TaxonomyExtractor:
    """Extract a taxonomy (hierarchical classification) for entities.

    The extractor asks the LLM to assign entities to taxonomy categories,
    persists the resulting entity/term mappings through the module-level
    ``db`` object and reports a summary through the injected storage function.
    """

    # Maximum number of entity names that are listed in the prompt.
    MAX_PROMPT_ENTITIES = 15
    # Number of leading characters of the source text sent as context.
    MAX_CONTEXT_CHARS = 2000
    # Confidence used when the LLM omits the value or returns something unusable.
    DEFAULT_CONFIDENCE = 0.8

    def __init__(self, llm_service: LLMService, store_knowledge_fn):
        """Initialise with the LLM service and the knowledge storage function.

        Args:
            llm_service: Object providing ``call_llm``, ``parse_json`` and ``model``.
            store_knowledge_fn: Callable invoked as
                ``store_knowledge_fn(level, source_id, knowledge_type, payload)``.
        """
        self.llm = llm_service
        self.store_knowledge = store_knowledge_fn

    def extract_taxonomy(self, entities: list[dict], text: str, level: KnowledgeLevel, source_id: int) -> list[dict]:
        """Extract and persist the taxonomy classification for entities.

        Args:
            entities: Entities to classify; each item is read via ``["name"]``
                and ``["id"]``.
            text: Source text; only its first ``MAX_CONTEXT_CHARS`` characters
                are sent to the LLM.
            level: Knowledge level; its ``.value`` is stored with each mapping.
            source_id: Identifier of the source the entities belong to.

        Returns:
            One dict per stored mapping with the keys ``entity_id``,
            ``term_id``, ``category`` and ``confidence``. Empty when
            ``entities`` is empty or nothing could be stored.
        """
        if not entities:
            return []

        existing_terms = self._load_existing_terms()
        prompt = self._build_prompt(entities, existing_terms, text)

        result = self.llm.call_llm(prompt)
        data = self.llm.parse_json(result)
        # The LLM reply is untrusted: tolerate a non-dict payload or a
        # "mappings" value that is not a list instead of crashing.
        mappings = data.get("mappings") if isinstance(data, dict) else None
        if not isinstance(mappings, list):
            mappings = []

        # Case-insensitive index built once; setdefault keeps the first entity
        # for a given name, matching a first-match linear search.
        entities_by_name: dict = {}
        for entity in entities:
            key = self._normalize_name(entity["name"])
            if key:
                entities_by_name.setdefault(key, entity)

        stored = []
        for mapping in mappings:
            if not isinstance(mapping, dict):
                continue
            entity_match = entities_by_name.get(self._normalize_name(mapping.get("entity")))
            if entity_match is None:
                continue
            stored_mapping = self._store_taxonomy_mapping(
                entity_id=entity_match["id"],
                category_name=mapping.get("category", ""),
                parent_category=mapping.get("parent_category"),
                confidence=self._coerce_confidence(mapping.get("confidence", self.DEFAULT_CONFIDENCE)),
                is_new=mapping.get("is_new_category", False),
                existing_terms=existing_terms,
                level=level,
                source_id=source_id,
            )
            if stored_mapping:
                stored.append(stored_mapping)

        # Record a summary of this run in the knowledge table.
        self.store_knowledge(
            level,
            source_id,
            KnowledgeType.TAXONOMY,
            {"mappings": len(stored), "categories": list({m["category"] for m in stored})},
        )
        return stored

    def _store_taxonomy_mapping(
        self,
        entity_id: int,
        category_name: str,
        parent_category: str | None,
        confidence: float,
        is_new: bool,
        existing_terms: list,
        level: KnowledgeLevel,
        source_id: int,
    ) -> dict | None:
        """Persist one entity-to-term mapping, creating the term if allowed.

        Args:
            entity_id: ID of the entity being classified.
            category_name: Category name proposed by the LLM.
            parent_category: Optional name of the parent category.
            confidence: Confidence value stored with the mapping.
            is_new: Whether a missing category may be created.
            existing_terms: Known taxonomy term rows (``id``, ``name``,
                ``path``, ``depth``). A newly created term is appended so later
                calls in the same batch reuse it instead of inserting it again.
            level: Knowledge level; ``level.value`` is stored as source type.
            source_id: Identifier of the source.

        Returns:
            The stored mapping as a dict, or ``None`` when the category is
            blank, unknown and not flagged as new, or when storing failed
            (the error is logged via ``db.log``).
        """
        # A blank category can neither be matched nor sensibly created.
        if not isinstance(category_name, str) or not category_name.strip():
            return None
        category_name = category_name.strip()

        try:
            term = self._find_term(existing_terms, category_name)
            if term is not None:
                term_id = term["id"]
            elif is_new:
                term_id = self._create_term(category_name, parent_category, existing_terms)
            else:
                return None

            model_name = f"{self.llm.model.provider}:{self.llm.model.model_name}"
            cursor = db.execute(
                """INSERT INTO entity_taxonomy_mapping
(entity_id, taxonomy_term_id, confidence, source_type, source_id, model_used, created_at)
VALUES (%s, %s, %s, %s, %s, %s, NOW())
ON DUPLICATE KEY UPDATE confidence = VALUES(confidence)""",
                (entity_id, term_id, confidence, level.value, source_id, model_name),
            )
            try:
                db.commit()
            finally:
                # Release the cursor even when the commit fails.
                cursor.close()
            return {"entity_id": entity_id, "term_id": term_id, "category": category_name, "confidence": confidence}
        except Exception as e:
            # Best effort: one failed mapping must not abort the whole batch.
            db.log("ERROR", f"Fehler beim Speichern der Taxonomie: {e}")
            return None

    def _create_term(self, category_name: str, parent_category: str | None, existing_terms: list) -> int:
        """Insert a new taxonomy term and return its ID.

        The term is placed below ``parent_category`` when that parent is found
        in ``existing_terms``; otherwise it becomes a root term (depth 0).
        """
        parent_id = None
        depth = 0
        path = f"/{category_name}"
        parent_term = self._find_term(existing_terms, parent_category)
        if parent_term is not None:
            parent_id = parent_term["id"]
            depth = parent_term["depth"] + 1
            path = f"{parent_term['path']}/{category_name}"

        slug = self._slugify(category_name)
        cursor = db.execute(
            """INSERT INTO taxonomy_terms (name, slug, parent_id, depth, path, created_at)
VALUES (%s, %s, %s, %s, %s, NOW())""",
            (category_name, slug, parent_id, depth, path),
        )
        try:
            db.commit()
            term_id = cursor.lastrowid
        finally:
            cursor.close()
        db.log("INFO", f"Neuer Taxonomie-Term: '{category_name}' (ID: {term_id})")

        # Make the new term visible to the rest of the batch; without this a
        # second entity in the same category would insert the term again.
        if isinstance(existing_terms, list):
            existing_terms.append({"id": term_id, "name": category_name, "path": path, "depth": depth})
        return term_id

    @staticmethod
    def _load_existing_terms() -> list:
        """Return all taxonomy term rows ordered by depth and name."""
        cursor = db.execute("SELECT id, name, path, depth FROM taxonomy_terms ORDER BY depth, name")
        try:
            # Copy into a list so newly created terms can be appended later.
            return list(cursor.fetchall())
        finally:
            cursor.close()

    @classmethod
    def _build_prompt(cls, entities: list[dict], existing_terms: list, text: str) -> str:
        """Build the LLM prompt from entity names, known categories and context."""
        entity_list = ", ".join(e["name"] for e in entities[: cls.MAX_PROMPT_ENTITIES])
        term_names = [t["name"] for t in existing_terms]
        category_list = ", ".join(term_names) if term_names else "Keine vorhanden"
        context = (text or "")[: cls.MAX_CONTEXT_CHARS]
        return f"""Ordne die folgenden Entitäten in eine hierarchische Taxonomie ein.
Entitäten: {entity_list}
Existierende Taxonomie-Kategorien: {category_list}
Aufgabe:
1. Ordne jede Entität einer passenden Kategorie zu
2. Wenn keine passende Kategorie existiert, schlage eine neue vor
3. Gib die hierarchische Einordnung an
Antworte NUR als JSON:
{{"mappings": [
{{"entity": "...", "category": "...", "parent_category": null, "confidence": 0.0-1.0, "is_new_category": false}}
]}}
Text-Kontext:
{context}"""

    @staticmethod
    def _normalize_name(value) -> str:
        """Return the lookup key for a name; non-strings yield an empty key."""
        if not isinstance(value, str):
            return ""
        return value.strip().lower()

    @staticmethod
    def _find_term(terms, name):
        """Return the first term whose name equals ``name`` ignoring case, else ``None``."""
        if not isinstance(name, str) or not name.strip():
            return None
        wanted = name.strip().lower()
        return next((t for t in terms if (t["name"] or "").lower() == wanted), None)

    @classmethod
    def _coerce_confidence(cls, value) -> float:
        """Convert an LLM-provided confidence to a float within [0.0, 1.0].

        Unparseable values and NaN fall back to ``DEFAULT_CONFIDENCE``.
        """
        try:
            number = float(value)
        except (TypeError, ValueError):
            return cls.DEFAULT_CONFIDENCE
        if number != number:  # NaN is the only value unequal to itself
            return cls.DEFAULT_CONFIDENCE
        return min(1.0, max(0.0, number))

    @staticmethod
    def _slugify(name: str) -> str:
        """Return a lowercase slug with runs of non ``a-z0-9`` collapsed to ``-``."""
        # NOTE(review): non-ASCII letters (e.g. German umlauts) are replaced by
        # "-" here; left unchanged because slugs are persisted — confirm whether
        # transliteration is wanted.
        return re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-")