semantic_extractor.py
- Pfad: /var/www/scripts/pipeline/knowledge/semantic_extractor.py
- Namespace: pipeline
- Zeilen: 146 | Größe: 5,296 Bytes
- Geändert: 2025-12-25 15:50:37 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 98
- Dependencies: 90 (25%)
- LOC: 100 (20%)
- Methods: 100 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 100 (10%)
Keine Issues gefunden.
Dependencies 6
- use json
- use sys
- use db.db
- use llm_service.LLMService
- use models.KnowledgeLevel
- use models.KnowledgeType
Klassen 1
- SemanticExtractor (class), Zeile 14
Code
"""Semantic extraction for knowledge extraction."""
import json
import sys
# Prepend the pipeline root to the import search path before the project
# imports below run (presumably so that `from db import db` resolves when the
# module is loaded outside its package -- confirm against the deployment).
sys.path.insert(0, "/var/www/scripts/pipeline")
from db import db
from .llm_service import LLMService
from .models import KnowledgeLevel, KnowledgeType
class SemanticExtractor:
"""Extrahiert Semantik (Bedeutung/Definition) für Entitäten."""
def __init__(self, llm_service: LLMService, store_knowledge_fn):
"""Initialisiere mit LLM-Service und Storage-Funktion."""
self.llm = llm_service
self.store_knowledge = store_knowledge_fn
def extract_semantics(self, entities: list[dict], text: str, level: KnowledgeLevel, source_id: int) -> list[dict]:
"""
Extrahiere Semantik (Bedeutung/Definition) für Entitäten.
Args:
entities: Liste der extrahierten Entitäten
text: Ursprungstext für Kontext
level: Ebene
source_id: Quell-ID
Returns:
Liste von Semantik-Einträgen
"""
if not entities:
return []
entity_names = [e["name"] for e in entities[:15]]
prompt = f"""Für die folgenden Entitäten aus dem Text, extrahiere die Bedeutung/Definition.
Entitäten: {", ".join(entity_names)}
Für jede Entität gib an:
- definition: Kurze Definition basierend auf dem Text
- context: In welchem Kontext wird sie verwendet
- references: Bezüge zu anderen Konzepten (falls erkennbar)
Antworte NUR als JSON:
{{"semantics": [
{{"entity": "...", "definition": "...", "context": "...", "references": ["..."]}}
]}}
Text:
{text[:3000]}"""
result = self.llm.call_llm(prompt)
data = self.llm.parse_json(result)
semantics = data.get("semantics", [])
# Speichere Semantik
stored = []
for sem in semantics:
entity_name = sem.get("entity", "")
entity_match = next((e for e in entities if e["name"].lower() == entity_name.lower()), None)
if entity_match:
stored_sem = self._store_semantic(
entity_id=entity_match["id"],
definition=sem.get("definition", ""),
context=sem.get("context", ""),
references=sem.get("references", []),
level=level,
source_id=source_id,
)
if stored_sem:
stored.append(stored_sem)
# Speichere in Knowledge-Tabelle
self.store_knowledge(
level,
source_id,
KnowledgeType.SEMANTIC,
{"definitions": len(stored), "entities": [s["entity_name"] for s in stored]},
)
return stored
def _store_semantic(
self, entity_id: int, definition: str, context: str, references: list, level: KnowledgeLevel, source_id: int
) -> dict | None:
"""Speichere Semantik-Eintrag mit Abgleich."""
try:
# Prüfe ob bereits Semantik existiert
cursor = db.execute(
"""SELECT id, definition FROM entity_semantics
WHERE entity_id = %s AND source_type = %s AND source_id = %s""",
(entity_id, level.value, source_id),
)
existing = cursor.fetchone()
cursor.close()
model_name = f"{self.llm.model.provider}:{self.llm.model.model_name}"
if existing:
if existing["definition"] != definition:
db.log("INFO", f"Abweichende Definition für Entität {entity_id}, füge hinzu")
cursor = db.execute(
"""INSERT INTO entity_semantics
(entity_id, definition, context, references_json,
source_type, source_id, model_used, created_at)
VALUES (%s, %s, %s, %s, %s, %s, %s, NOW())""",
(entity_id, definition, context, json.dumps(references), level.value, source_id, model_name),
)
db.commit()
sem_id = cursor.lastrowid
cursor.close()
else:
sem_id = existing["id"]
else:
cursor = db.execute(
"""INSERT INTO entity_semantics
(entity_id, definition, context, references_json,
source_type, source_id, model_used, created_at)
VALUES (%s, %s, %s, %s, %s, %s, %s, NOW())""",
(entity_id, definition, context, json.dumps(references), level.value, source_id, model_name),
)
db.commit()
sem_id = cursor.lastrowid
cursor.close()
# Hole Entity-Name für Rückgabe
cursor = db.execute("SELECT name FROM entities WHERE id = %s", (entity_id,))
entity = cursor.fetchone()
cursor.close()
return {
"id": sem_id,
"entity_id": entity_id,
"entity_name": entity["name"] if entity else "",
"definition": definition,
}
except Exception as e:
db.log("ERROR", f"Fehler beim Speichern der Semantik: {e}")
return None