"""
Analyzer classes for semantic chunk analysis.
"""
from .models import ChunkSemantics, Entity, Relation
from .ollama_service import OllamaService
class SemanticsAnalyzer:
    """Analyzes chunk semantics: summary, keywords, sentiment, topics, language.

    Sends a German-language analysis prompt to the LLM and coerces the
    loosely structured JSON reply into a validated ChunkSemantics object.
    """

    PROMPT = """Analysiere diesen deutschen Text und erstelle eine semantische Analyse.
Text:
{text}
Antworte NUR als JSON:
{{
"summary": "Zusammenfassung in 1-2 Sätzen",
"keywords": ["keyword1", "keyword2", "keyword3"],
"sentiment": "positive|neutral|negative|mixed",
"topics": ["thema1", "thema2"],
"language": "de|en"
}}"""

    def __init__(self, llm: OllamaService):
        self.llm = llm

    @staticmethod
    def _first_or(value, default):
        """Unwrap a one-element-list LLM answer; fall back to *default* if empty."""
        if isinstance(value, list):
            return value[0] if value else default
        return value

    @staticmethod
    def _coerce_str_list(value, limit: int) -> list[str]:
        """Normalize an LLM field into a flat list of strings, capped at *limit*.

        A non-list truthy scalar becomes a one-element list; nested lists and
        dicts inside the list are dropped as malformed output.
        """
        if not isinstance(value, list):
            value = [str(value)] if value else []
        return [str(v) for v in value if v and not isinstance(v, (list, dict))][:limit]

    def analyze(self, chunk_id: int, text: str) -> ChunkSemantics | None:
        """Analyze one chunk; returns None when the LLM yields no result."""
        # Truncate input to keep the prompt within a safe context size.
        result = self.llm.generate(self.PROMPT.format(text=text[:2000]))
        if not result:
            return None
        summary = self._first_or(result.get("summary", ""), "")
        language = self._first_or(result.get("language", "de"), "de")
        return ChunkSemantics(
            chunk_id=chunk_id,
            summary=str(summary)[:1000],
            keywords=self._coerce_str_list(result.get("keywords", []), 10),
            sentiment=self._validate_sentiment(result.get("sentiment", "neutral")),
            topics=self._coerce_str_list(result.get("topics", []), 5),
            language=str(language)[:5],
        )

    def _validate_sentiment(self, sentiment) -> str:
        """Validate the sentiment value; anything unrecognized maps to 'neutral'."""
        sentiment = self._first_or(sentiment, "neutral")
        if not isinstance(sentiment, str):
            return "neutral"
        valid = {"positive", "neutral", "negative", "mixed"}
        return sentiment.lower() if sentiment.lower() in valid else "neutral"
class EntityExtractor:
    """Extracts named entities from text via the LLM."""

    PROMPT = """Extrahiere alle wichtigen Entitäten aus diesem deutschen Text.
Kategorien:
- PERSON: Namen von Personen
- ORGANIZATION: Firmen, Institutionen
- CONCEPT: Fachbegriffe, Methoden, Theorien
- LOCATION: Orte, Länder
- OTHER: Sonstiges
Text:
{text}
Antworte NUR als JSON:
{{
"entities": [
{{"name": "Name", "type": "CONCEPT", "description": "Kurze Beschreibung"}}
]
}}"""

    def __init__(self, llm: OllamaService):
        self.llm = llm

    def extract(self, text: str) -> list[Entity]:
        """Extract entities from text; at most 20, empty list on LLM failure."""
        result = self.llm.generate(self.PROMPT.format(text=text[:2000]))
        if not result:
            return []
        raw = result.get("entities", [])
        # Guard: the LLM may return a scalar or dict instead of a list,
        # which would otherwise raise AttributeError below.
        if not isinstance(raw, list):
            return []
        entities = []
        for e in raw:
            if not isinstance(e, dict):
                continue  # skip malformed items (strings, numbers, nested lists)
            name = e.get("name")
            etype = e.get("type")
            desc = e.get("description")
            # LLMs occasionally wrap scalar values in one-element lists.
            if isinstance(name, list):
                name = name[0] if name else None
            if isinstance(etype, list):
                etype = etype[0] if etype else None
            if isinstance(desc, list):
                desc = desc[0] if desc else None
            if name and isinstance(name, str) and etype:
                entities.append(
                    Entity(
                        name=str(name)[:200],
                        entity_type=self._validate_type(str(etype)),
                        description=str(desc)[:500] if desc else None,
                    )
                )
        return entities[:20]

    def _validate_type(self, entity_type: str) -> str:
        """Validate the entity type; unknown types map to OTHER."""
        valid = {"PERSON", "ORGANIZATION", "CONCEPT", "LOCATION", "OTHER"}
        return entity_type.upper() if entity_type.upper() in valid else "OTHER"
class RelationExtractor:
    """Extracts relationships between previously found entities."""

    PROMPT = """Finde Beziehungen zwischen diesen Entitäten im Text.
Entitäten: {entities}
Beziehungstypen:
- RELATED_TO: steht in Beziehung zu
- PART_OF: ist Teil von
- DEVELOPED_BY: wurde entwickelt von
- USED_IN: wird verwendet in
- INFLUENCED_BY: wurde beeinflusst von
Text:
{text}
Antworte NUR als JSON:
{{
"relations": [
{{"source": "Entity1", "relation": "RELATED_TO", "target": "Entity2", "strength": 0.8}}
]
}}"""

    def __init__(self, llm: OllamaService):
        self.llm = llm

    def extract(self, text: str, entities: list[Entity]) -> list[Relation]:
        """Extract relations between entities; at most 10, empty on failure.

        Needs at least two entities to form a relation; only the first 15
        entity names are offered to the LLM to bound prompt size.
        """
        if len(entities) < 2:
            return []
        entity_names = ", ".join([e.name for e in entities[:15]])
        result = self.llm.generate(self.PROMPT.format(entities=entity_names, text=text[:1500]))
        if not result:
            return []
        raw = result.get("relations", [])
        # Guard: a non-list reply would otherwise raise AttributeError below.
        if not isinstance(raw, list):
            return []
        relations = []
        for r in raw:
            if not isinstance(r, dict):
                continue  # skip malformed items
            source = r.get("source")
            target = r.get("target")
            rel_type = r.get("relation")
            strength = r.get("strength", 0.5)
            # LLMs occasionally wrap scalar values in one-element lists.
            if isinstance(source, list):
                source = source[0] if source else None
            if isinstance(target, list):
                target = target[0] if target else None
            if isinstance(rel_type, list):
                rel_type = rel_type[0] if rel_type else None
            if source and target and rel_type and isinstance(source, str) and isinstance(target, str):
                try:
                    strength_val = float(strength) if not isinstance(strength, list) else 0.5
                except (ValueError, TypeError):
                    strength_val = 0.5  # non-numeric strength from the LLM
                relations.append(
                    Relation(
                        source=str(source)[:200],
                        relation_type=str(rel_type)[:50],
                        target=str(target)[:200],
                        # Clamp to the documented [0.0, 1.0] range.
                        strength=min(1.0, max(0.0, strength_val)),
                    )
                )
        return relations[:10]
class TaxonomyClassifier:
    """Classifies chunks into fixed taxonomy categories."""

    # Closed vocabulary; LLM answers outside this list are discarded.
    CATEGORIES = [
        "Methoden",
        "Theorie",
        "Praxis",
        "Kommunikation",
        "Organisation",
        "Entwicklung",
        "Coaching",
        "Therapie",
    ]

    PROMPT = """Klassifiziere diesen Text in passende Kategorien.
Kategorien: {categories}
Text:
{text}
Antworte NUR als JSON:
{{
"categories": ["Kategorie1", "Kategorie2"],
"confidence": 0.8
}}"""

    def __init__(self, llm: OllamaService):
        self.llm = llm

    def classify(self, text: str) -> tuple[list[str], float]:
        """Classify text; returns (up to 3 known categories, confidence in [0, 1])."""
        result = self.llm.generate(
            self.PROMPT.format(categories=", ".join(self.CATEGORIES), text=text[:1500])
        )
        if not result:
            return [], 0.0
        raw = result.get("categories", [])
        # Guard: a bare string reply would otherwise be iterated char by char.
        if not isinstance(raw, list):
            raw = [raw]
        categories = [c for c in raw if c in self.CATEGORIES]
        confidence = result.get("confidence", 0.5)
        if isinstance(confidence, list):
            confidence = confidence[0] if confidence else 0.5
        # Fix: the previous unguarded float() raised ValueError/TypeError on
        # non-numeric LLM output; fall back to the neutral default instead.
        try:
            conf_val = float(confidence)
        except (ValueError, TypeError):
            conf_val = 0.5
        return categories[:3], min(1.0, max(0.0, conf_val))