analyzers.py

Pfad: /var/www/scripts/pipeline/semantic_chunk/analyzers.py
Namespace: pipeline
Zeilen: 240 | Größe: 7,388 Bytes
Geändert: 2025-12-25 14:04:59 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 96

Dependencies: 100 (25%)
LOC: 86 (20%)
Methods: 100 (20%)
Secrets: 100 (15%)
Classes: 100 (10%)
Magic Numbers: 90 (10%)
Issues 1

Zeile	Typ	Beschreibung
55	magic_number	Magic Number gefunden: 1000
Dependencies 4

use models.ChunkSemantics
use models.Entity
use models.Relation
use ollama_service.OllamaService
Klassen 4

SemanticsAnalyzer class Zeile 9
EntityExtractor class Zeile 72
RelationExtractor class Zeile 132
TaxonomyClassifier class Zeile 200
Code

"""
Analyzer classes for semantic chunk analysis.
"""

from .models import ChunkSemantics, Entity, Relation
from .ollama_service import OllamaService


class SemanticsAnalyzer:
    """Analyze chunk semantics: summary, keywords, sentiment, topics, language.

    A fixed German prompt is sent to the injected LLM service and the reply is
    normalized into a ``ChunkSemantics`` record. The reply is used as a dict
    (presumably the model's parsed JSON answer -- confirm against
    ``OllamaService.generate``); anything else is treated as "no result".
    """

    PROMPT = """Analysiere diesen deutschen Text und erstelle eine semantische Analyse.

Text:
{text}

Antworte NUR als JSON:
{{
    "summary": "Zusammenfassung in 1-2 Sätzen",
    "keywords": ["keyword1", "keyword2", "keyword3"],
    "sentiment": "positive|neutral|negative|mixed",
    "topics": ["thema1", "thema2"],
    "language": "de|en"
}}"""

    # Limits applied to the prompt input and to the normalized model output.
    MAX_INPUT_CHARS = 2000
    MAX_SUMMARY_CHARS = 1000
    MAX_KEYWORDS = 10
    MAX_TOPICS = 5
    MAX_LANGUAGE_CHARS = 5

    DEFAULT_LANGUAGE = "de"
    DEFAULT_SENTIMENT = "neutral"
    VALID_SENTIMENTS = frozenset({"positive", "neutral", "negative", "mixed"})

    def __init__(self, llm: OllamaService):
        """Store the LLM service used to run the analysis prompt."""
        self.llm = llm

    def analyze(self, chunk_id: int, text: str) -> ChunkSemantics | None:
        """Analyze one chunk and return its normalized semantics.

        Args:
            chunk_id: Identifier passed through unchanged to the result.
            text: Chunk text; only the first ``MAX_INPUT_CHARS`` characters
                are put into the prompt.

        Returns:
            A ``ChunkSemantics`` instance, or ``None`` when the LLM returned
            nothing usable (falsy or not a dict).
        """
        result = self.llm.generate(self.PROMPT.format(text=text[: self.MAX_INPUT_CHARS]))
        if not result or not isinstance(result, dict):
            return None

        # A JSON null must become "", not the literal string "None".
        summary = self._first_if_list(result.get("summary"))
        summary_text = "" if summary is None else str(summary)

        # Null/empty language falls back to the default instead of "None"/"".
        language = self._first_if_list(result.get("language"))
        language_text = str(language).strip() if language else ""
        if not language_text:
            language_text = self.DEFAULT_LANGUAGE

        return ChunkSemantics(
            chunk_id=chunk_id,
            summary=summary_text[: self.MAX_SUMMARY_CHARS],
            keywords=self._string_list(result.get("keywords"), self.MAX_KEYWORDS),
            sentiment=self._validate_sentiment(result.get("sentiment", self.DEFAULT_SENTIMENT)),
            topics=self._string_list(result.get("topics"), self.MAX_TOPICS),
            language=language_text[: self.MAX_LANGUAGE_CHARS],
        )

    @staticmethod
    def _first_if_list(value):
        """Return the first element of a list (``None`` if empty), else ``value``."""
        if isinstance(value, list):
            return value[0] if value else None
        return value

    @staticmethod
    def _string_list(value, limit: int) -> list[str]:
        """Coerce a model-supplied value into at most ``limit`` non-empty strings.

        A lone scalar is treated as a one-element list. Falsy items and nested
        lists/dicts are dropped; remaining items are stringified and stripped.
        """
        if not isinstance(value, list):
            value = [value] if value else []
        cleaned = []
        for item in value:
            if not item or isinstance(item, (list, dict)):
                continue
            item_text = str(item).strip()
            if item_text:
                cleaned.append(item_text)
        return cleaned[:limit]

    def _validate_sentiment(self, sentiment) -> str:
        """Return a lower-cased valid sentiment, or ``"neutral"`` as fallback.

        A list is reduced to its first element; any non-string value or a
        string outside ``VALID_SENTIMENTS`` yields the fallback.
        """
        sentiment = self._first_if_list(sentiment)
        if not isinstance(sentiment, str):
            return self.DEFAULT_SENTIMENT
        normalized = sentiment.strip().lower()
        return normalized if normalized in self.VALID_SENTIMENTS else self.DEFAULT_SENTIMENT


class EntityExtractor:
    """Extract named entities from text via the injected LLM service.

    The reply is used as a dict with an ``"entities"`` list of dicts
    (presumably the model's parsed JSON answer -- confirm against
    ``OllamaService.generate``). Malformed parts of the reply are skipped
    rather than raising.
    """

    PROMPT = """Extrahiere alle wichtigen Entitäten aus diesem deutschen Text.

Kategorien:
- PERSON: Namen von Personen
- ORGANIZATION: Firmen, Institutionen
- CONCEPT: Fachbegriffe, Methoden, Theorien
- LOCATION: Orte, Länder
- OTHER: Sonstiges

Text:
{text}

Antworte NUR als JSON:
{{
    "entities": [
        {{"name": "Name", "type": "CONCEPT", "description": "Kurze Beschreibung"}}
    ]
}}"""

    # Limits applied to the prompt input and to the extracted entities.
    MAX_INPUT_CHARS = 2000
    MAX_ENTITIES = 20
    MAX_NAME_CHARS = 200
    MAX_DESCRIPTION_CHARS = 500

    VALID_TYPES = frozenset({"PERSON", "ORGANIZATION", "CONCEPT", "LOCATION", "OTHER"})
    FALLBACK_TYPE = "OTHER"

    def __init__(self, llm: OllamaService):
        """Store the LLM service used to run the extraction prompt."""
        self.llm = llm

    def extract(self, text: str) -> list[Entity]:
        """Extract up to ``MAX_ENTITIES`` entities from ``text``.

        Args:
            text: Source text; only the first ``MAX_INPUT_CHARS`` characters
                are put into the prompt.

        Returns:
            A list of ``Entity`` objects in reply order. Empty when the LLM
            returned nothing usable.
        """
        result = self.llm.generate(self.PROMPT.format(text=text[: self.MAX_INPUT_CHARS]))
        if not result or not isinstance(result, dict):
            return []

        raw_entities = result.get("entities")
        if isinstance(raw_entities, dict):
            # A single entity object instead of a list: treat as one item.
            raw_entities = [raw_entities]
        if not isinstance(raw_entities, list):
            # Covers JSON null and scalars, which previously raised on iteration.
            return []

        entities = []
        for raw in raw_entities:
            if not isinstance(raw, dict):
                # Bare strings/numbers in the list have no fields to read.
                continue

            name = self._first_if_list(raw.get("name"))
            etype = self._first_if_list(raw.get("type"))
            desc = self._first_if_list(raw.get("description"))

            if not isinstance(name, str) or not etype:
                continue
            name = name.strip()
            if not name:
                continue

            entities.append(
                Entity(
                    name=name[: self.MAX_NAME_CHARS],
                    entity_type=self._validate_type(str(etype)),
                    description=str(desc)[: self.MAX_DESCRIPTION_CHARS] if desc else None,
                )
            )
            if len(entities) >= self.MAX_ENTITIES:
                break
        return entities

    @staticmethod
    def _first_if_list(value):
        """Return the first element of a list (``None`` if empty), else ``value``."""
        if isinstance(value, list):
            return value[0] if value else None
        return value

    def _validate_type(self, entity_type: str) -> str:
        """Return the upper-cased entity type, or ``"OTHER"`` if not recognized."""
        normalized = entity_type.strip().upper()
        return normalized if normalized in self.VALID_TYPES else self.FALLBACK_TYPE


class RelationExtractor:
    """Extract relations between already-known entities via the LLM service.

    The reply is used as a dict with a ``"relations"`` list of dicts
    (presumably the model's parsed JSON answer -- confirm against
    ``OllamaService.generate``). Malformed parts of the reply are skipped
    rather than raising.
    """

    PROMPT = """Finde Beziehungen zwischen diesen Entitäten im Text.

Entitäten: {entities}

Beziehungstypen:
- RELATED_TO: steht in Beziehung zu
- PART_OF: ist Teil von
- DEVELOPED_BY: wurde entwickelt von
- USED_IN: wird verwendet in
- INFLUENCED_BY: wurde beeinflusst von

Text:
{text}

Antworte NUR als JSON:
{{
    "relations": [
        {{"source": "Entity1", "relation": "RELATED_TO", "target": "Entity2", "strength": 0.8}}
    ]
}}"""

    # Limits applied to the prompt input and to the extracted relations.
    MIN_ENTITIES = 2
    MAX_ENTITIES_IN_PROMPT = 15
    MAX_INPUT_CHARS = 1500
    MAX_RELATIONS = 10
    MAX_NAME_CHARS = 200
    MAX_TYPE_CHARS = 50

    DEFAULT_STRENGTH = 0.5

    def __init__(self, llm: OllamaService):
        """Store the LLM service used to run the relation prompt."""
        self.llm = llm

    def extract(self, text: str, entities: list[Entity]) -> list[Relation]:
        """Extract up to ``MAX_RELATIONS`` relations between ``entities``.

        Args:
            text: Source text; only the first ``MAX_INPUT_CHARS`` characters
                are put into the prompt.
            entities: Known entities; the names of the first
                ``MAX_ENTITIES_IN_PROMPT`` are listed in the prompt.

        Returns:
            A list of ``Relation`` objects in reply order. Empty when fewer
            than two entities are given or the LLM returned nothing usable.
        """
        if len(entities) < self.MIN_ENTITIES:
            return []

        entity_names = ", ".join(e.name for e in entities[: self.MAX_ENTITIES_IN_PROMPT])
        result = self.llm.generate(
            self.PROMPT.format(entities=entity_names, text=text[: self.MAX_INPUT_CHARS])
        )
        if not result or not isinstance(result, dict):
            return []

        raw_relations = result.get("relations")
        if isinstance(raw_relations, dict):
            # A single relation object instead of a list: treat as one item.
            raw_relations = [raw_relations]
        if not isinstance(raw_relations, list):
            # Covers JSON null and scalars, which previously raised on iteration.
            return []

        relations = []
        for raw in raw_relations:
            if not isinstance(raw, dict):
                # Bare strings/numbers in the list have no fields to read.
                continue

            source = self._first_if_list(raw.get("source"))
            target = self._first_if_list(raw.get("target"))
            rel_type = self._first_if_list(raw.get("relation"))

            if not (isinstance(source, str) and isinstance(target, str)):
                continue
            if not (source and target and rel_type):
                continue

            # NOTE(review): the relation type is stored as returned; it is not
            # checked against the types listed in PROMPT.
            relations.append(
                Relation(
                    source=source[: self.MAX_NAME_CHARS],
                    relation_type=str(rel_type)[: self.MAX_TYPE_CHARS],
                    target=target[: self.MAX_NAME_CHARS],
                    strength=self._parse_strength(raw.get("strength", self.DEFAULT_STRENGTH)),
                )
            )
            if len(relations) >= self.MAX_RELATIONS:
                break
        return relations

    @staticmethod
    def _first_if_list(value):
        """Return the first element of a list (``None`` if empty), else ``value``."""
        if isinstance(value, list):
            return value[0] if value else None
        return value

    def _parse_strength(self, strength) -> float:
        """Convert a model-supplied strength into a float clamped to [0.0, 1.0].

        Lists and values ``float()`` cannot convert yield ``DEFAULT_STRENGTH``.
        """
        if isinstance(strength, list):
            return self.DEFAULT_STRENGTH
        try:
            strength_val = float(strength)
        except (ValueError, TypeError):
            return self.DEFAULT_STRENGTH
        # Keep this min/max nesting: with it a NaN input ends up as 0.0.
        return min(1.0, max(0.0, strength_val))


class TaxonomyClassifier:
    """Classify chunk text into the fixed taxonomy in ``CATEGORIES``.

    The reply is used as a dict with ``"categories"`` and ``"confidence"``
    keys (presumably the model's parsed JSON answer -- confirm against
    ``OllamaService.generate``). Malformed values fall back to defaults
    rather than raising.
    """

    CATEGORIES = [
        "Methoden",
        "Theorie",
        "Praxis",
        "Kommunikation",
        "Organisation",
        "Entwicklung",
        "Coaching",
        "Therapie",
    ]

    PROMPT = """Klassifiziere diesen Text in passende Kategorien.

Kategorien: {categories}

Text:
{text}

Antworte NUR als JSON:
{{
    "categories": ["Kategorie1", "Kategorie2"],
    "confidence": 0.8
}}"""

    # Limits applied to the prompt input and to the returned categories.
    MAX_INPUT_CHARS = 1500
    MAX_CATEGORIES = 3

    DEFAULT_CONFIDENCE = 0.5

    def __init__(self, llm: OllamaService):
        """Store the LLM service used to run the classification prompt."""
        self.llm = llm

    def classify(self, text: str) -> tuple[list[str], float]:
        """Classify ``text`` into at most ``MAX_CATEGORIES`` known categories.

        Args:
            text: Source text; only the first ``MAX_INPUT_CHARS`` characters
                are put into the prompt.

        Returns:
            ``(categories, confidence)``. ``categories`` holds the distinct
            reply entries that appear in ``CATEGORIES``, in reply order.
            ``confidence`` is clamped to [0.0, 1.0]. ``([], 0.0)`` is returned
            when the LLM returned nothing usable.
        """
        prompt = self.PROMPT.format(
            categories=", ".join(self.CATEGORIES),
            text=text[: self.MAX_INPUT_CHARS],
        )
        result = self.llm.generate(prompt)
        if not result or not isinstance(result, dict):
            return [], 0.0

        raw_categories = result.get("categories")
        if isinstance(raw_categories, str):
            # A single category given as a bare string instead of a list.
            raw_categories = [raw_categories]
        elif not isinstance(raw_categories, list):
            # Covers JSON null and other scalars, which previously raised.
            raw_categories = []

        categories = []
        for candidate in raw_categories:
            if not isinstance(candidate, str):
                continue
            candidate = candidate.strip()
            # Keep only known categories and drop repeats from the model.
            if candidate in self.CATEGORIES and candidate not in categories:
                categories.append(candidate)

        try:
            confidence = float(result.get("confidence", self.DEFAULT_CONFIDENCE))
        except (TypeError, ValueError):
            # null, lists, or non-numeric strings fall back to the default.
            confidence = self.DEFAULT_CONFIDENCE

        return categories[: self.MAX_CATEGORIES], min(1.0, max(0.0, confidence))
← Übersicht Graph