{
"tool_response": {
"type": "text",
"file": {
"filePath": "\/var\/www\/scripts\/pipeline\/semantic_chunk_analyzer.py",
"content": "#!\/usr\/bin\/env python3\n\"\"\"\nSemantic Chunk Analyzer - Chunk-Level Semantic Analysis Pipeline\n\nAnalysiert Chunks aus der ki_system.chunks Tabelle und befüllt:\n- chunk_semantics (summary, keywords, sentiment, topics, language)\n- entities (extrahierte Entitäten)\n- chunk_entities (Verknüpfung Chunk <-> Entity)\n- entity_relations (Beziehungen zwischen Entitäten)\n- taxonomy_terms + chunk_taxonomy (Kategorisierung)\n\nPrinzipien: DRY, KISS, SRP, YAGNI\n- Eine Klasse pro Verantwortlichkeit\n- Ollama für LLM-Aufrufe (lokal, kostenfrei)\n- Batch-Verarbeitung für Effizienz\n\nUsage:\n python semantic_chunk_analyzer.py analyze [--limit N]\n python semantic_chunk_analyzer.py status\n python semantic_chunk_analyzer.py reset\n\"\"\"\n\nimport json\nimport sys\nfrom dataclasses import dataclass\n\nimport requests\n\nfrom config import OLLAMA_HOST\nfrom db import db\n\n# === Configuration ===\nANALYSIS_MODEL = \"gemma3:27b-it-qat\" # Beste JSON-Compliance und Qualität\nBATCH_SIZE = 10\n\n\n# === Data Classes (SRP) ===\n@dataclass\nclass ChunkSemantics:\n \"\"\"Semantische Analyse eines Chunks.\"\"\"\n\n chunk_id: int\n summary: str\n keywords: list[str]\n sentiment: str # positive, neutral, negative, mixed\n topics: list[str]\n language: str\n\n\n@dataclass\nclass Entity:\n \"\"\"Extrahierte Entität.\"\"\"\n\n name: str\n entity_type: str # PERSON, ORGANIZATION, CONCEPT, LOCATION, OTHER\n description: str | None = None\n\n\n@dataclass\nclass Relation:\n \"\"\"Beziehung zwischen Entitäten.\"\"\"\n\n source: str\n relation_type: str\n target: str\n strength: float = 0.5\n\n\n# === LLM Service (SRP) ===\nclass OllamaService:\n \"\"\"Ollama API Wrapper - Single Responsibility: LLM Kommunikation.\"\"\"\n\n def __init__(self, host: str = OLLAMA_HOST, model: str = ANALYSIS_MODEL):\n self.host = host\n self.model = model\n\n def generate(self, prompt: str, json_format: bool = True) -> dict | None:\n \"\"\"Generiere Antwort von Ollama.\"\"\"\n try:\n payload = {\n \"model\": self.model,\n \"prompt\": prompt,\n \"stream\": False,\n \"options\": {\"temperature\": 0.3, \"num_predict\": 1000},\n }\n if json_format:\n payload[\"format\"] = \"json\"\n\n response = requests.post(f\"{self.host}\/api\/generate\", json=payload, timeout=120)\n response.raise_for_status()\n\n text = response.json().get(\"response\", \"{}\")\n if json_format:\n return self._parse_json(text)\n return {\"text\": text}\n except Exception as e:\n db.log(\"ERROR\", f\"Ollama error: {e}\")\n return None\n\n def _parse_json(self, text: str) -> dict | None:\n \"\"\"Parse JSON aus Antwort.\"\"\"\n try:\n return json.loads(text)\n except json.JSONDecodeError:\n # Versuche JSON aus Text zu extrahieren\n import re\n\n match = re.search(r\"\\{[\\s\\S]*\\}\", text)\n if match:\n try:\n return json.loads(match.group())\n except json.JSONDecodeError:\n pass\n return None\n\n\n# === Analyzer Classes (SRP) ===\nclass SemanticsAnalyzer:\n \"\"\"Analysiert Chunk-Semantik: Summary, Keywords, Sentiment.\"\"\"\n\n PROMPT = \"\"\"Analysiere diesen deutschen Text und erstelle eine semantische Analyse.\n\nText:\n{text}\n\nAntworte NUR als JSON:\n{{\n \"summary\": \"Zusammenfassung in 1-2 Sätzen\",\n \"keywords\": [\"keyword1\", \"keyword2\", \"keyword3\"],\n \"sentiment\": \"positive|neutral|negative|mixed\",\n \"topics\": [\"thema1\", \"thema2\"],\n \"language\": \"de|en\"\n}}\"\"\"\n\n def __init__(self, llm: OllamaService):\n self.llm = llm\n\n def analyze(self, chunk_id: int, text: str) -> ChunkSemantics | None:\n \"\"\"Analysiere einen Chunk.\"\"\"\n result = self.llm.generate(self.PROMPT.format(text=text[:2000]))\n if not result:\n return None\n\n # Robuste Extraktion mit Typ-Validierung\n summary = result.get(\"summary\", \"\")\n if isinstance(summary, list):\n summary = summary[0] if summary else \"\"\n\n keywords = result.get(\"keywords\", [])\n if not isinstance(keywords, list):\n keywords = [str(keywords)] if keywords else []\n keywords = [str(k) for k in keywords if k and not isinstance(k, (list, dict))][:10]\n\n topics = result.get(\"topics\", [])\n if not isinstance(topics, list):\n topics = [str(topics)] if topics else []\n topics = [str(t) for t in topics if t and not isinstance(t, (list, dict))][:5]\n\n language = result.get(\"language\", \"de\")\n if isinstance(language, list):\n language = language[0] if language else \"de\"\n\n return ChunkSemantics(\n chunk_id=chunk_id,\n summary=str(summary)[:1000],\n keywords=keywords,\n sentiment=self._validate_sentiment(result.get(\"sentiment\", \"neutral\")),\n topics=topics,\n language=str(language)[:5],\n )\n\n def _validate_sentiment(self, sentiment) -> str:\n \"\"\"Validiere Sentiment-Wert.\"\"\"\n if isinstance(sentiment, list):\n sentiment = sentiment[0] if sentiment else \"neutral\"\n if not isinstance(sentiment, str):\n return \"neutral\"\n valid = {\"positive\", \"neutral\", \"negative\", \"mixed\"}\n return sentiment.lower() if sentiment.lower() in valid else \"neutral\"\n\n\nclass EntityExtractor:\n \"\"\"Extrahiert Entitäten aus Text.\"\"\"\n\n PROMPT = \"\"\"Extrahiere alle wichtigen Entitäten aus diesem deutschen Text.\n\nKategorien:\n- PERSON: Namen von Personen\n- ORGANIZATION: Firmen, Institutionen\n- CONCEPT: Fachbegriffe, Methoden, Theorien\n- LOCATION: Orte, Länder\n- OTHER: Sonstiges\n\nText:\n{text}\n\nAntworte NUR als JSON:\n{{\n \"entities\": [\n {{\"name\": \"Name\", \"type\": \"CONCEPT\", \"description\": \"Kurze Beschreibung\"}}\n ]\n}}\"\"\"\n\n def __init__(self, llm: OllamaService):\n self.llm = llm\n\n def extract(self, text: str) -> list[Entity]:\n \"\"\"Extrahiere Entitäten aus Text.\"\"\"\n result = self.llm.generate(self.PROMPT.format(text=text[:2000]))\n if not result:\n return []\n\n entities = []\n for e in result.get(\"entities\", []):\n name = e.get(\"name\")\n etype = e.get(\"type\")\n desc = e.get(\"description\")\n\n # Validiere: name und type müssen Strings sein\n if isinstance(name, list):\n name = name[0] if name else None\n if isinstance(etype, list):\n etype = etype[0] if etype else None\n if isinstance(desc, list):\n desc = desc[0] if desc else None\n\n if name and isinstance(name, str) and etype:\n entities.append(\n Entity(\n name=str(name)[:200], # Limit length\n entity_type=self._validate_type(str(etype)),\n description=str(desc)[:500] if desc else None,\n )\n )\n return entities[:20] # Max 20 pro Chunk\n\n def _validate_type(self, entity_type: str) -> str:\n \"\"\"Validiere Entity-Typ.\"\"\"\n valid = {\"PERSON\", \"ORGANIZATION\", \"CONCEPT\", \"LOCATION\", \"OTHER\"}\n return entity_type.upper() if entity_type.upper() in valid else \"OTHER\"\n\n\nclass RelationExtractor:\n \"\"\"Extrahiert Beziehungen zwischen Entitäten.\"\"\"\n\n PROMPT = \"\"\"Finde Beziehungen zwischen diesen Entitäten im Text.\n\nEntitäten: {entities}\n\nBeziehungstypen:\n- RELATED_TO: steht in Beziehung zu\n- PART_OF: ist Teil von\n- DEVELOPED_BY: wurde entwickelt von\n- USED_IN: wird verwendet in\n- INFLUENCED_BY: wurde beeinflusst von\n\nText:\n{text}\n\nAntworte NUR als JSON:\n{{\n \"relations\": [\n {{\"source\": \"Entity1\", \"relation\": \"RELATED_TO\", \"target\": \"Entity2\", \"strength\": 0.8}}\n ]\n}}\"\"\"\n\n def __init__(self, llm: OllamaService):\n self.llm = llm\n\n def extract(self, text: str, entities: list[Entity]) -> list[Relation]:\n \"\"\"Extrahiere Relationen zwischen Entitäten.\"\"\"\n if len(entities) < 2:\n return []\n\n entity_names = \", \".join([e.name for e in entities[:15]])\n result = self.llm.generate(self.PROMPT.format(entities=entity_names, text=text[:1500]))\n\n if not result:\n return []\n\n relations = []\n for r in result.get(\"relations\", []):\n source = r.get(\"source\")\n target = r.get(\"target\")\n rel_type = r.get(\"relation\")\n strength = r.get(\"strength\", 0.5)\n\n # Validiere Typen\n if isinstance(source, list):\n source = source[0] if source else None\n if isinstance(target, list):\n target = target[0] if target else None\n if isinstance(rel_type, list):\n rel_type = rel_type[0] if rel_type else None\n\n if source and target and rel_type and isinstance(source, str) and isinstance(target, str):\n try:\n strength_val = float(strength) if not isinstance(strength, list) else 0.5\n except (ValueError, TypeError):\n strength_val = 0.5\n relations.append(\n Relation(\n source=str(source)[:200],\n relation_type=str(rel_type)[:50],\n target=str(target)[:200],\n strength=min(1.0, max(0.0, strength_val)),\n )\n )\n return relations[:10] # Max 10 pro Chunk\n\n\nclass TaxonomyClassifier:\n \"\"\"Klassifiziert Chunks in Taxonomie-Kategorien.\"\"\"\n\n # Standard-Taxonomie für systemische Inhalte\n CATEGORIES = [\n \"Methoden\",\n \"Theorie\",\n \"Praxis\",\n \"Kommunikation\",\n \"Organisation\",\n \"Entwicklung\",\n \"Coaching\",\n \"Therapie\",\n ]\n\n PROMPT = \"\"\"Klassifiziere diesen Text in passende Kategorien.\n\nKategorien: {categories}\n\nText:\n{text}\n\nAntworte NUR al... [TRUNCATED-a0b3b9e3fdc8ca42]",
"numLines": 678,
"startLine": 1,
"totalLines": 678
}
}
}