{
"event": "PreToolUse",
"tool_name": "Write",
"tool_input": {
"file_path": "\/var\/www\/scripts\/pipeline\/semantic_chunk\/analyzers.py",
"content": "\"\"\"\nAnalyzer classes for semantic chunk analysis.\n\"\"\"\n\nfrom .models import ChunkSemantics, Entity, Relation\nfrom .ollama_service import OllamaService\n\n\nclass SemanticsAnalyzer:\n \"\"\"Analysiert Chunk-Semantik: Summary, Keywords, Sentiment.\"\"\"\n\n PROMPT = \"\"\"Analysiere diesen deutschen Text und erstelle eine semantische Analyse.\n\nText:\n{text}\n\nAntworte NUR als JSON:\n{{\n \"summary\": \"Zusammenfassung in 1-2 Sätzen\",\n \"keywords\": [\"keyword1\", \"keyword2\", \"keyword3\"],\n \"sentiment\": \"positive|neutral|negative|mixed\",\n \"topics\": [\"thema1\", \"thema2\"],\n \"language\": \"de|en\"\n}}\"\"\"\n\n def __init__(self, llm: OllamaService):\n self.llm = llm\n\n def analyze(self, chunk_id: int, text: str) -> ChunkSemantics | None:\n \"\"\"Analysiere einen Chunk.\"\"\"\n result = self.llm.generate(self.PROMPT.format(text=text[:2000]))\n if not result:\n return None\n\n summary = result.get(\"summary\", \"\")\n if isinstance(summary, list):\n summary = summary[0] if summary else \"\"\n\n keywords = result.get(\"keywords\", [])\n if not isinstance(keywords, list):\n keywords = [str(keywords)] if keywords else []\n keywords = [str(k) for k in keywords if k and not isinstance(k, (list, dict))][:10]\n\n topics = result.get(\"topics\", [])\n if not isinstance(topics, list):\n topics = [str(topics)] if topics else []\n topics = [str(t) for t in topics if t and not isinstance(t, (list, dict))][:5]\n\n language = result.get(\"language\", \"de\")\n if isinstance(language, list):\n language = language[0] if language else \"de\"\n\n return ChunkSemantics(\n chunk_id=chunk_id,\n summary=str(summary)[:1000],\n keywords=keywords,\n sentiment=self._validate_sentiment(result.get(\"sentiment\", \"neutral\")),\n topics=topics,\n language=str(language)[:5],\n )\n\n def _validate_sentiment(self, sentiment) -> str:\n \"\"\"Validiere Sentiment-Wert.\"\"\"\n if isinstance(sentiment, list):\n sentiment = sentiment[0] if sentiment else \"neutral\"\n if not isinstance(sentiment, str):\n return \"neutral\"\n valid = {\"positive\", \"neutral\", \"negative\", \"mixed\"}\n return sentiment.lower() if sentiment.lower() in valid else \"neutral\"\n\n\nclass EntityExtractor:\n \"\"\"Extrahiert Entitäten aus Text.\"\"\"\n\n PROMPT = \"\"\"Extrahiere alle wichtigen Entitäten aus diesem deutschen Text.\n\nKategorien:\n- PERSON: Namen von Personen\n- ORGANIZATION: Firmen, Institutionen\n- CONCEPT: Fachbegriffe, Methoden, Theorien\n- LOCATION: Orte, Länder\n- OTHER: Sonstiges\n\nText:\n{text}\n\nAntworte NUR als JSON:\n{{\n \"entities\": [\n {{\"name\": \"Name\", \"type\": \"CONCEPT\", \"description\": \"Kurze Beschreibung\"}}\n ]\n}}\"\"\"\n\n def __init__(self, llm: OllamaService):\n self.llm = llm\n\n def extract(self, text: str) -> list[Entity]:\n \"\"\"Extrahiere Entitäten aus Text.\"\"\"\n result = self.llm.generate(self.PROMPT.format(text=text[:2000]))\n if not result:\n return []\n\n entities = []\n for e in result.get(\"entities\", []):\n name = e.get(\"name\")\n etype = e.get(\"type\")\n desc = e.get(\"description\")\n\n if isinstance(name, list):\n name = name[0] if name else None\n if isinstance(etype, list):\n etype = etype[0] if etype else None\n if isinstance(desc, list):\n desc = desc[0] if desc else None\n\n if name and isinstance(name, str) and etype:\n entities.append(\n Entity(\n name=str(name)[:200],\n entity_type=self._validate_type(str(etype)),\n description=str(desc)[:500] if desc else None,\n )\n )\n return entities[:20]\n\n def _validate_type(self, entity_type: str) -> str:\n \"\"\"Validiere Entity-Typ.\"\"\"\n valid = {\"PERSON\", \"ORGANIZATION\", \"CONCEPT\", \"LOCATION\", \"OTHER\"}\n return entity_type.upper() if entity_type.upper() in valid else \"OTHER\"\n\n\nclass RelationExtractor:\n \"\"\"Extrahiert Beziehungen zwischen Entitäten.\"\"\"\n\n PROMPT = \"\"\"Finde Beziehungen zwischen diesen Entitäten im Text.\n\nEntitäten: {entities}\n\nBeziehungstypen:\n- RELATED_TO: steht in Beziehung zu\n- PART_OF: ist Teil von\n- DEVELOPED_BY: wurde entwickelt von\n- USED_IN: wird verwendet in\n- INFLUENCED_BY: wurde beeinflusst von\n\nText:\n{text}\n\nAntworte NUR als JSON:\n{{\n \"relations\": [\n {{\"source\": \"Entity1\", \"relation\": \"RELATED_TO\", \"target\": \"Entity2\", \"strength\": 0.8}}\n ]\n}}\"\"\"\n\n def __init__(self, llm: OllamaService):\n self.llm = llm\n\n def extract(self, text: str, entities: list[Entity]) -> list[Relation]:\n \"\"\"Extrahiere Relationen zwischen Entitäten.\"\"\"\n if len(entities) < 2:\n return []\n\n entity_names = \", \".join([e.name for e in entities[:15]])\n result = self.llm.generate(self.PROMPT.format(entities=entity_names, text=text[:1500]))\n\n if not result:\n return []\n\n relations = []\n for r in result.get(\"relations\", []):\n source = r.get(\"source\")\n target = r.get(\"target\")\n rel_type = r.get(\"relation\")\n strength = r.get(\"strength\", 0.5)\n\n if isinstance(source, list):\n source = source[0] if source else None\n if isinstance(target, list):\n target = target[0] if target else None\n if isinstance(rel_type, list):\n rel_type = rel_type[0] if rel_type else None\n\n if source and target and rel_type and isinstance(source, str) and isinstance(target, str):\n try:\n strength_val = float(strength) if not isinstance(strength, list) else 0.5\n except (ValueError, TypeError):\n strength_val = 0.5\n relations.append(\n Relation(\n source=str(source)[:200],\n relation_type=str(rel_type)[:50],\n target=str(target)[:200],\n strength=min(1.0, max(0.0, strength_val)),\n )\n )\n return relations[:10]\n\n\nclass TaxonomyClassifier:\n \"\"\"Klassifiziert Chunks in Taxonomie-Kategorien.\"\"\"\n\n CATEGORIES = [\n \"Methoden\",\n \"Theorie\",\n \"Praxis\",\n \"Kommunikation\",\n \"Organisation\",\n \"Entwicklung\",\n \"Coaching\",\n \"Therapie\",\n ]\n\n PROMPT = \"\"\"Klassifiziere diesen Text in passende Kategorien.\n\nKategorien: {categories}\n\nText:\n{text}\n\nAntworte NUR als JSON:\n{{\n \"categories\": [\"Kategorie1\", \"Kategorie2\"],\n \"confidence\": 0.8\n}}\"\"\"\n\n def __init__(self, llm: OllamaService):\n self.llm = llm\n\n def classify(self, text: str) -> tuple[list[str], float]:\n \"\"\"Klassifiziere Text in Kategorien.\"\"\"\n result = self.llm.generate(self.PROMPT.format(categories=\", \".join(self.CATEGORIES), text=text[:1500]))\n\n if not result:\n return [], 0.0\n\n categories = [c for c in result.get(\"categories\", []) if c in self.CATEGORIES]\n confidence = min(1.0, max(0.0, float(result.get(\"confidence\", 0.5))))\n return categories[:3], confidence\n"
}
}
{
"tool_response": {
"type": "create",
"filePath": "\/var\/www\/scripts\/pipeline\/semantic_chunk\/analyzers.py",
"content": "\"\"\"\nAnalyzer classes for semantic chunk analysis.\n\"\"\"\n\nfrom .models import ChunkSemantics, Entity, Relation\nfrom .ollama_service import OllamaService\n\n\nclass SemanticsAnalyzer:\n \"\"\"Analysiert Chunk-Semantik: Summary, Keywords, Sentiment.\"\"\"\n\n PROMPT = \"\"\"Analysiere diesen deutschen Text und erstelle eine semantische Analyse.\n\nText:\n{text}\n\nAntworte NUR als JSON:\n{{\n \"summary\": \"Zusammenfassung in 1-2 Sätzen\",\n \"keywords\": [\"keyword1\", \"keyword2\", \"keyword3\"],\n \"sentiment\": \"positive|neutral|negative|mixed\",\n \"topics\": [\"thema1\", \"thema2\"],\n \"language\": \"de|en\"\n}}\"\"\"\n\n def __init__(self, llm: OllamaService):\n self.llm = llm\n\n def analyze(self, chunk_id: int, text: str) -> ChunkSemantics | None:\n \"\"\"Analysiere einen Chunk.\"\"\"\n result = self.llm.generate(self.PROMPT.format(text=text[:2000]))\n if not result:\n return None\n\n summary = result.get(\"summary\", \"\")\n if isinstance(summary, list):\n summary = summary[0] if summary else \"\"\n\n keywords = result.get(\"keywords\", [])\n if not isinstance(keywords, list):\n keywords = [str(keywords)] if keywords else []\n keywords = [str(k) for k in keywords if k and not isinstance(k, (list, dict))][:10]\n\n topics = result.get(\"topics\", [])\n if not isinstance(topics, list):\n topics = [str(topics)] if topics else []\n topics = [str(t) for t in topics if t and not isinstance(t, (list, dict))][:5]\n\n language = result.get(\"language\", \"de\")\n if isinstance(language, list):\n language = language[0] if language else \"de\"\n\n return ChunkSemantics(\n chunk_id=chunk_id,\n summary=str(summary)[:1000],\n keywords=keywords,\n sentiment=self._validate_sentiment(result.get(\"sentiment\", \"neutral\")),\n topics=topics,\n language=str(language)[:5],\n )\n\n def _validate_sentiment(self, sentiment) -> str:\n \"\"\"Validiere Sentiment-Wert.\"\"\"\n if isinstance(sentiment, list):\n sentiment = sentiment[0] if sentiment else \"neutral\"\n if not isinstance(sentiment, str):\n return \"neutral\"\n valid = {\"positive\", \"neutral\", \"negative\", \"mixed\"}\n return sentiment.lower() if sentiment.lower() in valid else \"neutral\"\n\n\nclass EntityExtractor:\n \"\"\"Extrahiert Entitäten aus Text.\"\"\"\n\n PROMPT = \"\"\"Extrahiere alle wichtigen Entitäten aus diesem deutschen Text.\n\nKategorien:\n- PERSON: Namen von Personen\n- ORGANIZATION: Firmen, Institutionen\n- CONCEPT: Fachbegriffe, Methoden, Theorien\n- LOCATION: Orte, Länder\n- OTHER: Sonstiges\n\nText:\n{text}\n\nAntworte NUR als JSON:\n{{\n \"entities\": [\n {{\"name\": \"Name\", \"type\": \"CONCEPT\", \"description\": \"Kurze Beschreibung\"}}\n ]\n}}\"\"\"\n\n def __init__(self, llm: OllamaService):\n self.llm = llm\n\n def extract(self, text: str) -> list[Entity]:\n \"\"\"Extrahiere Entitäten aus Text.\"\"\"\n result = self.llm.generate(self.PROMPT.format(text=text[:2000]))\n if not result:\n return []\n\n entities = []\n for e in result.get(\"entities\", []):\n name = e.get(\"name\")\n etype = e.get(\"type\")\n desc = e.get(\"description\")\n\n if isinstance(name, list):\n name = name[0] if name else None\n if isinstance(etype, list):\n etype = etype[0] if etype else None\n if isinstance(desc, list):\n desc = desc[0] if desc else None\n\n if name and isinstance(name, str) and etype:\n entities.append(\n Entity(\n name=str(name)[:200],\n entity_type=self._validate_type(str(etype)),\n description=str(desc)[:500] if desc else None,\n )\n )\n return entities[:20]\n\n def _validate_type(self, entity_type: str) -> str:\n \"\"\"Validiere Entity-Typ.\"\"\"\n valid = {\"PERSON\", \"ORGANIZATION\", \"CONCEPT\", \"LOCATION\", \"OTHER\"}\n return entity_type.upper() if entity_type.upper() in valid else \"OTHER\"\n\n\nclass RelationExtractor:\n \"\"\"Extrahiert Beziehungen zwischen Entitäten.\"\"\"\n\n PROMPT = \"\"\"Finde Beziehungen zwischen diesen Entitäten im Text.\n\nEntitäten: {entities}\n\nBeziehungstypen:\n- RELATED_TO: steht in Beziehung zu\n- PART_OF: ist Teil von\n- DEVELOPED_BY: wurde entwickelt von\n- USED_IN: wird verwendet in\n- INFLUENCED_BY: wurde beeinflusst von\n\nText:\n{text}\n\nAntworte NUR als JSON:\n{{\n \"relations\": [\n {{\"source\": \"Entity1\", \"relation\": \"RELATED_TO\", \"target\": \"Entity2\", \"strength\": 0.8}}\n ]\n}}\"\"\"\n\n def __init__(self, llm: OllamaService):\n self.llm = llm\n\n def extract(self, text: str, entities: list[Entity]) -> list[Relation]:\n \"\"\"Extrahiere Relationen zwischen Entitäten.\"\"\"\n if len(entities) < 2:\n return []\n\n entity_names = \", \".join([e.name for e in entities[:15]])\n result = self.llm.generate(self.PROMPT.format(entities=entity_names, text=text[:1500]))\n\n if not result:\n return []\n\n relations = []\n for r in result.get(\"relations\", []):\n source = r.get(\"source\")\n target = r.get(\"target\")\n rel_type = r.get(\"relation\")\n strength = r.get(\"strength\", 0.5)\n\n if isinstance(source, list):\n source = source[0] if source else None\n if isinstance(target, list):\n target = target[0] if target else None\n if isinstance(rel_type, list):\n rel_type = rel_type[0] if rel_type else None\n\n if source and target and rel_type and isinstance(source, str) and isinstance(target, str):\n try:\n strength_val = float(strength) if not isinstance(strength, list) else 0.5\n except (ValueError, TypeError):\n strength_val = 0.5\n relations.append(\n Relation(\n source=str(source)[:200],\n relation_type=str(rel_type)[:50],\n target=str(target)[:200],\n strength=min(1.0, max(0.0, strength_val)),\n )\n )\n return relations[:10]\n\n\nclass TaxonomyClassifier:\n \"\"\"Klassifiziert Chunks in Taxonomie-Kategorien.\"\"\"\n\n CATEGORIES = [\n \"Methoden\",\n \"Theorie\",\n \"Praxis\",\n \"Kommunikation\",\n \"Organisation\",\n \"Entwicklung\",\n \"Coaching\",\n \"Therapie\",\n ]\n\n PROMPT = \"\"\"Klassifiziere diesen Text in passende Kategorien.\n\nKategorien: {categories}\n\nText:\n{text}\n\nAntworte NUR als JSON:\n{{\n \"categories\": [\"Kategorie1\", \"Kategorie2\"],\n \"confidence\": 0.8\n}}\"\"\"\n\n def __init__(self, llm: OllamaService):\n self.llm = llm\n\n def classify(self, text: str) -> tuple[list[str], float]:\n \"\"\"Klassifiziere Text in Kategorien.\"\"\"\n result = self.llm.generate(self.PROMPT.format(categories=\", \".join(self.CATEGORIES), text=text[:1500]))\n\n if not result:\n return [], 0.0\n\n categories = [c for c in result.get(\"categories\", []) if c in self.CATEGORIES]\n confidence = min(1.0, max(0.0, float(result.get(\"confidence\", 0.5))))\n return categories[:3], confidence\n",
"structuredPatch": [],
"originalFile": null
}
}