{
"event": "PreToolUse",
"tool_name": "Write",
"tool_input": {
"file_path": "\/var\/www\/scripts\/pipeline\/knowledge\/taxonomy_extractor.py",
"content": "\"\"\"Taxonomie-Extraktion für Wissensextraktion.\"\"\"\nimport re\n\nimport sys\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\n\nfrom db import db\n\nfrom .models import KnowledgeLevel, KnowledgeType\nfrom .llm_service import LLMService\n\n\nclass TaxonomyExtractor:\n \"\"\"Extrahiert Taxonomie (hierarchische Einordnung) für Entitäten.\"\"\"\n\n def __init__(self, llm_service: LLMService, store_knowledge_fn):\n \"\"\"Initialisiere mit LLM-Service und Storage-Funktion.\"\"\"\n self.llm = llm_service\n self.store_knowledge = store_knowledge_fn\n\n def extract_taxonomy(\n self, entities: list[dict], text: str, level: KnowledgeLevel, source_id: int\n ) -> list[dict]:\n \"\"\"\n Extrahiere Taxonomie (hierarchische Einordnung) für Entitäten.\n\n Args:\n entities: Liste der Entitäten\n text: Ursprungstext\n level: Ebene\n source_id: Quell-ID\n\n Returns:\n Liste von Taxonomie-Zuordnungen\n \"\"\"\n if not entities:\n return []\n\n # Hole existierende Taxonomie-Terme\n cursor = db.execute(\"SELECT id, name, path, depth FROM taxonomy_terms ORDER BY depth, name\")\n existing_terms = cursor.fetchall()\n cursor.close()\n\n term_names = [t[\"name\"] for t in existing_terms]\n entity_names = [e[\"name\"] for e in entities[:15]]\n\n prompt = f\"\"\"Ordne die folgenden Entitäten in eine hierarchische Taxonomie ein.\n\nEntitäten: {\", \".join(entity_names)}\n\nExistierende Taxonomie-Kategorien: {\", \".join(term_names) if term_names else \"Keine vorhanden\"}\n\nAufgabe:\n1. Ordne jede Entität einer passenden Kategorie zu\n2. Wenn keine passende Kategorie existiert, schlage eine neue vor\n3. Gib die hierarchische Einordnung an\n\nAntworte NUR als JSON:\n{{\"mappings\": [\n {{\"entity\": \"...\", \"category\": \"...\", \"parent_category\": null, \"confidence\": 0.0-1.0, \"is_new_category\": false}}\n]}}\n\nText-Kontext:\n{text[:2000]}\"\"\"\n\n result = self.llm.call_llm(prompt)\n data = self.llm.parse_json(result)\n mappings = data.get(\"mappings\", [])\n\n # Speichere Taxonomie-Zuordnungen\n stored = []\n for mapping in mappings:\n entity_match = next(\n (e for e in entities if e[\"name\"].lower() == mapping.get(\"entity\", \"\").lower()), None\n )\n if entity_match:\n stored_mapping = self._store_taxonomy_mapping(\n entity_id=entity_match[\"id\"],\n category_name=mapping.get(\"category\", \"\"),\n parent_category=mapping.get(\"parent_category\"),\n confidence=mapping.get(\"confidence\", 0.8),\n is_new=mapping.get(\"is_new_category\", False),\n existing_terms=existing_terms,\n level=level,\n source_id=source_id,\n )\n if stored_mapping:\n stored.append(stored_mapping)\n\n # Speichere in Knowledge-Tabelle\n self.store_knowledge(\n level,\n source_id,\n KnowledgeType.TAXONOMY,\n {\"mappings\": len(stored), \"categories\": list({m[\"category\"] for m in stored})},\n )\n\n return stored\n\n def _store_taxonomy_mapping(\n self,\n entity_id: int,\n category_name: str,\n parent_category: str | None,\n confidence: float,\n is_new: bool,\n existing_terms: list,\n level: KnowledgeLevel,\n source_id: int,\n ) -> dict | None:\n \"\"\"Speichere Taxonomie-Zuordnung.\"\"\"\n try:\n # Finde oder erstelle Taxonomie-Term\n term = next((t for t in existing_terms if t[\"name\"].lower() == category_name.lower()), None)\n\n if term:\n term_id = term[\"id\"]\n elif is_new:\n # Neuen Term anlegen\n parent_id = None\n depth = 0\n path = f\"\/{category_name}\"\n\n if parent_category:\n parent_term = next(\n (t for t in existing_terms if t[\"name\"].lower() == parent_category.lower()), None\n )\n if parent_term:\n parent_id = parent_term[\"id\"]\n depth = parent_term[\"depth\"] + 1\n path = f\"{parent_term['path']}\/{category_name}\"\n\n # Erstelle Slug\n slug = re.sub(r\"[^a-z0-9]+\", \"-\", category_name.lower()).strip(\"-\")\n\n cursor = db.execute(\n \"\"\"INSERT INTO taxonomy_terms (name, slug, parent_id, depth, path, created_at)\n VALUES (%s, %s, %s, %s, %s, NOW())\"\"\",\n (category_name, slug, parent_id, depth, path),\n )\n db.commit()\n term_id = cursor.lastrowid\n cursor.close()\n db.log(\"INFO\", f\"Neuer Taxonomie-Term: '{category_name}' (ID: {term_id})\")\n else:\n return None\n\n model_name = f\"{self.llm.model.provider}:{self.llm.model.model_name}\"\n\n # Speichere Zuordnung\n cursor = db.execute(\n \"\"\"INSERT INTO entity_taxonomy_mapping\n (entity_id, taxonomy_term_id, confidence, source_type, source_id, model_used, created_at)\n VALUES (%s, %s, %s, %s, %s, %s, NOW())\n ON DUPLICATE KEY UPDATE confidence = VALUES(confidence)\"\"\",\n (entity_id, term_id, confidence, level.value, source_id, model_name),\n )\n db.commit()\n cursor.close()\n\n return {\"entity_id\": entity_id, \"term_id\": term_id, \"category\": category_name, \"confidence\": confidence}\n\n except Exception as e:\n db.log(\"ERROR\", f\"Fehler beim Speichern der Taxonomie: {e}\")\n return None\n"
}
}
{
"tool_response": {
"type": "create",
"filePath": "\/var\/www\/scripts\/pipeline\/knowledge\/taxonomy_extractor.py",
"content": "\"\"\"Taxonomie-Extraktion für Wissensextraktion.\"\"\"\nimport re\n\nimport sys\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\n\nfrom db import db\n\nfrom .models import KnowledgeLevel, KnowledgeType\nfrom .llm_service import LLMService\n\n\nclass TaxonomyExtractor:\n \"\"\"Extrahiert Taxonomie (hierarchische Einordnung) für Entitäten.\"\"\"\n\n def __init__(self, llm_service: LLMService, store_knowledge_fn):\n \"\"\"Initialisiere mit LLM-Service und Storage-Funktion.\"\"\"\n self.llm = llm_service\n self.store_knowledge = store_knowledge_fn\n\n def extract_taxonomy(\n self, entities: list[dict], text: str, level: KnowledgeLevel, source_id: int\n ) -> list[dict]:\n \"\"\"\n Extrahiere Taxonomie (hierarchische Einordnung) für Entitäten.\n\n Args:\n entities: Liste der Entitäten\n text: Ursprungstext\n level: Ebene\n source_id: Quell-ID\n\n Returns:\n Liste von Taxonomie-Zuordnungen\n \"\"\"\n if not entities:\n return []\n\n # Hole existierende Taxonomie-Terme\n cursor = db.execute(\"SELECT id, name, path, depth FROM taxonomy_terms ORDER BY depth, name\")\n existing_terms = cursor.fetchall()\n cursor.close()\n\n term_names = [t[\"name\"] for t in existing_terms]\n entity_names = [e[\"name\"] for e in entities[:15]]\n\n prompt = f\"\"\"Ordne die folgenden Entitäten in eine hierarchische Taxonomie ein.\n\nEntitäten: {\", \".join(entity_names)}\n\nExistierende Taxonomie-Kategorien: {\", \".join(term_names) if term_names else \"Keine vorhanden\"}\n\nAufgabe:\n1. Ordne jede Entität einer passenden Kategorie zu\n2. Wenn keine passende Kategorie existiert, schlage eine neue vor\n3. Gib die hierarchische Einordnung an\n\nAntworte NUR als JSON:\n{{\"mappings\": [\n {{\"entity\": \"...\", \"category\": \"...\", \"parent_category\": null, \"confidence\": 0.0-1.0, \"is_new_category\": false}}\n]}}\n\nText-Kontext:\n{text[:2000]}\"\"\"\n\n result = self.llm.call_llm(prompt)\n data = self.llm.parse_json(result)\n mappings = data.get(\"mappings\", [])\n\n # Speichere Taxonomie-Zuordnungen\n stored = []\n for mapping in mappings:\n entity_match = next(\n (e for e in entities if e[\"name\"].lower() == mapping.get(\"entity\", \"\").lower()), None\n )\n if entity_match:\n stored_mapping = self._store_taxonomy_mapping(\n entity_id=entity_match[\"id\"],\n category_name=mapping.get(\"category\", \"\"),\n parent_category=mapping.get(\"parent_category\"),\n confidence=mapping.get(\"confidence\", 0.8),\n is_new=mapping.get(\"is_new_category\", False),\n existing_terms=existing_terms,\n level=level,\n source_id=source_id,\n )\n if stored_mapping:\n stored.append(stored_mapping)\n\n # Speichere in Knowledge-Tabelle\n self.store_knowledge(\n level,\n source_id,\n KnowledgeType.TAXONOMY,\n {\"mappings\": len(stored), \"categories\": list({m[\"category\"] for m in stored})},\n )\n\n return stored\n\n def _store_taxonomy_mapping(\n self,\n entity_id: int,\n category_name: str,\n parent_category: str | None,\n confidence: float,\n is_new: bool,\n existing_terms: list,\n level: KnowledgeLevel,\n source_id: int,\n ) -> dict | None:\n \"\"\"Speichere Taxonomie-Zuordnung.\"\"\"\n try:\n # Finde oder erstelle Taxonomie-Term\n term = next((t for t in existing_terms if t[\"name\"].lower() == category_name.lower()), None)\n\n if term:\n term_id = term[\"id\"]\n elif is_new:\n # Neuen Term anlegen\n parent_id = None\n depth = 0\n path = f\"\/{category_name}\"\n\n if parent_category:\n parent_term = next(\n (t for t in existing_terms if t[\"name\"].lower() == parent_category.lower()), None\n )\n if parent_term:\n parent_id = parent_term[\"id\"]\n depth = parent_term[\"depth\"] + 1\n path = f\"{parent_term['path']}\/{category_name}\"\n\n # Erstelle Slug\n slug = re.sub(r\"[^a-z0-9]+\", \"-\", category_name.lower()).strip(\"-\")\n\n cursor = db.execute(\n \"\"\"INSERT INTO taxonomy_terms (name, slug, parent_id, depth, path, created_at)\n VALUES (%s, %s, %s, %s, %s, NOW())\"\"\",\n (category_name, slug, parent_id, depth, path),\n )\n db.commit()\n term_id = cursor.lastrowid\n cursor.close()\n db.log(\"INFO\", f\"Neuer Taxonomie-Term: '{category_name}' (ID: {term_id})\")\n else:\n return None\n\n model_name = f\"{self.llm.model.provider}:{self.llm.model.model_name}\"\n\n # Speichere Zuordnung\n cursor = db.execute(\n \"\"\"INSERT INTO entity_taxonomy_mapping\n (entity_id, taxonomy_term_id, confidence, source_type, source_id, model_used, created_at)\n VALUES (%s, %s, %s, %s, %s, %s, NOW())\n ON DUPLICATE KEY UPDATE confidence = VALUES(confidence)\"\"\",\n (entity_id, term_id, confidence, level.value, source_id, model_name),\n )\n db.commit()\n cursor.close()\n\n return {\"entity_id\": entity_id, \"term_id\": term_id, \"category\": category_name, \"confidence\": confidence}\n\n except Exception as e:\n db.log(\"ERROR\", f\"Fehler beim Speichern der Taxonomie: {e}\")\n return None\n",
"structuredPatch": [],
"originalFile": null
}
}