analyzer.py
- Pfad: /var/www/scripts/pipeline/knowledge/analyzer.py - Namespace: pipeline
- Zeilen: 117 | Größe: 5,245 Bytes
- Geändert: 2025-12-25 15:50:37 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 85
- Dependencies: 40 (25%)
- LOC: 100 (20%)
- Methods: 100 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 100 (10%)
Keine Issues gefunden.
Dependencies 11
- use sys
- use db.db
- use entity_extractor.EntityExtractor
- use llm_service.LLMService
- use models.DEFAULT_MODELS
- use models.KnowledgeLevel
- use models.ModelConfig
- use ontology_extractor.OntologyExtractor
- use semantic_extractor.SemanticExtractor
- use storage.KnowledgeStorage
- use taxonomy_extractor.TaxonomyExtractor
Klassen 1
- KnowledgeExtractor (class) — Zeile 18
Code
"""Hauptanalyzer für Wissensextraktion."""
import sys
sys.path.insert(0, "/var/www/scripts/pipeline")
from db import db
from .entity_extractor import EntityExtractor
from .llm_service import LLMService
from .models import DEFAULT_MODELS, KnowledgeLevel, ModelConfig
from .ontology_extractor import OntologyExtractor
from .semantic_extractor import SemanticExtractor
from .storage import KnowledgeStorage
from .taxonomy_extractor import TaxonomyExtractor
class KnowledgeExtractor:
    """
    Modular knowledge extraction with database reconciliation.

    Usage:
        extractor = KnowledgeExtractor(model_config)
        # Per page
        entities = extractor.extract_entities(text, KnowledgeLevel.PAGE, page_id)
        semantics = extractor.extract_semantics(entities, text, KnowledgeLevel.PAGE, page_id)
        ontology = extractor.extract_ontology(entities, text, KnowledgeLevel.PAGE, page_id)
        taxonomy = extractor.extract_taxonomy(entities, text, KnowledgeLevel.PAGE, page_id)
    """

    def __init__(self, model_config: ModelConfig | None = None):
        """Initialize the extractor with a model configuration.

        Args:
            model_config: LLM model configuration. Falls back to the
                default "ollama" entry of DEFAULT_MODELS when omitted.
        """
        self.model = model_config or DEFAULT_MODELS["ollama"]
        self.llm = LLMService(self.model)
        # Storage is keyed by "provider:model" so results produced by
        # different models do not collide.
        model_name = f"{self.model.provider}:{self.model.model_name}"
        self.storage = KnowledgeStorage(model_name)
        # All extractors share one LLM service and one storage backend.
        self.entity_extractor = EntityExtractor(self.llm, self.storage.store)
        self.semantic_extractor = SemanticExtractor(self.llm, self.storage.store)
        self.ontology_extractor = OntologyExtractor(self.llm, self.storage.store)
        self.taxonomy_extractor = TaxonomyExtractor(self.llm, self.storage.store)

    def extract_entities(self, text: str, level: KnowledgeLevel, source_id: int) -> list[dict]:
        """Extract entities from *text* at the given knowledge level."""
        return self.entity_extractor.extract_entities(text, level, source_id)

    def extract_semantics(self, entities: list[dict], text: str, level: KnowledgeLevel, source_id: int) -> list[dict]:
        """Extract semantics for the given entities."""
        return self.semantic_extractor.extract_semantics(entities, text, level, source_id)

    def extract_ontology(self, entities: list[dict], text: str, level: KnowledgeLevel, source_id: int) -> list[dict]:
        """Extract ontology relations between the given entities."""
        return self.ontology_extractor.extract_ontology(entities, text, level, source_id)

    def extract_taxonomy(self, entities: list[dict], text: str, level: KnowledgeLevel, source_id: int) -> list[dict]:
        """Extract taxonomy classifications for the given entities."""
        return self.taxonomy_extractor.extract_taxonomy(entities, text, level, source_id)

    def _run_pipeline(self, text: str, level: KnowledgeLevel, source_id: int) -> dict:
        """Run the full pipeline and return per-stage result counts.

        Order: entities -> semantics -> ontology -> taxonomy (the later
        stages consume the entity list produced by the first stage).
        Shared by analyze_page / analyze_section / analyze_document.
        """
        entities = self.extract_entities(text, level, source_id)
        semantics = self.extract_semantics(entities, text, level, source_id)
        ontology = self.extract_ontology(entities, text, level, source_id)
        taxonomy = self.extract_taxonomy(entities, text, level, source_id)
        return {
            "entities": len(entities),
            "semantics": len(semantics),
            "ontology": len(ontology),
            "taxonomy": len(taxonomy),
        }

    def analyze_page(self, page_id: int, text: str) -> dict:
        """Full knowledge analysis for a single page.

        Returns a dict with the page id and per-stage result counts.
        """
        db.log("INFO", f"Starte Seitenanalyse für page_id={page_id}")
        return {"page_id": page_id, **self._run_pipeline(text, KnowledgeLevel.PAGE, page_id)}

    def analyze_section(self, section_id: int, text: str) -> dict:
        """Full knowledge analysis for a single section.

        Returns a dict with the section id and per-stage result counts.
        """
        db.log("INFO", f"Starte Abschnittsanalyse für section_id={section_id}")
        return {"section_id": section_id, **self._run_pipeline(text, KnowledgeLevel.SECTION, section_id)}

    def analyze_document(self, document_id: int, text: str) -> dict:
        """Full knowledge analysis for a document (consolidated level).

        Returns a dict with the document id and per-stage result counts.
        """
        db.log("INFO", f"Starte Dokumentanalyse für document_id={document_id}")
        return {"document_id": document_id, **self._run_pipeline(text, KnowledgeLevel.DOCUMENT, document_id)}