utils.py
- Pfad:
/var/www/scripts/pipeline/knowledge/utils.py - Namespace: pipeline
- Zeilen: 81 | Größe: 2,476 Bytes
- Geändert: 2025-12-25 15:50:37 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 100
- Dependencies: 100 (25%)
- LOC: 100 (20%)
- Methods: 100 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 100 (10%)
Keine Issues gefunden.
Dependencies 4
- use sys
- use db.db
- use analyzer.KnowledgeExtractor
- use models.ModelConfig
Funktionen 2
- get_model_config() — Zeile 13
- process_document_knowledge() — Zeile 30
Code
"""Utility-Funktionen für Wissensextraktion."""
import sys
sys.path.insert(0, "/var/www/scripts/pipeline")
from db import db
from .analyzer import KnowledgeExtractor
from .models import ModelConfig
def get_model_config(provider: str = "ollama", model_name: str | None = None) -> ModelConfig:
    """Build a model configuration for the requested provider.

    Args:
        provider: 'ollama' or 'anthropic'
        model_name: Optional explicit model override

    Returns:
        ModelConfig for the extractor
    """
    # Default model per provider; anything other than 'anthropic' is treated as ollama.
    defaults = {
        "anthropic": "claude-3-haiku-20240307",
        "ollama": "gemma3:27b-it-qat",
    }
    effective_provider = "anthropic" if provider == "anthropic" else "ollama"
    return ModelConfig(
        provider=effective_provider,
        model_name=model_name or defaults[effective_provider],
    )
def _fetch_all(query: str, params: tuple) -> list:
    """Run *query* with *params* and return all rows, closing the cursor even on error."""
    cursor = db.execute(query, params)
    try:
        return cursor.fetchall()
    finally:
        # Original code leaked the cursor if fetchall() raised; always close it.
        cursor.close()


def _fetch_one(query: str, params: tuple):
    """Run *query* with *params* and return the first row (or None), closing the cursor even on error."""
    cursor = db.execute(query, params)
    try:
        return cursor.fetchone()
    finally:
        cursor.close()


def process_document_knowledge(document_id: int, provider: str = "ollama", model_name: str | None = None) -> dict:
    """Run the full 3-level knowledge analysis for one document.

    1. Page level
    2. Section level
    3. Document level (consolidated full text)

    Args:
        document_id: Primary key of the document to analyze.
        provider: 'ollama' or 'anthropic' (forwarded to get_model_config).
        model_name: Optional explicit model override.

    Returns:
        dict with keys 'document_id', 'pages', 'sections', 'document'.
    """
    config = get_model_config(provider, model_name)
    extractor = KnowledgeExtractor(config)
    results: dict = {"document_id": document_id, "pages": [], "sections": [], "document": None}

    # 1. Analyze pages (skip pages with empty/NULL text)
    pages = _fetch_all(
        "SELECT id, text_content FROM document_pages WHERE document_id = %s ORDER BY page_number",
        (document_id,),
    )
    for page in pages:
        if page["text_content"]:
            results["pages"].append(extractor.analyze_page(page["id"], page["text_content"]))

    # 2. Analyze sections (skip sections with empty/NULL content)
    sections = _fetch_all(
        "SELECT id, content FROM document_sections WHERE document_id = %s ORDER BY sort_order",
        (document_id,),
    )
    for section in sections:
        if section["content"]:
            results["sections"].append(extractor.analyze_section(section["id"], section["content"]))

    # 3. Document level: full text assembled from chunks.
    doc = _fetch_one(
        "SELECT GROUP_CONCAT(content SEPARATOR ' ') as full_text FROM chunks WHERE document_id = %s",
        (document_id,),
    )
    if doc and doc["full_text"]:
        # 10000-char cap — presumably a model context limit; TODO confirm against the extractor.
        results["document"] = extractor.analyze_document(document_id, doc["full_text"][:10000])
    return results