utils.py

Code Hygiene Score: 100

Keine Issues gefunden.

Dependencies 4

Funktionen 2

Code

"""Utility-Funktionen für Wissensextraktion."""

import sys

sys.path.insert(0, "/var/www/scripts/pipeline")

from db import db

from .analyzer import KnowledgeExtractor
from .models import ModelConfig


def get_model_config(provider: str = "ollama", model_name: str | None = None) -> ModelConfig:
    """
    Build a model configuration for the requested provider.

    Args:
        provider: 'ollama' or 'anthropic' (anything else falls back to 'ollama')
        model_name: optional explicit model identifier; a falsy value
            selects the provider's default model

    Returns:
        ModelConfig for the extractor
    """
    # Normalize the provider first: only "anthropic" is recognized explicitly,
    # every other value maps to the ollama default (matches original behavior).
    resolved = "anthropic" if provider == "anthropic" else "ollama"
    defaults = {
        "anthropic": "claude-3-haiku-20240307",
        "ollama": "gemma3:27b-it-qat",
    }
    return ModelConfig(provider=resolved, model_name=model_name or defaults[resolved])


def process_document_knowledge(document_id: int, provider: str = "ollama", model_name: str | None = None) -> dict:
    """
    Run the full 3-level knowledge analysis for one document.

    Levels:
        1. Page level     -- every non-empty page text
        2. Section level  -- every non-empty section content
        3. Document level -- consolidated full text (truncated to 10k chars)

    Args:
        document_id: primary key of the document to analyze
        provider: 'ollama' or 'anthropic'
        model_name: optional explicit model identifier

    Returns:
        dict with keys 'document_id', 'pages', 'sections', 'document'
    """
    config = get_model_config(provider, model_name)
    extractor = KnowledgeExtractor(config)

    results = {"document_id": document_id, "pages": [], "sections": [], "document": None}

    # 1. Analyze pages
    pages = _fetch_all(
        "SELECT id, text_content FROM document_pages WHERE document_id = %s ORDER BY page_number",
        (document_id,),
    )
    for page in pages:
        if page["text_content"]:
            results["pages"].append(extractor.analyze_page(page["id"], page["text_content"]))

    # 2. Analyze sections
    sections = _fetch_all(
        "SELECT id, content FROM document_sections WHERE document_id = %s ORDER BY sort_order",
        (document_id,),
    )
    for section in sections:
        if section["content"]:
            results["sections"].append(extractor.analyze_section(section["id"], section["content"]))

    # 3. Document level (concatenated chunk text)
    doc = _fetch_one(
        "SELECT GROUP_CONCAT(content SEPARATOR ' ') as full_text FROM chunks WHERE document_id = %s",
        (document_id,),
    )
    if doc and doc["full_text"]:
        # Truncate to 10k chars — presumably to stay within the model's
        # context window; TODO confirm the limit matches the configured model.
        results["document"] = extractor.analyze_document(document_id, doc["full_text"][:10000])

    return results


def _fetch_all(query: str, params: tuple) -> list:
    """Execute *query* and return all rows; the cursor is closed even if the fetch raises."""
    cursor = db.execute(query, params)
    try:
        return cursor.fetchall()
    finally:
        # Original code leaked the cursor when fetchall() raised.
        cursor.close()


def _fetch_one(query: str, params: tuple):
    """Execute *query* and return a single row; the cursor is closed even if the fetch raises."""
    cursor = db.execute(query, params)
    try:
        return cursor.fetchone()
    finally:
        cursor.close()
← Übersicht