ontology_classifier.py

Code Hygiene Score: 100

Keine Issues gefunden.

Dependencies 2

Funktionen 3

Code

"""
Ontology Classifier - Classify entities into ontology classes.
"""

import sys

sys.path.insert(0, "/var/www/scripts/pipeline")

from db import db

# Keyword-based classification rules, keyed by ontology class id.
#
# Each rule has two lists:
#   "keywords"     - lowercase (German) terms that classify_entity() looks for
#                    as substrings of the entity's lowercased name/description.
#   "entity_types" - values of the entity's "type" field that count as a
#                    direct match for the class.
#
# NOTE: matching is by substring, so overlapping keywords are counted
# separately (text containing "fragetechnik" also matches "frage").
# NOTE(review): ids 1-4 presumably correspond to existing ontology class
# records referenced by entity_classifications.ontology_class_id - confirm.
CLASSIFICATION_RULES = {
    1: {  # Coaching method
        "keywords": [
            "methode",
            "technik",
            "tool",
            "werkzeug",
            "intervention",
            "übung",
            "format",
            "frage",
            "skalierung",
            "aufstellung",
            "visualisierung",
            "reflexion",
            "fragetechnik",
        ],
        "entity_types": ["METHOD", "TOOL"],
    },
    2: {  # Coaching concept
        "keywords": [
            "konzept",
            "theorie",
            "modell",
            "ansatz",
            "prinzip",
            "grundlage",
            "haltung",
            "systemisch",
            "lösungsorientiert",
            "konstruktivismus",
            "philosophie",
            "paradigma",
        ],
        "entity_types": ["CONCEPT", "THEORY"],
    },
    3: {  # Coaching process
        "keywords": [
            "prozess",
            "phase",
            "ablauf",
            "schritt",
            "struktur",
            "sitzung",
            "gespräch",
            "dialog",
            "begleitung",
            "verlauf",
        ],
        "entity_types": ["PROCESS"],
    },
    4: {  # Team intervention
        "keywords": [
            "team",
            "gruppe",
            "organisation",
            "zusammenarbeit",
            "konflikt",
            "dynamik",
            "rolle",
            "moderation",
        ],
        "entity_types": ["TEAM", "GROUP"],
    },
}


def classify_entity(entity: dict) -> list[dict]:
    """
    Score one entity against every rule in CLASSIFICATION_RULES.

    The entity's "name" and "description" (missing or None treated as empty)
    are lowercased and searched for each rule's keywords by substring. A rule
    contributes 0.3 when the entity's "type" is listed in its "entity_types",
    plus 0.15 per matching keyword, with the keyword part capped at 0.6.
    Rules scoring below 0.3 are dropped.

    Returns a list of {"ontology_class_id", "confidence"} dicts ordered from
    highest to lowest confidence.
    """
    label = (entity.get("name") or "").lower()
    details = (entity.get("description") or "").lower()
    haystack = f"{label} {details}"
    kind = entity.get("type", "")

    matches = []

    for class_id, rule in CLASSIFICATION_RULES.items():
        # Fixed bonus when the entity's declared type belongs to this class.
        type_bonus = 0.3 if kind in rule.get("entity_types", []) else 0.0

        # Every keyword found anywhere in the text adds 0.15, up to 0.6.
        hits = len([kw for kw in rule["keywords"] if kw in haystack])
        score = type_bonus + min(0.6, hits * 0.15)

        # Too weak a signal: a single keyword hit alone (0.15) is not enough.
        if score < 0.3:
            continue

        matches.append(
            {
                "ontology_class_id": class_id,
                "confidence": min(score, 1.0),
            }
        )

    # Strongest classification first; ties keep rule order (stable sort).
    return sorted(matches, key=lambda m: m["confidence"], reverse=True)


def _store_classifications(entities) -> int:
    """
    Classify each entity and upsert the results into entity_classifications.

    For every classification returned by classify_entity() one row is
    inserted, or its confidence updated if the key already exists, and
    committed individually. A failure on one mapping is logged as a WARNING
    via db.log and does not stop the remaining mappings.

    Args:
        entities: iterable of entity rows; each must support ["id"] and the
            .get() lookups used by classify_entity().
            NOTE(review): assumes db.execute yields dict-style rows - confirm.

    Returns:
        Number of entity-class mappings written successfully.
    """
    stored = 0

    for entity in entities:
        for cls in classify_entity(entity):
            try:
                # Insert or update classification
                cursor = db.execute(
                    """
                    INSERT INTO entity_classifications (entity_id, ontology_class_id, confidence)
                    VALUES (%s, %s, %s)
                    ON DUPLICATE KEY UPDATE confidence = VALUES(confidence)
                    """,
                    (entity["id"], cls["ontology_class_id"], cls["confidence"]),
                )
                try:
                    db.commit()
                finally:
                    # Close the cursor even when the commit fails, so a
                    # failing row does not leak it.
                    cursor.close()
                stored += 1
            except Exception as e:
                db.log("WARNING", f"Failed to classify entity {entity['id']}: {e}")

    return stored


def classify_entities(document_id: int) -> int:
    """
    Classify all entities from a document into ontology classes.

    Entities are selected through the chunk_entities -> chunks join for the
    given document; results are stored in the entity_classifications table
    and an INFO summary is written via db.log.

    Args:
        document_id: id of the document whose entities are classified.

    Returns:
        Number of entity-class mappings written successfully.
    """
    # Get entities linked to this document via chunk_entities
    cursor = db.execute(
        """
        SELECT DISTINCT e.id, e.name, e.type, e.description
        FROM entities e
        JOIN chunk_entities ce ON e.id = ce.entity_id
        JOIN chunks c ON ce.chunk_id = c.id
        WHERE c.document_id = %s
        """,
        (document_id,),
    )
    try:
        entities = cursor.fetchall()
    finally:
        # Release the cursor even if fetching the rows fails.
        cursor.close()

    classified = _store_classifications(entities)

    db.log("INFO", f"Classified {classified} entity-class mappings for document {document_id}")
    return classified


def classify_all_entities() -> int:
    """
    Classify all entities in the database.

    Returns:
        Number of entity-class mappings written successfully.
    """
    cursor = db.execute("SELECT id, name, type, description FROM entities")
    try:
        entities = cursor.fetchall()
    finally:
        # Release the cursor even if fetching the rows fails.
        cursor.close()

    return _store_classifications(entities)
← Übersicht