test_taxonomy.py

Code Hygiene Score: 100

Keine Issues gefunden.

Dependencies 3

Funktionen 1

Code

#!/usr/bin/env python3
"""
Test script for Taxonomy Pipeline functionality.
Tests chunk and entity taxonomy assignments.
"""

import os
import sys

# Add pipeline directory to path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from db import Database


def _fetch_all(db, sql):
    """Execute *sql* via ``db.execute`` and return every fetched row.

    The cursor is closed even if fetching raises, so a failing query does
    not leave a cursor open.
    """
    cursor = db.execute(sql)
    try:
        return cursor.fetchall()
    finally:
        cursor.close()


def _run_taxonomy_steps(db):
    """Run steps 1-6 of the taxonomy check against an already connected *db*.

    Prints progress to stdout. Exceptions from the database layer are not
    caught here; they propagate to the caller, which is responsible for
    disconnecting.
    """
    print("=== Taxonomy Pipeline Test ===\n")

    # 1. Get taxonomy terms
    print("[1] Loading taxonomy terms...")
    terms = db.get_taxonomy_terms()
    print(f"    Found {len(terms)} taxonomy terms")
    for term in terms[:5]:
        print(f"    - {term['id']}: {term['name']}")
    print()

    # 2. Get some chunks to test
    print("[2] Loading test chunks...")
    chunks = _fetch_all(
        db,
        """SELECT c.id, c.document_id, LEFT(c.content, 200) as content_preview
           FROM chunks c
           LIMIT 5""",
    )
    print(f"    Found {len(chunks)} chunks for testing")
    print()

    # 3. Assign taxonomy based on content keywords
    print("[3] Assigning taxonomy terms to chunks...")
    keyword_map = {
        "Coaching": ["coaching", "coach", "begleitung"],
        "Kommunikation": ["kommunikation", "fragen", "gespräch", "dialog"],
        "Methoden": ["methode", "werkzeug", "tool", "intervention"],
        "Theorie": ["theorie", "konzept", "modell", "ansatz"],
        "Prozess": ["prozess", "ablauf", "schritt", "phase"],
        "Organisation": ["team", "organisation", "gruppe", "zusammenarbeit"],
        "Entwicklung": ["entwicklung", "veränderung", "wachstum"],
    }

    # Build term lookup (term name -> term id)
    term_lookup = {t["name"]: t["id"] for t in terms}

    assignments = 0
    for chunk in chunks:
        # The preview is NULL/None when the chunk has no content; treat that
        # as empty text instead of crashing on ``None.lower()``.
        content_lower = (chunk["content_preview"] or "").lower()
        chunk_id = chunk["id"]

        for term_name, keywords in keyword_map.items():
            # Only terms that actually exist in the taxonomy can be assigned.
            if term_name not in term_lookup:
                continue

            # Substring match: count how many keywords occur in the preview.
            matches = sum(1 for kw in keywords if kw in content_lower)
            if matches == 0:
                continue

            # Confidence starts at 0.5, grows 0.15 per matching keyword,
            # and is capped at 0.95.
            confidence = min(0.5 + (matches * 0.15), 0.95)

            result = db.add_chunk_taxonomy(
                chunk_id=chunk_id,
                term_id=term_lookup[term_name],
                confidence=confidence,
                source="auto",
            )

            # Only a truthy result is reported and counted as an assignment.
            if result:
                print(f"    Chunk {chunk_id} -> {term_name} (conf: {confidence:.2f})")
                assignments += 1

    print(f"\n    Created {assignments} new taxonomy assignments")
    print()

    # 4. Verify mappings
    print("[4] Verifying taxonomy mappings...")
    cursor = db.execute("SELECT COUNT(*) as count FROM chunk_taxonomy")
    try:
        result = cursor.fetchone()
    finally:
        cursor.close()
    print(f"    Total chunk_taxonomy entries: {result['count']}")

    # 5. Show sample mappings
    print("\n[5] Sample mappings with details:")
    mappings = _fetch_all(
        db,
        """SELECT ct.chunk_id, ct.confidence, ct.source, tt.name as term_name
           FROM chunk_taxonomy ct
           JOIN taxonomy_terms tt ON ct.taxonomy_term_id = tt.id
           ORDER BY ct.created_at DESC
           LIMIT 10""",
    )

    for m in mappings:
        print(f"    Chunk {m['chunk_id']}: {m['term_name']} ({m['confidence']:.2f}, {m['source']})")

    # 6. Test entity taxonomy (if entities exist)
    print("\n[6] Checking entities for taxonomy assignment...")
    entities = _fetch_all(db, "SELECT id, name, type FROM entities LIMIT 3")

    if entities:
        print(f"    Found {len(entities)} entities")
        for entity in entities:
            # An entity is assigned only when its type is non-empty and
            # equals the name of an existing taxonomy term.
            if entity["type"] and entity["type"] in term_lookup:
                term_id = term_lookup[entity["type"]]
                result = db.add_entity_taxonomy(
                    entity_id=entity["id"],
                    term_id=term_id,
                    relevance=0.8,
                    validated=False,
                )
                if result:
                    print(f"    Entity {entity['id']} ({entity['name']}) -> {entity['type']}")
    else:
        print("    No entities found to test")


def test_taxonomy_pipeline():
    """Test the taxonomy pipeline with existing chunks.

    Connects to the database, loads the taxonomy terms, assigns terms to up
    to five chunks by keyword matching, prints the resulting mappings, and
    tries to assign terms to up to three entities by their type.

    Returns:
        False if the database connection could not be established,
        True once all steps have run.

    Raises:
        Whatever the database layer raises while the steps run; the
        connection is disconnected before the exception propagates.
    """
    db = Database()

    if not db.connect():
        print("ERROR: Database connection failed")
        return False

    try:
        _run_taxonomy_steps(db)
    finally:
        # Always release the connection, including when a step raises.
        db.disconnect()

    print("\n=== Test Complete ===")
    return True


if __name__ == "__main__":
    # Translate the boolean outcome into a process exit status:
    # 0 when the pipeline test reports success, 1 otherwise.
    exit_status = 1
    if test_taxonomy_pipeline():
        exit_status = 0
    sys.exit(exit_status)
← Übersicht