test_taxonomy.py
- Pfad: /var/www/scripts/pipeline/test_taxonomy.py
- Namespace: pipeline
- Zeilen: 145 | Größe: 4,713 Bytes
- Geändert: 2025-12-28 09:03:09 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 100
- Dependencies: 100 (25%)
- LOC: 100 (20%)
- Methods: 100 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 100 (10%)
Keine Issues gefunden.
Dependencies 3
- use os
- use sys
- use db.Database
Funktionen 1
- test_taxonomy_pipeline() (Zeile 16)
Code
#!/usr/bin/env python3
"""
Test script for Taxonomy Pipeline functionality.
Tests chunk and entity taxonomy assignments.
"""
import os
import sys
# Prepend the directory containing this file to the module search path so
# that sibling modules can be imported when the script is run directly.
# This must happen before the `from db import Database` line that follows.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from db import Database
def _query(db, sql, *, single=False):
    """Execute *sql* via ``db.execute`` and return the fetched result.

    Args:
        db: Connected database object whose ``execute(sql)`` returns a cursor
            offering ``fetchall()``, ``fetchone()`` and ``close()``.
        sql: SQL statement to run.
        single: When true, return ``cursor.fetchone()``; otherwise return
            ``cursor.fetchall()``.

    Returns:
        Whatever the selected fetch method of the cursor returns.
    """
    cursor = db.execute(sql)
    try:
        return cursor.fetchone() if single else cursor.fetchall()
    finally:
        # Release the cursor even if fetching raises.
        cursor.close()


def _keyword_confidence(text, keywords):
    """Score *text* by how many of *keywords* occur in it as substrings.

    Returns:
        ``None`` when no keyword occurs; otherwise ``0.5`` plus ``0.15`` per
        matching keyword, capped at ``0.95``.
    """
    hits = sum(1 for keyword in keywords if keyword in text)
    if hits == 0:
        return None
    return min(0.5 + (hits * 0.15), 0.95)


def _assign_chunk_terms(db, chunks, term_lookup):
    """Assign taxonomy terms to *chunks* by keyword match; return the count.

    Only assignments for which ``db.add_chunk_taxonomy`` returns a truthy
    value are printed and counted.
    """
    keyword_map = {
        "Coaching": ["coaching", "coach", "begleitung"],
        "Kommunikation": ["kommunikation", "fragen", "gespräch", "dialog"],
        "Methoden": ["methode", "werkzeug", "tool", "intervention"],
        "Theorie": ["theorie", "konzept", "modell", "ansatz"],
        "Prozess": ["prozess", "ablauf", "schritt", "phase"],
        "Organisation": ["team", "organisation", "gruppe", "zusammenarbeit"],
        "Entwicklung": ["entwicklung", "veränderung", "wachstum"],
    }

    assignments = 0
    for chunk in chunks:
        # The preview may be None (presumably when the content column is
        # NULL); treat that as empty text instead of crashing on .lower().
        content_lower = (chunk["content_preview"] or "").lower()
        chunk_id = chunk["id"]

        for term_name, keywords in keyword_map.items():
            # Skip categories that have no matching taxonomy term.
            if term_name not in term_lookup:
                continue

            confidence = _keyword_confidence(content_lower, keywords)
            if confidence is None:
                continue

            result = db.add_chunk_taxonomy(
                chunk_id=chunk_id,
                term_id=term_lookup[term_name],
                confidence=confidence,
                source="auto",
            )
            if result:
                print(f" Chunk {chunk_id} -> {term_name} (conf: {confidence:.2f})")
                assignments += 1
    return assignments


def _assign_entity_terms(db, entities, term_lookup):
    """Link each entity to the taxonomy term named like its ``type``, if any."""
    for entity in entities:
        entity_type = entity["type"]
        if not entity_type or entity_type not in term_lookup:
            continue
        result = db.add_entity_taxonomy(
            entity_id=entity["id"],
            term_id=term_lookup[entity_type],
            relevance=0.8,
            validated=False,
        )
        if result:
            print(f" Entity {entity['id']} ({entity['name']}) -> {entity['type']}")


def _run_pipeline_steps(db):
    """Run steps 1-6 of the taxonomy test against the connected *db*."""
    print("=== Taxonomy Pipeline Test ===\n")

    # 1. Get taxonomy terms
    print("[1] Loading taxonomy terms...")
    terms = db.get_taxonomy_terms()
    print(f" Found {len(terms)} taxonomy terms")
    for term in terms[:5]:
        print(f" - {term['id']}: {term['name']}")
    print()

    # 2. Get some chunks to test
    print("[2] Loading test chunks...")
    chunks = _query(
        db,
        """SELECT c.id, c.document_id, LEFT(c.content, 200) as content_preview
FROM chunks c
LIMIT 5""",
    )
    print(f" Found {len(chunks)} chunks for testing")
    print()

    # 3. Assign taxonomy based on content keywords
    print("[3] Assigning taxonomy terms to chunks...")
    term_lookup = {t["name"]: t["id"] for t in terms}
    assignments = _assign_chunk_terms(db, chunks, term_lookup)
    print(f"\n Created {assignments} new taxonomy assignments")
    print()

    # 4. Verify mappings
    print("[4] Verifying taxonomy mappings...")
    result = _query(db, "SELECT COUNT(*) as count FROM chunk_taxonomy", single=True)
    print(f" Total chunk_taxonomy entries: {result['count']}")

    # 5. Show sample mappings
    print("\n[5] Sample mappings with details:")
    mappings = _query(
        db,
        """SELECT ct.chunk_id, ct.confidence, ct.source, tt.name as term_name
FROM chunk_taxonomy ct
JOIN taxonomy_terms tt ON ct.taxonomy_term_id = tt.id
ORDER BY ct.created_at DESC
LIMIT 10""",
    )
    for m in mappings:
        print(f" Chunk {m['chunk_id']}: {m['term_name']} ({m['confidence']:.2f}, {m['source']})")

    # 6. Test entity taxonomy (if entities exist)
    print("\n[6] Checking entities for taxonomy assignment...")
    entities = _query(db, "SELECT id, name, type FROM entities LIMIT 3")
    if entities:
        print(f" Found {len(entities)} entities")
        _assign_entity_terms(db, entities, term_lookup)
    else:
        print(" No entities found to test")


def test_taxonomy_pipeline() -> bool:
    """Exercise the taxonomy pipeline against existing chunks and entities.

    Connects to the database, loads taxonomy terms, assigns terms to a few
    chunks by keyword matching, prints the resulting mappings, and links
    entities whose ``type`` equals a taxonomy term name.

    Returns:
        ``False`` if the database connection could not be established,
        ``True`` once all steps have run.

    Raises:
        Any exception raised by the database layer propagates to the caller;
        the connection is disconnected first.
    """
    db = Database()
    if not db.connect():
        print("ERROR: Database connection failed")
        return False

    try:
        _run_pipeline_steps(db)
    finally:
        # Always release the connection, including when a step fails.
        db.disconnect()

    print("\n=== Test Complete ===")
    return True
if __name__ == "__main__":
    # Map the boolean outcome to a process exit status: 0 on success, 1 on failure.
    sys.exit(0 if test_taxonomy_pipeline() else 1)