quality_test.py
- Path: /var/www/scripts/pipeline/quality_test.py
- Namespace: pipeline
- Lines: 359 | Size: 12,214 bytes
- Modified: 2025-12-31 03:01:09 | Scanned: 2025-12-31 10:22:15
Code Hygiene Score: 65
- Dependencies: 20 (25%)
- LOC: 47 (20%)
- Methods: 100 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 60 (10%)
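The overall score appears to be the weighted sum of the six component scores, with the weights given in parentheses; a quick sanity check (the linear weighting scheme is inferred from the listing above, not documented by the tool):

```python
# Inferred recomputation of the hygiene score from the component values above
scores = {"dependencies": 20, "loc": 47, "methods": 100, "secrets": 100, "classes": 100, "magic_numbers": 60}
weights = {"dependencies": 0.25, "loc": 0.20, "methods": 0.20, "secrets": 0.15, "classes": 0.10, "magic_numbers": 0.10}
total = sum(scores[k] * weights[k] for k in scores)  # ~65.4, consistent with the reported 65
```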
Issues (4)

| Line | Type | Description |
|------|------|-------------|
| 249 | magic_number | Magic number found: 60 |
| 251 | magic_number | Magic number found: 60 |
| 278 | magic_number | Magic number found: 100 |
| 287 | magic_number | Magic number found: 60 |
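All four findings are magic numbers; judging from the source below, they most likely correspond to the `"="` separator widths and the `[:100]` reasoning preview in `run_quality_test`. A minimal sketch of how they could be lifted into the existing constants module (the names below are hypothetical, not part of constants.py):

```python
# Hypothetical additions to pipeline/constants.py (illustrative names only)
SEPARATOR_WIDTH = 60           # width of the "=" divider printed per model
REASONING_PREVIEW_CHARS = 100  # characters of the reasoning string shown in the log

# and in run_quality_test (sketch):
#   print(f"\n{'=' * SEPARATOR_WIDTH}")
#   print(f" Begründung: {taxonomy_result['reasoning'][:REASONING_PREVIEW_CHARS]}...")
```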
Dependencies (13)
- use json
- use os
- use sys
- use time
- use requests
- use constants.LLM_MAX_TOKENS
- use constants.MS_PER_SECOND
- use constants.PROMPT_TEXT_LIMIT
- use constants.PROMPT_TEXT_LIMIT_SHORT
- use constants.TEST_TIMEOUT
- use db.db
- use anthropic
- use re
Functions (8)
- get_anthropic_client() (Line 35)
- run_ollama() (Line 51)
- run_anthropic() (Line 79)
- extract_entities() (Line 99)
- classify_taxonomy() (Line 141)
- generate_questions() (Line 189)
- run_quality_test() (Line 226)
- print_report() (Line 293)
Code
#!/usr/bin/env python3
"""
Quality comparison test for different LLM models in the pipeline.
Tests entity extraction, taxonomy classification, and question generation.
"""
import json
import os
import sys
import time
import requests
# Add pipeline directory to path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from constants import (
    LLM_MAX_TOKENS,
    MS_PER_SECOND,
    PROMPT_TEXT_LIMIT,
    PROMPT_TEXT_LIMIT_SHORT,
    TEST_TIMEOUT,
)
from db import db
OLLAMA_HOST = "http://localhost:11434"
# Models to test
MODELS = {
    "gemma3": "gemma3:27b-it-qat",
    "anthropic": "claude-opus-4-5-20251101",
}
def get_anthropic_client():
    """Get Anthropic API client."""
    import anthropic

    api_key = os.environ.get("ANTHROPIC_API_KEY", "")
    if not api_key:
        env_file = "/var/www/dev.campus.systemische-tools.de/.env"
        if os.path.exists(env_file):
            with open(env_file) as f:
                for line in f:
                    if line.startswith("ANTHROPIC_API_KEY="):
                        api_key = line.split("=", 1)[1].strip()
                        break
    return anthropic.Anthropic(api_key=api_key) if api_key else None
def run_ollama(model, prompt, timeout=TEST_TIMEOUT):
    """Run prompt through Ollama model."""
    start = time.time()
    try:
        response = requests.post(
            f"{OLLAMA_HOST}/api/generate",
            json={
                "model": model,
                "prompt": prompt,
                "stream": False,
                "format": "json",
                "options": {"temperature": 0.3, "num_predict": LLM_MAX_TOKENS},
            },
            timeout=timeout,
        )
        response.raise_for_status()
        data = response.json()
        elapsed = time.time() - start
        return {
            "response": data.get("response", ""),
            "tokens": data.get("eval_count", 0),
            "duration_ms": elapsed * MS_PER_SECOND,
            "success": True,
        }
    except Exception as e:
        return {"response": "", "error": str(e), "success": False, "duration_ms": (time.time() - start) * MS_PER_SECOND}
def run_anthropic(client, prompt, model="claude-opus-4-5-20251101"):
    """Run prompt through Anthropic model."""
    start = time.time()
    try:
        message = client.messages.create(
            model=model, max_tokens=LLM_MAX_TOKENS, messages=[{"role": "user", "content": prompt}]
        )
        elapsed = time.time() - start
        return {
            "response": message.content[0].text,
            "tokens": message.usage.input_tokens + message.usage.output_tokens,
            "input_tokens": message.usage.input_tokens,
            "output_tokens": message.usage.output_tokens,
            "duration_ms": elapsed * MS_PER_SECOND,
            "success": True,
        }
    except Exception as e:
        return {"response": "", "error": str(e), "success": False, "duration_ms": (time.time() - start) * MS_PER_SECOND}
def extract_entities(text, model_name, model_id, client=None):
    """Extract entities using specified model."""
    prompt = f"""Analysiere den folgenden deutschen Text und extrahiere alle wichtigen Entitäten.
Kategorisiere jede Entität als:
- PERSON (Namen von Personen)
- ORGANIZATION (Firmen, Institutionen, Gruppen)
- CONCEPT (Fachbegriffe, Methoden, Theorien)
- LOCATION (Orte, Länder)
- DATE (Zeitangaben)
- OTHER (Sonstiges)
Antworte NUR im JSON-Format:
{{"entities": [{{"name": "...", "type": "...", "context": "kurze Beschreibung"}}]}}
Text:
{text[:PROMPT_TEXT_LIMIT]}
"""

    if model_name == "anthropic":
        result = run_anthropic(client, prompt, model_id)
    else:
        result = run_ollama(model_id, prompt)

    # Parse JSON
    entities = []
    if result["success"]:
        try:
            import re
            json_match = re.search(r"\{[\s\S]*\}", result["response"])
            if json_match:
                data = json.loads(json_match.group())
                entities = data.get("entities", [])
        except (json.JSONDecodeError, AttributeError):
            pass  # JSON parsing failed, keep empty entities

    result["entities"] = entities
    result["entity_count"] = len(entities)
    return result
def classify_taxonomy(text, model_name, model_id, client=None):
    """Classify text into taxonomy categories."""
    prompt = f"""Klassifiziere den folgenden Text in passende Kategorien.
Wähle aus diesen Hauptkategorien:
- Methoden (Therapiemethoden, Coaching-Techniken)
- Theorie (Konzepte, Modelle, Grundlagen)
- Praxis (Anwendung, Fallbeispiele, Übungen)
- Organisation (Strukturen, Prozesse, Rollen)
- Kommunikation (Gesprächsführung, Interaktion)
- Entwicklung (Persönliche Entwicklung, Veränderung)
- Teamarbeit (Teamdynamik, Zusammenarbeit)
Antworte NUR im JSON-Format:
{{"categories": ["...", "..."], "confidence": 0.0-1.0, "reasoning": "kurze Begründung"}}
Text:
{text[:PROMPT_TEXT_LIMIT_SHORT]}
"""

    if model_name == "anthropic":
        result = run_anthropic(client, prompt, model_id)
    else:
        result = run_ollama(model_id, prompt)

    # Parse JSON
    categories = []
    confidence = 0
    reasoning = ""
    if result["success"]:
        try:
            import re
            json_match = re.search(r"\{[\s\S]*\}", result["response"])
            if json_match:
                data = json.loads(json_match.group())
                categories = data.get("categories", [])
                confidence = data.get("confidence", 0)
                reasoning = data.get("reasoning", "")
        except (json.JSONDecodeError, AttributeError):
            pass  # JSON parsing failed, keep empty categories

    result["categories"] = categories
    result["confidence"] = confidence
    result["reasoning"] = reasoning
    return result
def generate_questions(text, model_name, model_id, client=None):
    """Generate quiz questions from text."""
    prompt = f"""Erstelle 3 Verständnisfragen zu folgendem Lerntext.
Die Fragen sollen das Verständnis der Kernkonzepte prüfen.
Antworte NUR im JSON-Format:
{{"questions": [
{{"question": "...", "answer": "...", "difficulty": "leicht|mittel|schwer"}}
]}}
Text:
{text[:PROMPT_TEXT_LIMIT_SHORT]}
"""

    if model_name == "anthropic":
        result = run_anthropic(client, prompt, model_id)
    else:
        result = run_ollama(model_id, prompt)

    # Parse JSON
    questions = []
    if result["success"]:
        try:
            import re
            json_match = re.search(r"\{[\s\S]*\}", result["response"])
            if json_match:
                data = json.loads(json_match.group())
                questions = data.get("questions", [])
        except (json.JSONDecodeError, AttributeError):
            pass  # JSON parsing failed, keep empty questions

    result["questions"] = questions
    result["question_count"] = len(questions)
    return result
def run_quality_test(document_id):
    """Run full quality comparison test."""
    db.connect()

    # Get document content
    cursor = db.execute(
        """SELECT c.content FROM chunks c
           WHERE c.document_id = %s
           ORDER BY c.chunk_index""",
        (document_id,),
    )
    chunks = cursor.fetchall()
    cursor.close()

    full_text = "\n\n".join([c["content"] for c in chunks])
    print(f"Dokument geladen: {len(full_text)} Zeichen, {len(chunks)} Chunks\n")

    # Get Anthropic client
    anthropic_client = get_anthropic_client()

    results = {}
    for model_name, model_id in MODELS.items():
        print(f"\n{'=' * 60}")
        print(f"TESTE: {model_name} ({model_id})")
        print("=" * 60)

        results[model_name] = {"model_id": model_id, "tests": {}}

        # Skip Anthropic if no client
        client = anthropic_client if model_name == "anthropic" else None
        if model_name == "anthropic" and not client:
            print(" ÜBERSPRUNGEN: Kein Anthropic API Key")
            continue

        # Test 1: Entity Extraction
        print("\n[1/3] Entity Extraction...")
        entity_result = extract_entities(full_text, model_name, model_id, client)
        results[model_name]["tests"]["entities"] = entity_result
        print(f" → {entity_result['entity_count']} Entitäten gefunden ({entity_result['duration_ms']:.0f}ms)")
        if entity_result.get("entities"):
            for e in entity_result["entities"][:5]:
                print(f" • {e.get('name', '?')} ({e.get('type', '?')})")

        # Test 2: Taxonomy Classification
        print("\n[2/3] Taxonomy Classification...")
        taxonomy_result = classify_taxonomy(full_text, model_name, model_id, client)
        results[model_name]["tests"]["taxonomy"] = taxonomy_result
        print(
            f" → Kategorien: {', '.join(taxonomy_result['categories'])} (Konfidenz: {taxonomy_result['confidence']})"
        )
        if taxonomy_result.get("reasoning"):
            print(f" Begründung: {taxonomy_result['reasoning'][:100]}...")

        # Test 3: Question Generation
        print("\n[3/3] Question Generation...")
        question_result = generate_questions(full_text, model_name, model_id, client)
        results[model_name]["tests"]["questions"] = question_result
        print(f" → {question_result['question_count']} Fragen generiert ({question_result['duration_ms']:.0f}ms)")
        if question_result.get("questions"):
            for q in question_result["questions"][:3]:
                print(f" Q: {q.get('question', '?')[:60]}...")

    db.disconnect()
    return results
def print_report(results):
    """Print detailed comparison report."""
    print("\n")
    print("=" * 80)
    print("QUALITÄTSREPORT: Pipeline Output-Vergleich")
    print("=" * 80)

    # Entity comparison
    print("\n### 1. ENTITY EXTRACTION ###\n")
    print(f"{'Modell':<20} {'Entitäten':>10} {'Zeit (ms)':>12} {'Tokens':>10}")
    print("-" * 55)
    for model, data in results.items():
        if "entities" in data.get("tests", {}):
            e = data["tests"]["entities"]
            tokens = e.get("tokens", e.get("output_tokens", "-"))
            print(f"{model:<20} {e['entity_count']:>10} {e['duration_ms']:>12.0f} {tokens:>10}")

    # Taxonomy comparison
    print("\n### 2. TAXONOMY CLASSIFICATION ###\n")
    for model, data in results.items():
        if "taxonomy" in data.get("tests", {}):
            t = data["tests"]["taxonomy"]
            print(f"{model}: {', '.join(t['categories'])} (Konfidenz: {t['confidence']})")

    # Question comparison
    print("\n### 3. QUESTION GENERATION ###\n")
    for model, data in results.items():
        if "questions" in data.get("tests", {}):
            q = data["tests"]["questions"]
            print(f"\n{model} ({q['question_count']} Fragen, {q['duration_ms']:.0f}ms):")
            for i, question in enumerate(q.get("questions", [])[:3], 1):
                print(f" {i}. {question.get('question', '?')}")
                print(f" → {question.get('answer', '?')[:80]}...")

    # Summary
    print("\n### ZUSAMMENFASSUNG ###\n")
    summary = []
    for model, data in results.items():
        tests = data.get("tests", {})
        total_time = sum(t.get("duration_ms", 0) for t in tests.values())
        total_entities = tests.get("entities", {}).get("entity_count", 0)
        total_questions = tests.get("questions", {}).get("question_count", 0)
        categories = len(tests.get("taxonomy", {}).get("categories", []))
        summary.append(
            {
                "model": model,
                "time_ms": total_time,
                "entities": total_entities,
                "questions": total_questions,
                "categories": categories,
            }
        )

    print(f"{'Modell':<20} {'Gesamt-Zeit':>12} {'Entitäten':>10} {'Fragen':>8} {'Kategorien':>12}")
    print("-" * 65)
    for s in summary:
        print(f"{s['model']:<20} {s['time_ms']:>10.0f}ms {s['entities']:>10} {s['questions']:>8} {s['categories']:>12}")
if __name__ == "__main__":
    doc_id = int(sys.argv[1]) if len(sys.argv) > 1 else 2
    print(f"Starte Qualitätstest für Dokument {doc_id}...\n")

    results = run_quality_test(doc_id)
    print_report(results)
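Usage: the script takes the document id as its first command-line argument and falls back to document 2, e.g. `python3 quality_test.py 2`. The test can also be driven from another script in the pipeline directory; a minimal sketch, assuming the database and a local Ollama instance on localhost:11434 are reachable:

```python
# Sketch: programmatic use from another pipeline script (document id 2 assumed to exist)
from quality_test import run_quality_test, print_report

results = run_quality_test(2)  # same default document id as the __main__ block
print_report(results)
```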