config.py
- Path: /var/www/scripts/pipeline/config.py
- Namespace: pipeline
- Lines: 151 | Size: 4,508 bytes
- Modified: 2025-12-27 11:40:54 | Scanned: 2025-12-31 10:22:15
Code Hygiene Score: 100
- Dependencies: 100 (25%)
- LOC: 100 (20%)
- Methods: 100 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 100 (10%)
No issues found.
Dependencies (2)
- use os
- use model_registry.get_chat_model
Functions (4)
- get_db_password() (line 87)
- get_anthropic_key() (line 104)
- get_chat_model_from_registry() (line 128)
- get_ollama_model_id() (line 139)
Code
"""
KI-System Pipeline Configuration
"""
import os
# Nextcloud Settings
NEXTCLOUD_PATH = "/var/www/nextcloud/data/root/files/Documents"
SUPPORTED_EXTENSIONS = [".pdf", ".pptx", ".docx", ".md", ".txt"]
# MariaDB Settings - Content Database (chunks, entities, semantic data)
DB_CONFIG = {
"host": "localhost",
"database": "ki_content",
"user": "root",
"password": "", # Set via environment variable DB_PASSWORD
"charset": "utf8mb4",
}
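# Usage sketch (illustrative; mysql-connector-python as the driver is an
# assumption, not confirmed by this file):
#   import mysql.connector
#   conn = mysql.connector.connect(**DB_CONFIG)
#   conn.close()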
# Logging/System Database (for pipeline_log)
DB_LOG_CONFIG = {
"host": "localhost",
"database": "ki_dev",
"user": "root",
"password": "",
"charset": "utf8mb4",
}
# Protokoll Database (for LLM call logging to ki_dev.protokoll)
DB_PROTOKOLL_CONFIG = {
"host": "localhost",
"database": "ki_dev",
"user": "root",
"password": "",
"charset": "utf8mb4",
}
# Qdrant Settings
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333
QDRANT_COLLECTIONS = {
"documents": {"size": 1024, "distance": "Cosine"},
"mail": {"size": 1024, "distance": "Cosine"},
"entities": {"size": 1024, "distance": "Cosine"},
}
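# Usage sketch (illustrative; assumes the qdrant-client package, which this
# file itself does not import):
#   from qdrant_client import QdrantClient
#   from qdrant_client.models import Distance, VectorParams
#   client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
#   for name, cfg in QDRANT_COLLECTIONS.items():
#       client.create_collection(
#           collection_name=name,
#           vectors_config=VectorParams(size=cfg["size"],
#                                       distance=Distance(cfg["distance"])),
#       )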
# Ollama Settings (Embeddings)
OLLAMA_HOST = "http://localhost:11434"
EMBEDDING_MODEL = "mxbai-embed-large" # 1024-dim, max ~1600 chars per chunk
EMBEDDING_DIMENSION = 1024
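# Usage sketch (illustrative; assumes the requests package and Ollama's
# classic /api/embeddings endpoint):
#   import requests
#   resp = requests.post(f"{OLLAMA_HOST}/api/embeddings",
#                        json={"model": EMBEDDING_MODEL, "prompt": "chunk text"})
#   vector = resp.json()["embedding"]  # len(vector) == EMBEDDING_DIMENSION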
# Ollama Settings (Chat/Generation)
# NOTE: Now loaded dynamically from ki_dev.ai_models via model_registry.py
# OLLAMA_CHAT_MODEL is set at the end of this file after imports
# Anthropic Settings (Chat/Analysis)
ANTHROPIC_MODEL = "claude-opus-4-5-20251101"
# Chunking Settings
CHUNK_OVERLAP_PERCENT = 10
MIN_CHUNK_SIZE = 100
MAX_CHUNK_SIZE = 2000
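# Illustration (assumes the overlap is taken as a percentage of the chunk size):
#   overlap_chars = MAX_CHUNK_SIZE * CHUNK_OVERLAP_PERCENT // 100  # -> 200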
# Pipeline Settings
MAX_RETRIES = 3
RETRY_BACKOFF_BASE = 2 # Seconds, exponential: 2, 4, 8
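# Illustration: the wait before retry n (1-based) grows exponentially:
#   delay = RETRY_BACKOFF_BASE ** n  # 2, 4, 8 seconds for MAX_RETRIES = 3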
# Semantic Pipeline Settings (Layer 4)
SEMANTIC_SYNC = True # True = run semantic immediately, False = queue for async
SEMANTIC_USE_ANTHROPIC = False # True = Anthropic API, False = Ollama
SEMANTIC_QUEUE_BATCH_SIZE = 5 # Items to process per queue run
SEMANTIC_AUTO_QUEUE = True # Auto-queue after embedding if SEMANTIC_SYNC=False
# OCR Settings
OCR_ENABLED = True
OCR_LANGUAGE = "deu" # German
# Rotation Detection Settings
ROTATION_DETECTION_ENABLED = True
ROTATION_OSD_CONFIDENCE_THRESHOLD = 2.0 # Minimum confidence for OSD detection
ROTATION_USE_VISION_FALLBACK = False # Use vision LLM for images without text
# Logging
LOG_LEVEL = "INFO"
def get_db_password():
"""Get database password from environment or .env file."""
password = os.environ.get("DB_PASSWORD", "")
if not password:
password = os.environ.get("MARIADB_ROOT_PASSWORD", "")
if not password:
# Try to read from project .env file
env_file = "/var/www/dev.campus.systemische-tools.de/.env"
if os.path.exists(env_file):
with open(env_file) as f:
for line in f:
if line.startswith("MARIADB_ROOT_PASSWORD="):
password = line.split("=", 1)[1].strip()
break
return password
def get_anthropic_key():
"""Get Anthropic API key from environment or .env file."""
key = os.environ.get("ANTHROPIC_API_KEY", "")
if not key:
# Try to read from project .env file
env_file = "/var/www/dev.campus.systemische-tools.de/.env"
if os.path.exists(env_file):
with open(env_file) as f:
for line in f:
if line.startswith("ANTHROPIC_API_KEY="):
key = line.split("=", 1)[1].strip()
break
return key
# Update DB configs with password (read once, reuse for all three configs)
_db_password = get_db_password()
DB_CONFIG["password"] = _db_password
DB_LOG_CONFIG["password"] = _db_password
DB_PROTOKOLL_CONFIG["password"] = _db_password
# Force Ollama for cost control (set to get_anthropic_key() for Anthropic)
ANTHROPIC_API_KEY = None
def get_chat_model_from_registry():
"""Get default chat model from central registry (ki_dev.ai_models)."""
try:
from model_registry import get_chat_model
return get_chat_model()
except Exception:
# Fallback if registry unavailable
return "ollama:gemma3:27b-it-qat"
def get_ollama_model_id(full_key: str) -> str:
"""Extract Ollama model ID from full_key.
Example: 'ollama:gemma3:27b-it-qat' -> 'gemma3:27b-it-qat'
"""
if full_key.startswith("ollama:"):
return full_key[7:]
return full_key
# Dynamic model loading from database
OLLAMA_CHAT_MODEL = get_ollama_model_id(get_chat_model_from_registry())
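
Usage example

A minimal sketch of how a consumer might inspect the resolved configuration. Importing config triggers the module's side effects (password lookup, registry query); the script itself is illustrative and not part of the scanned file.

"""Illustrative consumer of pipeline/config.py (not part of the scan)."""
import config

if __name__ == "__main__":
    # OLLAMA_CHAT_MODEL was resolved at import time from ki_dev.ai_models,
    # falling back to "gemma3:27b-it-qat" if the registry is unreachable.
    print(f"chat model:      {config.OLLAMA_CHAT_MODEL}")
    print(f"embedding model: {config.EMBEDDING_MODEL} ({config.EMBEDDING_DIMENSION}-dim)")
    print(f"db password set: {bool(config.DB_CONFIG['password'])}")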