{
"event": "PreToolUse",
"tool_name": "Write",
"tool_input": {
"file_path": "\/var\/www\/migration\/content-pipeline\/src\/config.py",
"content": "\"\"\"\nContent Pipeline Configuration\n\nAll settings loaded from environment variables for portability.\nSee config\/settings.env.example for available options.\n\"\"\"\n\nimport os\nfrom pathlib import Path\n\n# =============================================================================\n# PATHS\n# =============================================================================\n\n# Document source directory\nNEXTCLOUD_PATH = os.environ.get(\n \"PIPELINE_DOCUMENT_PATH\",\n str(Path.home() \/ \"documents\")\n)\nSUPPORTED_EXTENSIONS = [\".pdf\", \".pptx\", \".docx\", \".md\", \".txt\"]\n\n# =============================================================================\n# DATABASE - Content (chunks, entities, semantic data)\n# =============================================================================\n\nDB_CONFIG = {\n \"host\": os.environ.get(\"DB_HOST\", \"localhost\"),\n \"database\": os.environ.get(\"DB_NAME\", \"content_pipeline\"),\n \"user\": os.environ.get(\"DB_USER\", \"pipeline\"),\n \"password\": \"\", # Set via get_db_password()\n \"charset\": \"utf8mb4\",\n}\n\n# Logging\/System Database (defaults to same as content DB)\nDB_LOG_CONFIG = {\n \"host\": os.environ.get(\"DB_LOG_HOST\", os.environ.get(\"DB_HOST\", \"localhost\")),\n \"database\": os.environ.get(\"DB_LOG_NAME\", os.environ.get(\"DB_NAME\", \"content_pipeline\")),\n \"user\": os.environ.get(\"DB_LOG_USER\", os.environ.get(\"DB_USER\", \"pipeline\")),\n \"password\": \"\",\n \"charset\": \"utf8mb4\",\n}\n\n# Protokoll Database (for LLM call logging)\nDB_PROTOKOLL_CONFIG = {\n \"host\": os.environ.get(\"DB_PROTOKOLL_HOST\", os.environ.get(\"DB_HOST\", \"localhost\")),\n \"database\": os.environ.get(\"DB_PROTOKOLL_NAME\", os.environ.get(\"DB_NAME\", \"content_pipeline\")),\n \"user\": os.environ.get(\"DB_PROTOKOLL_USER\", os.environ.get(\"DB_USER\", \"pipeline\")),\n \"password\": \"\",\n \"charset\": \"utf8mb4\",\n}\n\n# =============================================================================\n# QDRANT VECTOR DATABASE\n# =============================================================================\n\nQDRANT_HOST = os.environ.get(\"QDRANT_HOST\", \"localhost\")\nQDRANT_PORT = int(os.environ.get(\"QDRANT_PORT\", \"6333\"))\n\nQDRANT_COLLECTIONS = {\n os.environ.get(\"QDRANT_COLLECTION_DOCUMENTS\", \"documents\"): {\n \"size\": 1024,\n \"distance\": \"Cosine\"\n },\n os.environ.get(\"QDRANT_COLLECTION_MAIL\", \"mail\"): {\n \"size\": 1024,\n \"distance\": \"Cosine\"\n },\n os.environ.get(\"QDRANT_COLLECTION_ENTITIES\", \"entities\"): {\n \"size\": 1024,\n \"distance\": \"Cosine\"\n },\n}\n\n# =============================================================================\n# OLLAMA - Local LLM\n# =============================================================================\n\nOLLAMA_HOST = os.environ.get(\"OLLAMA_HOST\", \"http:\/\/localhost:11434\")\n\n# Embedding model\nEMBEDDING_MODEL = os.environ.get(\"OLLAMA_EMBEDDING_MODEL\", \"mxbai-embed-large\")\nEMBEDDING_DIMENSION = int(os.environ.get(\"EMBEDDING_DIMENSION\", \"1024\"))\n\n# Chat model - set dynamically or via environment\nOLLAMA_CHAT_MODEL = os.environ.get(\"OLLAMA_CHAT_MODEL\", \"llama3.2:3b\")\n\n# =============================================================================\n# ANTHROPIC API (Optional)\n# =============================================================================\n\nANTHROPIC_MODEL = os.environ.get(\"ANTHROPIC_MODEL\", \"claude-sonnet-4-20250514\")\nANTHROPIC_API_KEY = None # Set via get_anthropic_key()\n\n# =============================================================================\n# CHUNKING SETTINGS\n# =============================================================================\n\nCHUNK_OVERLAP_PERCENT = int(os.environ.get(\"CHUNK_OVERLAP_PERCENT\", \"10\"))\nMIN_CHUNK_SIZE = int(os.environ.get(\"MIN_CHUNK_SIZE\", \"100\"))\nMAX_CHUNK_SIZE = int(os.environ.get(\"MAX_CHUNK_SIZE\", \"2000\"))\n\n# =============================================================================\n# PIPELINE SETTINGS\n# =============================================================================\n\nMAX_RETRIES = int(os.environ.get(\"PIPELINE_MAX_RETRIES\", \"3\"))\nRETRY_BACKOFF_BASE = int(os.environ.get(\"PIPELINE_RETRY_BACKOFF\", \"2\"))\n\n# Semantic analysis\nSEMANTIC_SYNC = os.environ.get(\"SEMANTIC_SYNC\", \"true\").lower() == \"true\"\nSEMANTIC_USE_ANTHROPIC = os.environ.get(\"SEMANTIC_USE_ANTHROPIC\", \"false\").lower() == \"true\"\nSEMANTIC_QUEUE_BATCH_SIZE = int(os.environ.get(\"SEMANTIC_QUEUE_BATCH_SIZE\", \"5\"))\nSEMANTIC_AUTO_QUEUE = os.environ.get(\"SEMANTIC_AUTO_QUEUE\", \"true\").lower() == \"true\"\n\n# =============================================================================\n# OCR SETTINGS\n# =============================================================================\n\nOCR_ENABLED = os.environ.get(\"OCR_ENABLED\", \"true\").lower() == \"true\"\nOCR_LANGUAGE = os.environ.get(\"OCR_LANGUAGE\", \"deu\")\n\n# Rotation detection\nROTATION_DETECTION_ENABLED = os.environ.get(\"ROTATION_DETECTION_ENABLED\", \"true\").lower() == \"true\"\nROTATION_OSD_CONFIDENCE_THRESHOLD = float(os.environ.get(\"ROTATION_OSD_CONFIDENCE_THRESHOLD\", \"2.0\"))\nROTATION_USE_VISION_FALLBACK = os.environ.get(\"ROTATION_USE_VISION_FALLBACK\", \"false\").lower() == \"true\"\n\n# =============================================================================\n# LOGGING\n# =============================================================================\n\nLOG_LEVEL = os.environ.get(\"LOG_LEVEL\", \"INFO\")\n\n\n# =============================================================================\n# HELPER FUNCTIONS\n# =============================================================================\n\ndef get_db_password():\n \"\"\"Get database password from environment.\"\"\"\n password = os.environ.get(\"DB_PASSWORD\", \"\")\n if not password:\n password = os.environ.get(\"MARIADB_ROOT_PASSWORD\", \"\")\n if not password:\n # Try .env file in project root\n env_file = Path(__file__).parent.parent \/ \".env\"\n if env_file.exists():\n with open(env_file) as f:\n for line in f:\n line = line.strip()\n if line.startswith(\"DB_PASSWORD=\"):\n password = line.split(\"=\", 1)[1].strip().strip('\"').strip(\"'\")\n break\n return password\n\n\ndef get_anthropic_key():\n \"\"\"Get Anthropic API key from environment.\"\"\"\n key = os.environ.get(\"ANTHROPIC_API_KEY\", \"\")\n if not key:\n # Try .env file in project root\n env_file = Path(__file__).parent.parent \/ \".env\"\n if env_file.exists():\n with open(env_file) as f:\n for line in f:\n line = line.strip()\n if line.startswith(\"ANTHROPIC_API_KEY=\"):\n key = line.split(\"=\", 1)[1].strip().strip('\"').strip(\"'\")\n break\n return key\n\n\ndef get_ollama_model_id(full_key: str) -> str:\n \"\"\"Extract Ollama model ID from full_key.\n\n Example: 'ollama:gemma3:27b-it-qat' -> 'gemma3:27b-it-qat'\n \"\"\"\n if full_key.startswith(\"ollama:\"):\n return full_key[7:]\n return full_key\n\n\n# =============================================================================\n# INITIALIZE\n# =============================================================================\n\n# Update DB configs with password\nDB_CONFIG[\"password\"] = get_db_password()\nDB_LOG_CONFIG[\"password\"] = get_db_password()\nDB_PROTOKOLL_CONFIG[\"password\"] = get_db_password()\n\n# Set Anthropic key if available\nANTHROPIC_API_KEY = get_anthropic_key() or None\n"
}
}
{
"tool_response": {
"type": "create",
"filePath": "\/var\/www\/migration\/content-pipeline\/src\/config.py",
"content": "\"\"\"\nContent Pipeline Configuration\n\nAll settings loaded from environment variables for portability.\nSee config\/settings.env.example for available options.\n\"\"\"\n\nimport os\nfrom pathlib import Path\n\n# =============================================================================\n# PATHS\n# =============================================================================\n\n# Document source directory\nNEXTCLOUD_PATH = os.environ.get(\n \"PIPELINE_DOCUMENT_PATH\",\n str(Path.home() \/ \"documents\")\n)\nSUPPORTED_EXTENSIONS = [\".pdf\", \".pptx\", \".docx\", \".md\", \".txt\"]\n\n# =============================================================================\n# DATABASE - Content (chunks, entities, semantic data)\n# =============================================================================\n\nDB_CONFIG = {\n \"host\": os.environ.get(\"DB_HOST\", \"localhost\"),\n \"database\": os.environ.get(\"DB_NAME\", \"content_pipeline\"),\n \"user\": os.environ.get(\"DB_USER\", \"pipeline\"),\n \"password\": \"\", # Set via get_db_password()\n \"charset\": \"utf8mb4\",\n}\n\n# Logging\/System Database (defaults to same as content DB)\nDB_LOG_CONFIG = {\n \"host\": os.environ.get(\"DB_LOG_HOST\", os.environ.get(\"DB_HOST\", \"localhost\")),\n \"database\": os.environ.get(\"DB_LOG_NAME\", os.environ.get(\"DB_NAME\", \"content_pipeline\")),\n \"user\": os.environ.get(\"DB_LOG_USER\", os.environ.get(\"DB_USER\", \"pipeline\")),\n \"password\": \"\",\n \"charset\": \"utf8mb4\",\n}\n\n# Protokoll Database (for LLM call logging)\nDB_PROTOKOLL_CONFIG = {\n \"host\": os.environ.get(\"DB_PROTOKOLL_HOST\", os.environ.get(\"DB_HOST\", \"localhost\")),\n \"database\": os.environ.get(\"DB_PROTOKOLL_NAME\", os.environ.get(\"DB_NAME\", \"content_pipeline\")),\n \"user\": os.environ.get(\"DB_PROTOKOLL_USER\", os.environ.get(\"DB_USER\", \"pipeline\")),\n \"password\": \"\",\n \"charset\": \"utf8mb4\",\n}\n\n# =============================================================================\n# QDRANT VECTOR DATABASE\n# =============================================================================\n\nQDRANT_HOST = os.environ.get(\"QDRANT_HOST\", \"localhost\")\nQDRANT_PORT = int(os.environ.get(\"QDRANT_PORT\", \"6333\"))\n\nQDRANT_COLLECTIONS = {\n os.environ.get(\"QDRANT_COLLECTION_DOCUMENTS\", \"documents\"): {\n \"size\": 1024,\n \"distance\": \"Cosine\"\n },\n os.environ.get(\"QDRANT_COLLECTION_MAIL\", \"mail\"): {\n \"size\": 1024,\n \"distance\": \"Cosine\"\n },\n os.environ.get(\"QDRANT_COLLECTION_ENTITIES\", \"entities\"): {\n \"size\": 1024,\n \"distance\": \"Cosine\"\n },\n}\n\n# =============================================================================\n# OLLAMA - Local LLM\n# =============================================================================\n\nOLLAMA_HOST = os.environ.get(\"OLLAMA_HOST\", \"http:\/\/localhost:11434\")\n\n# Embedding model\nEMBEDDING_MODEL = os.environ.get(\"OLLAMA_EMBEDDING_MODEL\", \"mxbai-embed-large\")\nEMBEDDING_DIMENSION = int(os.environ.get(\"EMBEDDING_DIMENSION\", \"1024\"))\n\n# Chat model - set dynamically or via environment\nOLLAMA_CHAT_MODEL = os.environ.get(\"OLLAMA_CHAT_MODEL\", \"llama3.2:3b\")\n\n# =============================================================================\n# ANTHROPIC API (Optional)\n# =============================================================================\n\nANTHROPIC_MODEL = os.environ.get(\"ANTHROPIC_MODEL\", \"claude-sonnet-4-20250514\")\nANTHROPIC_API_KEY = None # Set via get_anthropic_key()\n\n# =============================================================================\n# CHUNKING SETTINGS\n# =============================================================================\n\nCHUNK_OVERLAP_PERCENT = int(os.environ.get(\"CHUNK_OVERLAP_PERCENT\", \"10\"))\nMIN_CHUNK_SIZE = int(os.environ.get(\"MIN_CHUNK_SIZE\", \"100\"))\nMAX_CHUNK_SIZE = int(os.environ.get(\"MAX_CHUNK_SIZE\", \"2000\"))\n\n# =============================================================================\n# PIPELINE SETTINGS\n# =============================================================================\n\nMAX_RETRIES = int(os.environ.get(\"PIPELINE_MAX_RETRIES\", \"3\"))\nRETRY_BACKOFF_BASE = int(os.environ.get(\"PIPELINE_RETRY_BACKOFF\", \"2\"))\n\n# Semantic analysis\nSEMANTIC_SYNC = os.environ.get(\"SEMANTIC_SYNC\", \"true\").lower() == \"true\"\nSEMANTIC_USE_ANTHROPIC = os.environ.get(\"SEMANTIC_USE_ANTHROPIC\", \"false\").lower() == \"true\"\nSEMANTIC_QUEUE_BATCH_SIZE = int(os.environ.get(\"SEMANTIC_QUEUE_BATCH_SIZE\", \"5\"))\nSEMANTIC_AUTO_QUEUE = os.environ.get(\"SEMANTIC_AUTO_QUEUE\", \"true\").lower() == \"true\"\n\n# =============================================================================\n# OCR SETTINGS\n# =============================================================================\n\nOCR_ENABLED = os.environ.get(\"OCR_ENABLED\", \"true\").lower() == \"true\"\nOCR_LANGUAGE = os.environ.get(\"OCR_LANGUAGE\", \"deu\")\n\n# Rotation detection\nROTATION_DETECTION_ENABLED = os.environ.get(\"ROTATION_DETECTION_ENABLED\", \"true\").lower() == \"true\"\nROTATION_OSD_CONFIDENCE_THRESHOLD = float(os.environ.get(\"ROTATION_OSD_CONFIDENCE_THRESHOLD\", \"2.0\"))\nROTATION_USE_VISION_FALLBACK = os.environ.get(\"ROTATION_USE_VISION_FALLBACK\", \"false\").lower() == \"true\"\n\n# =============================================================================\n# LOGGING\n# =============================================================================\n\nLOG_LEVEL = os.environ.get(\"LOG_LEVEL\", \"INFO\")\n\n\n# =============================================================================\n# HELPER FUNCTIONS\n# =============================================================================\n\ndef get_db_password():\n \"\"\"Get database password from environment.\"\"\"\n password = os.environ.get(\"DB_PASSWORD\", \"\")\n if not password:\n password = os.environ.get(\"MARIADB_ROOT_PASSWORD\", \"\")\n if not password:\n # Try .env file in project root\n env_file = Path(__file__).parent.parent \/ \".env\"\n if env_file.exists():\n with open(env_file) as f:\n for line in f:\n line = line.strip()\n if line.startswith(\"DB_PASSWORD=\"):\n password = line.split(\"=\", 1)[1].strip().strip('\"').strip(\"'\")\n break\n return password\n\n\ndef get_anthropic_key():\n \"\"\"Get Anthropic API key from environment.\"\"\"\n key = os.environ.get(\"ANTHROPIC_API_KEY\", \"\")\n if not key:\n # Try .env file in project root\n env_file = Path(__file__).parent.parent \/ \".env\"\n if env_file.exists():\n with open(env_file) as f:\n for line in f:\n line = line.strip()\n if line.startswith(\"ANTHROPIC_API_KEY=\"):\n key = line.split(\"=\", 1)[1].strip().strip('\"').strip(\"'\")\n break\n return key\n\n\ndef get_ollama_model_id(full_key: str) -> str:\n \"\"\"Extract Ollama model ID from full_key.\n\n Example: 'ollama:gemma3:27b-it-qat' -> 'gemma3:27b-it-qat'\n \"\"\"\n if full_key.startswith(\"ollama:\"):\n return full_key[7:]\n return full_key\n\n\n# =============================================================================\n# INITIALIZE\n# =============================================================================\n\n# Update DB configs with password\nDB_CONFIG[\"password\"] = get_db_password()\nDB_LOG_CONFIG[\"password\"] = get_db_password()\nDB_PROTOKOLL_CONFIG[\"password\"] = get_db_password()\n\n# Set Anthropic key if available\nANTHROPIC_API_KEY = get_anthropic_key() or None\n",
"structuredPatch": [],
"originalFile": null
}
}