{
"event": "PreToolUse",
"tool_name": "Write",
"tool_input": {
"file_path": "\/var\/www\/migration\/content-pipeline\/config\/settings.py",
"content": "\"\"\"\nContent Pipeline - Abstracted Configuration\n\nAll settings are loaded from environment variables for portability.\nCopy settings.env.example to .env and adjust values.\n\"\"\"\n\nimport os\nfrom pathlib import Path\n\n# =============================================================================\n# PATHS\n# =============================================================================\n\n# Document source directory\nDOCUMENT_SOURCE_PATH = os.environ.get(\n \"PIPELINE_DOCUMENT_PATH\",\n \"\/path\/to\/your\/documents\"\n)\n\n# Supported file extensions\nSUPPORTED_EXTENSIONS = [\".pdf\", \".pptx\", \".docx\", \".md\", \".txt\"]\n\n# =============================================================================\n# DATABASE - Content (chunks, entities, semantic data)\n# =============================================================================\n\nDB_CONFIG = {\n \"host\": os.environ.get(\"DB_HOST\", \"localhost\"),\n \"database\": os.environ.get(\"DB_NAME\", \"content_pipeline\"),\n \"user\": os.environ.get(\"DB_USER\", \"pipeline\"),\n \"password\": os.environ.get(\"DB_PASSWORD\", \"\"),\n \"charset\": \"utf8mb4\",\n}\n\n# Optional: Separate logging database\nDB_LOG_CONFIG = {\n \"host\": os.environ.get(\"DB_LOG_HOST\", os.environ.get(\"DB_HOST\", \"localhost\")),\n \"database\": os.environ.get(\"DB_LOG_NAME\", os.environ.get(\"DB_NAME\", \"content_pipeline\")),\n \"user\": os.environ.get(\"DB_LOG_USER\", os.environ.get(\"DB_USER\", \"pipeline\")),\n \"password\": os.environ.get(\"DB_LOG_PASSWORD\", os.environ.get(\"DB_PASSWORD\", \"\")),\n \"charset\": \"utf8mb4\",\n}\n\n# =============================================================================\n# QDRANT VECTOR DATABASE\n# =============================================================================\n\nQDRANT_HOST = os.environ.get(\"QDRANT_HOST\", \"localhost\")\nQDRANT_PORT = int(os.environ.get(\"QDRANT_PORT\", \"6333\"))\n\n# Collection configuration\n# Format: {collection_name: {size: embedding_dim, distance: metric}}\nQDRANT_COLLECTIONS = {\n os.environ.get(\"QDRANT_COLLECTION_DOCUMENTS\", \"documents\"): {\n \"size\": 1024,\n \"distance\": \"Cosine\"\n },\n os.environ.get(\"QDRANT_COLLECTION_ENTITIES\", \"entities\"): {\n \"size\": 1024,\n \"distance\": \"Cosine\"\n },\n}\n\n# =============================================================================\n# OLLAMA - Local LLM (Embeddings & Chat)\n# =============================================================================\n\nOLLAMA_HOST = os.environ.get(\"OLLAMA_HOST\", \"http:\/\/localhost:11434\")\n\n# Embedding model - must output vectors matching QDRANT collection size\nEMBEDDING_MODEL = os.environ.get(\"OLLAMA_EMBEDDING_MODEL\", \"mxbai-embed-large\")\nEMBEDDING_DIMENSION = int(os.environ.get(\"EMBEDDING_DIMENSION\", \"1024\"))\n\n# Chat\/Generation model for analysis\nOLLAMA_CHAT_MODEL = os.environ.get(\"OLLAMA_CHAT_MODEL\", \"llama3.2:3b\")\n\n# =============================================================================\n# ANTHROPIC API (Optional - for higher quality analysis)\n# =============================================================================\n\nANTHROPIC_API_KEY = os.environ.get(\"ANTHROPIC_API_KEY\", None)\nANTHROPIC_MODEL = os.environ.get(\"ANTHROPIC_MODEL\", \"claude-sonnet-4-20250514\")\n\n# =============================================================================\n# CHUNKING SETTINGS\n# =============================================================================\n\nCHUNK_OVERLAP_PERCENT = 
int(os.environ.get(\"CHUNK_OVERLAP_PERCENT\", \"10\"))\nMIN_CHUNK_SIZE = int(os.environ.get(\"MIN_CHUNK_SIZE\", \"100\"))\nMAX_CHUNK_SIZE = int(os.environ.get(\"MAX_CHUNK_SIZE\", \"2000\"))\n\n# =============================================================================\n# PIPELINE SETTINGS\n# =============================================================================\n\nMAX_RETRIES = int(os.environ.get(\"PIPELINE_MAX_RETRIES\", \"3\"))\nRETRY_BACKOFF_BASE = int(os.environ.get(\"PIPELINE_RETRY_BACKOFF\", \"2\"))\n\n# Semantic analysis: sync (immediate) or async (queue)\nSEMANTIC_SYNC = os.environ.get(\"SEMANTIC_SYNC\", \"true\").lower() == \"true\"\nSEMANTIC_USE_ANTHROPIC = os.environ.get(\"SEMANTIC_USE_ANTHROPIC\", \"false\").lower() == \"true\"\nSEMANTIC_QUEUE_BATCH_SIZE = int(os.environ.get(\"SEMANTIC_QUEUE_BATCH_SIZE\", \"5\"))\n\n# =============================================================================\n# OCR SETTINGS\n# =============================================================================\n\nOCR_ENABLED = os.environ.get(\"OCR_ENABLED\", \"true\").lower() == \"true\"\nOCR_LANGUAGE = os.environ.get(\"OCR_LANGUAGE\", \"deu\") # tesseract language code\n\n# =============================================================================\n# LOGGING\n# =============================================================================\n\nLOG_LEVEL = os.environ.get(\"LOG_LEVEL\", \"INFO\")\n\n\n# =============================================================================\n# HELPER FUNCTIONS\n# =============================================================================\n\ndef get_db_password():\n \"\"\"Get database password from environment.\"\"\"\n return os.environ.get(\"DB_PASSWORD\", \"\")\n\n\ndef get_anthropic_key():\n \"\"\"Get Anthropic API key from environment.\"\"\"\n return os.environ.get(\"ANTHROPIC_API_KEY\", \"\")\n\n\ndef get_ollama_model_id(full_key: str) -> str:\n \"\"\"Extract Ollama model ID from full_key.\n\n Example: 'ollama:gemma3:27b-it-qat' -> 'gemma3:27b-it-qat'\n \"\"\"\n if full_key.startswith(\"ollama:\"):\n return full_key[7:]\n return full_key\n"
}
}
{
"tool_response": {
"type": "create",
"filePath": "\/var\/www\/migration\/content-pipeline\/config\/settings.py",
"content": "\"\"\"\nContent Pipeline - Abstracted Configuration\n\nAll settings are loaded from environment variables for portability.\nCopy settings.env.example to .env and adjust values.\n\"\"\"\n\nimport os\nfrom pathlib import Path\n\n# =============================================================================\n# PATHS\n# =============================================================================\n\n# Document source directory\nDOCUMENT_SOURCE_PATH = os.environ.get(\n \"PIPELINE_DOCUMENT_PATH\",\n \"\/path\/to\/your\/documents\"\n)\n\n# Supported file extensions\nSUPPORTED_EXTENSIONS = [\".pdf\", \".pptx\", \".docx\", \".md\", \".txt\"]\n\n# =============================================================================\n# DATABASE - Content (chunks, entities, semantic data)\n# =============================================================================\n\nDB_CONFIG = {\n \"host\": os.environ.get(\"DB_HOST\", \"localhost\"),\n \"database\": os.environ.get(\"DB_NAME\", \"content_pipeline\"),\n \"user\": os.environ.get(\"DB_USER\", \"pipeline\"),\n \"password\": os.environ.get(\"DB_PASSWORD\", \"\"),\n \"charset\": \"utf8mb4\",\n}\n\n# Optional: Separate logging database\nDB_LOG_CONFIG = {\n \"host\": os.environ.get(\"DB_LOG_HOST\", os.environ.get(\"DB_HOST\", \"localhost\")),\n \"database\": os.environ.get(\"DB_LOG_NAME\", os.environ.get(\"DB_NAME\", \"content_pipeline\")),\n \"user\": os.environ.get(\"DB_LOG_USER\", os.environ.get(\"DB_USER\", \"pipeline\")),\n \"password\": os.environ.get(\"DB_LOG_PASSWORD\", os.environ.get(\"DB_PASSWORD\", \"\")),\n \"charset\": \"utf8mb4\",\n}\n\n# =============================================================================\n# QDRANT VECTOR DATABASE\n# =============================================================================\n\nQDRANT_HOST = os.environ.get(\"QDRANT_HOST\", \"localhost\")\nQDRANT_PORT = int(os.environ.get(\"QDRANT_PORT\", \"6333\"))\n\n# Collection configuration\n# Format: {collection_name: {size: embedding_dim, distance: metric}}\nQDRANT_COLLECTIONS = {\n os.environ.get(\"QDRANT_COLLECTION_DOCUMENTS\", \"documents\"): {\n \"size\": 1024,\n \"distance\": \"Cosine\"\n },\n os.environ.get(\"QDRANT_COLLECTION_ENTITIES\", \"entities\"): {\n \"size\": 1024,\n \"distance\": \"Cosine\"\n },\n}\n\n# =============================================================================\n# OLLAMA - Local LLM (Embeddings & Chat)\n# =============================================================================\n\nOLLAMA_HOST = os.environ.get(\"OLLAMA_HOST\", \"http:\/\/localhost:11434\")\n\n# Embedding model - must output vectors matching QDRANT collection size\nEMBEDDING_MODEL = os.environ.get(\"OLLAMA_EMBEDDING_MODEL\", \"mxbai-embed-large\")\nEMBEDDING_DIMENSION = int(os.environ.get(\"EMBEDDING_DIMENSION\", \"1024\"))\n\n# Chat\/Generation model for analysis\nOLLAMA_CHAT_MODEL = os.environ.get(\"OLLAMA_CHAT_MODEL\", \"llama3.2:3b\")\n\n# =============================================================================\n# ANTHROPIC API (Optional - for higher quality analysis)\n# =============================================================================\n\nANTHROPIC_API_KEY = os.environ.get(\"ANTHROPIC_API_KEY\", None)\nANTHROPIC_MODEL = os.environ.get(\"ANTHROPIC_MODEL\", \"claude-sonnet-4-20250514\")\n\n# =============================================================================\n# CHUNKING SETTINGS\n# =============================================================================\n\nCHUNK_OVERLAP_PERCENT = 
int(os.environ.get(\"CHUNK_OVERLAP_PERCENT\", \"10\"))\nMIN_CHUNK_SIZE = int(os.environ.get(\"MIN_CHUNK_SIZE\", \"100\"))\nMAX_CHUNK_SIZE = int(os.environ.get(\"MAX_CHUNK_SIZE\", \"2000\"))\n\n# =============================================================================\n# PIPELINE SETTINGS\n# =============================================================================\n\nMAX_RETRIES = int(os.environ.get(\"PIPELINE_MAX_RETRIES\", \"3\"))\nRETRY_BACKOFF_BASE = int(os.environ.get(\"PIPELINE_RETRY_BACKOFF\", \"2\"))\n\n# Semantic analysis: sync (immediate) or async (queue)\nSEMANTIC_SYNC = os.environ.get(\"SEMANTIC_SYNC\", \"true\").lower() == \"true\"\nSEMANTIC_USE_ANTHROPIC = os.environ.get(\"SEMANTIC_USE_ANTHROPIC\", \"false\").lower() == \"true\"\nSEMANTIC_QUEUE_BATCH_SIZE = int(os.environ.get(\"SEMANTIC_QUEUE_BATCH_SIZE\", \"5\"))\n\n# =============================================================================\n# OCR SETTINGS\n# =============================================================================\n\nOCR_ENABLED = os.environ.get(\"OCR_ENABLED\", \"true\").lower() == \"true\"\nOCR_LANGUAGE = os.environ.get(\"OCR_LANGUAGE\", \"deu\") # tesseract language code\n\n# =============================================================================\n# LOGGING\n# =============================================================================\n\nLOG_LEVEL = os.environ.get(\"LOG_LEVEL\", \"INFO\")\n\n\n# =============================================================================\n# HELPER FUNCTIONS\n# =============================================================================\n\ndef get_db_password():\n \"\"\"Get database password from environment.\"\"\"\n return os.environ.get(\"DB_PASSWORD\", \"\")\n\n\ndef get_anthropic_key():\n \"\"\"Get Anthropic API key from environment.\"\"\"\n return os.environ.get(\"ANTHROPIC_API_KEY\", \"\")\n\n\ndef get_ollama_model_id(full_key: str) -> str:\n \"\"\"Extract Ollama model ID from full_key.\n\n Example: 'ollama:gemma3:27b-it-qat' -> 'gemma3:27b-it-qat'\n \"\"\"\n if full_key.startswith(\"ollama:\"):\n return full_key[7:]\n return full_key\n",
"structuredPatch": [],
"originalFile": null
}
}
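
For orientation only: a minimal sketch of how downstream pipeline code might consume the settings module written by the logged Write call above. The actual consumer code is not part of this log; the import path (config.settings), the pymysql/qdrant-client stack, and the client calls are assumptions, while the setting names come from the file content shown in the payload.

import pymysql
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

from config import settings

# Relational store for chunks/entities, built from DB_CONFIG (host, database,
# user, password, charset map directly onto pymysql.connect keyword arguments).
db = pymysql.connect(**settings.DB_CONFIG)

# Vector store: make sure each configured collection exists with the expected
# embedding size; "Cosine" in the config maps onto qdrant's Distance enum.
qdrant = QdrantClient(host=settings.QDRANT_HOST, port=settings.QDRANT_PORT)
for name, cfg in settings.QDRANT_COLLECTIONS.items():
    if not qdrant.collection_exists(name):
        qdrant.create_collection(
            collection_name=name,
            vectors_config=VectorParams(
                size=cfg["size"],
                distance=Distance(cfg["distance"]),
            ),
        )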