Protokoll #23915
| ID | 23915 |
|---|---|
| Zeitstempel | 2025-12-27 15:33:35.862873 |
| Client | root |
| IP | - [PHP Warning: Undefined array key "request_ip" in /var/www/dev.campus.systemische-tools.de/src/View/protokoll/show.php on line 13; PHP Deprecated: htmlspecialchars(): Passing null to parameter #1 ($string) of type string is deprecated in /var/www/dev.campus.systemische-tools.de/src/View/protokoll/show.php on line 13] |
| Modell | claude-sonnet-4-20250514 |
| Status | completed |
| Tokens | 2,035 (Input: 0, Output: 0) |
| Dauer | 110 ms |
| Request-Zeit | - [PHP Warning: Undefined array key "request_timestamp" in /var/www/dev.campus.systemische-tools.de/src/View/protokoll/show.php on line 18] |
| Response-Zeit | - |
Warning: Undefined array key "error_message" in /var/www/dev.campus.systemische-tools.de/src/View/protokoll/show.php on line 22
Request
{
"event": "PreToolUse",
"tool_name": "Edit",
"tool_input": {
"file_path": "\/var\/www\/scripts\/pipeline\/embed.py",
"old_string": " \"content\": chunk[\"content\"][:1000], # Truncate for payload",
"new_string": " \"content\": chunk[\"content\"][:BATCH_LIMIT], # Truncate for payload"
}
}
Response
{
"tool_response": {
"filePath": "\/var\/www\/scripts\/pipeline\/embed.py",
"oldString": " \"content\": chunk[\"content\"][:1000], # Truncate for payload",
"newString": " \"content\": chunk[\"content\"][:BATCH_LIMIT], # Truncate for payload",
"originalFile": "\"\"\"\nEmbedding generation for KI-System Pipeline\nUses Ollama (mxbai-embed-large) for vector embeddings.\n\"\"\"\n\nimport json\nimport re\nimport uuid\n\nimport requests\n\nfrom config import EMBEDDING_DIMENSION, EMBEDDING_MODEL, OLLAMA_HOST, QDRANT_HOST, QDRANT_PORT\nfrom constants import BATCH_LIMIT, DEFAULT_LIMIT, OLLAMA_TIMEOUT\nfrom db import db\n\n# Max chars for mxbai-embed model (512 token context, varies by content)\n# Conservative limit to handle German compound words and special chars\nMAX_EMBED_CHARS = 800\n\n\ndef get_embedding(text):\n \"\"\"Get embedding vector from Ollama.\"\"\"\n # Skip empty content\n if not text or not text.strip():\n return None\n\n # Collapse consecutive dots\/periods (table of contents, etc.)\n text = re.sub(r\"\\.{3,}\", \"...\", text)\n\n # Truncate if too long for model context\n if len(text) > MAX_EMBED_CHARS:\n text = text[:MAX_EMBED_CHARS]\n\n try:\n response = requests.post(\n f\"{OLLAMA_HOST}\/api\/embeddings\",\n json={\"model\": EMBEDDING_MODEL, \"prompt\": text},\n timeout=OLLAMA_TIMEOUT,\n )\n response.raise_for_status()\n data = response.json()\n return data.get(\"embedding\")\n except Exception as e:\n db.log(\"ERROR\", f\"Embedding generation failed: {e}\")\n return None\n\n\ndef store_in_qdrant(collection, point_id, vector, payload):\n \"\"\"Store embedding in Qdrant.\"\"\"\n try:\n response = requests.put(\n f\"http:\/\/{QDRANT_HOST}:{QDRANT_PORT}\/collections\/{collection}\/points\",\n json={\"points\": [{\"id\": point_id, \"vector\": vector, \"payload\": payload}]},\n headers={\"Content-Type\": \"application\/json\"},\n timeout=30,\n )\n response.raise_for_status()\n return True\n except Exception as e:\n db.log(\"ERROR\", f\"Qdrant storage failed: {e}\")\n return False\n\n\ndef embed_chunks(chunks, document_id, document_title, source_path, progress=None):\n \"\"\"\n Generate embeddings for chunks and store in Qdrant.\n Returns number of successfully embedded chunks.\n \"\"\"\n 
embedded_count = 0\n total_chunks = len(chunks)\n\n for i, chunk in enumerate(chunks):\n # Log every chunk for full visibility\n if progress:\n progress.add_log(f\"Embed: Chunk {i + 1}\/{total_chunks}...\")\n\n # Generate embedding\n embedding = get_embedding(chunk[\"content\"])\n\n if not embedding:\n db.log(\"WARNING\", f\"Failed to embed chunk {i} of document {document_id}\")\n continue\n\n # Verify dimension\n if len(embedding) != EMBEDDING_DIMENSION:\n db.log(\"ERROR\", f\"Wrong embedding dimension: {len(embedding)} vs {EMBEDDING_DIMENSION}\")\n continue\n\n # Generate UUID for Qdrant\n point_id = str(uuid.uuid4())\n\n # Prepare payload\n payload = {\n \"document_id\": document_id,\n \"document_title\": document_title,\n \"chunk_index\": i,\n \"content\": chunk[\"content\"][:1000], # Truncate for payload\n \"heading_path\": json.dumps(chunk.get(\"heading_path\", [])),\n \"source_path\": source_path,\n }\n\n # Add any chunk metadata\n if chunk.get(\"metadata\"):\n for key, value in chunk[\"metadata\"].items():\n if isinstance(value, (str, int, float, bool)):\n payload[key] = value\n\n # Store in Qdrant\n if store_in_qdrant(\"documents\", point_id, embedding, payload):\n # Update chunk in database with Qdrant ID\n db.update_chunk_qdrant_id(chunk.get(\"db_id\"), point_id)\n embedded_count += 1\n db.log(\"INFO\", f\"Embedded chunk {i}\/{len(chunks)}\", f\"doc={document_id}\")\n else:\n db.log(\"ERROR\", f\"Failed to store chunk {i} in Qdrant\")\n\n return embedded_count\n\n\ndef embed_pending_chunks(limit=100):\n \"\"\"Process chunks that haven't been embedded yet.\"\"\"\n db.connect()\n\n try:\n chunks = db.get_chunks_for_embedding(limit)\n db.log(\"INFO\", f\"Found {len(chunks)} chunks to embed\")\n\n if not chunks:\n return 0\n\n embedded = 0\n for chunk in chunks:\n embedding = get_embedding(chunk[\"content\"])\n\n if not embedding:\n continue\n\n point_id = str(uuid.uuid4())\n\n # Get document info\n cursor = db.execute(\"SELECT filename, source_path FROM 
documents WHERE id = %s\", (chunk[\"document_id\"],))\n doc = cursor.fetchone()\n cursor.close()\n\n payload = {\n \"document_id\": chunk[\"document_id\"],\n \"document_title\": doc[\"filename\"] if doc else \"\",\n \"chunk_id\": chunk[\"id\"],\n \"content\": chunk[\"content\"][:1000],\n \"source_path\": doc[\"source_path\"] if doc else \"\",\n }\n\n if store_in_qdrant(\"documents\", point_id, embedding, payload):\n db.update_chunk_qdrant_id(chunk[\"id\"], point_id)\n embedded += 1\n\n db.log(\"INFO\", f\"Embedded {embedded}\/{len(chunks)} chunks\")\n return embedded\n\n except Exception as e:\n db.log(\"ERROR\", f\"Embedding error: {e}\")\n raise\n finally:\n db.disconnect()\n\n\ndef search_similar(query, collection=\"documents\", limit=5):\n \"\"\"Search for similar documents in Qdrant.\"\"\"\n # Get query embedding\n embedding = get_embedding(query)\n if not embedding:\n return []\n\n try:\n response = requests.post(\n f\"http:\/\/{QDRANT_HOST}:{QDRANT_PORT}\/collections\/{collection}\/points\/search\",\n json={\"vector\": embedding, \"limit\": limit, \"with_payload\": True},\n headers={\"Content-Type\": \"application\/json\"},\n timeout=30,\n )\n response.raise_for_status()\n data = response.json()\n return data.get(\"result\", [])\n except Exception as e:\n db.log(\"ERROR\", f\"Qdrant search failed: {e}\")\n return []\n\n\nif __name__ == \"__main__\":\n import sys\n\n if len(sys.argv) > 1:\n query = \" \".join(sys.argv[1:])\n print(f\"Searching for: {query}\")\n print(\"-\" * 50)\n\n results = search_similar(query)\n for i, result in enumerate(results):\n print(f\"\\n[{i + 1}] Score: {result['score']:.4f}\")\n print(f\" Document: {result['payload'].get('document_title', 'Unknown')}\")\n print(f\" Content: {result['payload'].get('content', '')[:200]}...\")\n else:\n # Run pending embeddings\n count = embed_pending_chunks()\n print(f\"Embedded {count} chunks\")\n",
"structuredPatch": [
{
"oldStart": 94,
"oldLines": 7,
"newStart": 94,
"newLines": 7,
"lines": [
" \"document_id\": document_id,",
" \"document_title\": document_title,",
" \"chunk_index\": i,",
"- \"content\": chunk[\"content\"][:1000], # Truncate for payload",
"+ \"content\": chunk[\"content\"][:BATCH_LIMIT], # Truncate for payload",
" \"heading_path\": json.dumps(chunk.get(\"heading_path\", [])),",
" \"source_path\": source_path,",
" }"
]
}
],
"userModified": false,
"replaceAll": false
}
}