Protokoll #23915

ID	23915
Zeitstempel	2025-12-27 15:33:35.862873
Client	root
IP	Warning: Undefined array key "request_ip" in /var/www/dev.campus.systemische-tools.de/src/View/protokoll/show.php on line 13 Deprecated: htmlspecialchars(): Passing null to parameter #1 ($string) of type string is deprecated in /var/www/dev.campus.systemische-tools.de/src/View/protokoll/show.php on line 13
Modell	claude-sonnet-4-20250514
Status	completed
Tokens	2,035 (Input: 0, Output: 0)
Dauer	110 ms
Request-Zeit	Warning: Undefined array key "request_timestamp" in /var/www/dev.campus.systemische-tools.de/src/View/protokoll/show.php on line 18
Response-Zeit	-

Warning: Undefined array key "error_message" in /var/www/dev.campus.systemische-tools.de/src/View/protokoll/show.php on line 22

Request

{
    "event": "PreToolUse",
    "tool_name": "Edit",
    "tool_input": {
        "file_path": "\/var\/www\/scripts\/pipeline\/embed.py",
        "old_string": "            \"content\": chunk[\"content\"][:1000],  # Truncate for payload",
        "new_string": "            \"content\": chunk[\"content\"][:BATCH_LIMIT],  # Truncate for payload"
    }
}

Response

{
    "tool_response": {
        "filePath": "\/var\/www\/scripts\/pipeline\/embed.py",
        "oldString": "            \"content\": chunk[\"content\"][:1000],  # Truncate for payload",
        "newString": "            \"content\": chunk[\"content\"][:BATCH_LIMIT],  # Truncate for payload",
        "originalFile": "\"\"\"\nEmbedding generation for KI-System Pipeline\nUses Ollama (mxbai-embed-large) for vector embeddings.\n\"\"\"\n\nimport json\nimport re\nimport uuid\n\nimport requests\n\nfrom config import EMBEDDING_DIMENSION, EMBEDDING_MODEL, OLLAMA_HOST, QDRANT_HOST, QDRANT_PORT\nfrom constants import BATCH_LIMIT, DEFAULT_LIMIT, OLLAMA_TIMEOUT\nfrom db import db\n\n# Max chars for mxbai-embed model (512 token context, varies by content)\n# Conservative limit to handle German compound words and special chars\nMAX_EMBED_CHARS = 800\n\n\ndef get_embedding(text):\n    \"\"\"Get embedding vector from Ollama.\"\"\"\n    # Skip empty content\n    if not text or not text.strip():\n        return None\n\n    # Collapse consecutive dots\/periods (table of contents, etc.)\n    text = re.sub(r\"\\.{3,}\", \"...\", text)\n\n    # Truncate if too long for model context\n    if len(text) > MAX_EMBED_CHARS:\n        text = text[:MAX_EMBED_CHARS]\n\n    try:\n        response = requests.post(\n            f\"{OLLAMA_HOST}\/api\/embeddings\",\n            json={\"model\": EMBEDDING_MODEL, \"prompt\": text},\n            timeout=OLLAMA_TIMEOUT,\n        )\n        response.raise_for_status()\n        data = response.json()\n        return data.get(\"embedding\")\n    except Exception as e:\n        db.log(\"ERROR\", f\"Embedding generation failed: {e}\")\n        return None\n\n\ndef store_in_qdrant(collection, point_id, vector, payload):\n    \"\"\"Store embedding in Qdrant.\"\"\"\n    try:\n        response = requests.put(\n            f\"http:\/\/{QDRANT_HOST}:{QDRANT_PORT}\/collections\/{collection}\/points\",\n            json={\"points\": [{\"id\": point_id, \"vector\": vector, \"payload\": payload}]},\n            headers={\"Content-Type\": \"application\/json\"},\n            timeout=30,\n        )\n        response.raise_for_status()\n        return True\n    except Exception as e:\n        db.log(\"ERROR\", f\"Qdrant storage failed: {e}\")\n        return False\n\n\ndef embed_chunks(chunks, document_id, document_title, source_path, progress=None):\n    \"\"\"\n    Generate embeddings for chunks and store in Qdrant.\n    Returns number of successfully embedded chunks.\n    \"\"\"\n    embedded_count = 0\n    total_chunks = len(chunks)\n\n    for i, chunk in enumerate(chunks):\n        # Log every chunk for full visibility\n        if progress:\n            progress.add_log(f\"Embed: Chunk {i + 1}\/{total_chunks}...\")\n\n        # Generate embedding\n        embedding = get_embedding(chunk[\"content\"])\n\n        if not embedding:\n            db.log(\"WARNING\", f\"Failed to embed chunk {i} of document {document_id}\")\n            continue\n\n        # Verify dimension\n        if len(embedding) != EMBEDDING_DIMENSION:\n            db.log(\"ERROR\", f\"Wrong embedding dimension: {len(embedding)} vs {EMBEDDING_DIMENSION}\")\n            continue\n\n        # Generate UUID for Qdrant\n        point_id = str(uuid.uuid4())\n\n        # Prepare payload\n        payload = {\n            \"document_id\": document_id,\n            \"document_title\": document_title,\n            \"chunk_index\": i,\n            \"content\": chunk[\"content\"][:1000],  # Truncate for payload\n            \"heading_path\": json.dumps(chunk.get(\"heading_path\", [])),\n            \"source_path\": source_path,\n        }\n\n        # Add any chunk metadata\n        if chunk.get(\"metadata\"):\n            for key, value in chunk[\"metadata\"].items():\n                if isinstance(value, (str, int, float, bool)):\n                    payload[key] = value\n\n        # Store in Qdrant\n        if store_in_qdrant(\"documents\", point_id, embedding, payload):\n            # Update chunk in database with Qdrant ID\n            db.update_chunk_qdrant_id(chunk.get(\"db_id\"), point_id)\n            embedded_count += 1\n            db.log(\"INFO\", f\"Embedded chunk {i}\/{len(chunks)}\", f\"doc={document_id}\")\n        else:\n            db.log(\"ERROR\", f\"Failed to store chunk {i} in Qdrant\")\n\n    return embedded_count\n\n\ndef embed_pending_chunks(limit=100):\n    \"\"\"Process chunks that haven't been embedded yet.\"\"\"\n    db.connect()\n\n    try:\n        chunks = db.get_chunks_for_embedding(limit)\n        db.log(\"INFO\", f\"Found {len(chunks)} chunks to embed\")\n\n        if not chunks:\n            return 0\n\n        embedded = 0\n        for chunk in chunks:\n            embedding = get_embedding(chunk[\"content\"])\n\n            if not embedding:\n                continue\n\n            point_id = str(uuid.uuid4())\n\n            # Get document info\n            cursor = db.execute(\"SELECT filename, source_path FROM documents WHERE id = %s\", (chunk[\"document_id\"],))\n            doc = cursor.fetchone()\n            cursor.close()\n\n            payload = {\n                \"document_id\": chunk[\"document_id\"],\n                \"document_title\": doc[\"filename\"] if doc else \"\",\n                \"chunk_id\": chunk[\"id\"],\n                \"content\": chunk[\"content\"][:1000],\n                \"source_path\": doc[\"source_path\"] if doc else \"\",\n            }\n\n            if store_in_qdrant(\"documents\", point_id, embedding, payload):\n                db.update_chunk_qdrant_id(chunk[\"id\"], point_id)\n                embedded += 1\n\n        db.log(\"INFO\", f\"Embedded {embedded}\/{len(chunks)} chunks\")\n        return embedded\n\n    except Exception as e:\n        db.log(\"ERROR\", f\"Embedding error: {e}\")\n        raise\n    finally:\n        db.disconnect()\n\n\ndef search_similar(query, collection=\"documents\", limit=5):\n    \"\"\"Search for similar documents in Qdrant.\"\"\"\n    # Get query embedding\n    embedding = get_embedding(query)\n    if not embedding:\n        return []\n\n    try:\n        response = requests.post(\n            f\"http:\/\/{QDRANT_HOST}:{QDRANT_PORT}\/collections\/{collection}\/points\/search\",\n            json={\"vector\": embedding, \"limit\": limit, \"with_payload\": True},\n            headers={\"Content-Type\": \"application\/json\"},\n            timeout=30,\n        )\n        response.raise_for_status()\n        data = response.json()\n        return data.get(\"result\", [])\n    except Exception as e:\n        db.log(\"ERROR\", f\"Qdrant search failed: {e}\")\n        return []\n\n\nif __name__ == \"__main__\":\n    import sys\n\n    if len(sys.argv) > 1:\n        query = \" \".join(sys.argv[1:])\n        print(f\"Searching for: {query}\")\n        print(\"-\" * 50)\n\n        results = search_similar(query)\n        for i, result in enumerate(results):\n            print(f\"\\n[{i + 1}] Score: {result['score']:.4f}\")\n            print(f\"    Document: {result['payload'].get('document_title', 'Unknown')}\")\n            print(f\"    Content: {result['payload'].get('content', '')[:200]}...\")\n    else:\n        # Run pending embeddings\n        count = embed_pending_chunks()\n        print(f\"Embedded {count} chunks\")\n",
        "structuredPatch": [
            {
                "oldStart": 94,
                "oldLines": 7,
                "newStart": 94,
                "newLines": 7,
                "lines": [
                    "             \"document_id\": document_id,",
                    "             \"document_title\": document_title,",
                    "             \"chunk_index\": i,",
                    "-            \"content\": chunk[\"content\"][:1000],  # Truncate for payload",
                    "+            \"content\": chunk[\"content\"][:BATCH_LIMIT],  # Truncate for payload",
                    "             \"heading_path\": json.dumps(chunk.get(\"heading_path\", [])),",
                    "             \"source_path\": source_path,",
                    "         }"
                ]
            }
        ],
        "userModified": false,
        "replaceAll": false
    }
}

← Vorheriger Zur Liste Nächster →