Log Entry #25041

ID: 25041
Timestamp: 2025-12-28 02:01:05.060702
Client: root
IP: 145.224.72.140
Model: claude-sonnet-4-20250514
Status: completed
Tokens: 2,692 (Input: 28, Output: 2,664)
Duration: 97 ms
Request time: 2025-12-28 02:01:05.060702
Response time: 2025-12-28 02:01:05.157290

Request

{
    "event": "PreToolUse",
    "tool_name": "Read",
    "tool_input": {
        "file_path": "\/var\/www\/scripts\/pipeline\/pipeline.py"
    }
}
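The PreToolUse payload above carries the tool name and its input before the Read call is executed. Below is a minimal sketch of a handler that consumes such an event and checks the requested path against an allowlist; the stdin delivery, the allowlist root, and the exit-code convention are assumptions for illustration, not taken from this log.

#!/usr/bin/env python3
# Hypothetical PreToolUse handler: parses the event JSON from stdin and
# rejects Read calls that point outside an assumed allowlist root.
import json
import sys
from pathlib import Path

ALLOWED_ROOT = Path("/var/www/scripts")  # assumed allowlist root


def main():
    event = json.load(sys.stdin)
    if event.get("event") == "PreToolUse" and event.get("tool_name") == "Read":
        target = Path(event["tool_input"]["file_path"]).resolve()
        if not target.is_relative_to(ALLOWED_ROOT):
            print(f"Read outside allowed root: {target}", file=sys.stderr)
            sys.exit(1)  # assumption: a non-zero exit tells the caller to block the tool call
    sys.exit(0)  # allow the tool call


if __name__ == "__main__":
    main()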

Response

{
    "tool_response": {
        "type": "text",
        "file": {
            "filePath": "\/var\/www\/scripts\/pipeline\/pipeline.py",
            "content": "#!\/usr\/bin\/env python3\n\"\"\"\nKI-System Document Pipeline\nMain orchestration script for document processing.\n\nUsage:\n    python pipeline.py scan             # Scan for new documents\n    python pipeline.py process          # Process queued documents\n    python pipeline.py embed            # Embed pending chunks\n    python pipeline.py semantic <id>    # Run semantic analysis on document\n    python pipeline.py semantic-queue   # Process semantic queue\n    python pipeline.py enrich-entities  # Enrich entity descriptions via Ollama\n    python pipeline.py enrich-entities 50  # Limit to 50 entities\n    python pipeline.py all              # Full pipeline run\n    python pipeline.py all --pipeline-id=1 --run-id=5  # With tracking\n    python pipeline.py file <path>      # Process single file\n    python pipeline.py status           # Show pipeline status\n\"\"\"\n\nimport argparse\nimport os\nimport time\nfrom pathlib import Path\n\nfrom config import (\n    MAX_RETRIES,\n    RETRY_BACKOFF_BASE,\n    SEMANTIC_AUTO_QUEUE,\n    SEMANTIC_SYNC,\n    SEMANTIC_USE_ANTHROPIC,\n)\nfrom constants import DEFAULT_LIMIT\nfrom db import PipelineProgress, db\nfrom detect import queue_files, scan_directory\nfrom step_embed import EmbeddingStep\nfrom step_entity_enrich import EntityEnrichStep\nfrom step_extract import ExtractionStep\nfrom step_load import LoadStep\nfrom step_semantic import SemanticStep\nfrom step_transform import TransformationStep\n\n\ndef process_file(file_path, progress=None):\n    \"\"\"Process a single file through the pipeline.\"\"\"\n    file_name = Path(file_path).name\n\n    if progress:\n        progress.update_document(file_name)\n\n    # Initialize pipeline steps\n    extract_step = ExtractionStep(db, progress)\n    load_step = LoadStep(db, progress)\n    transform_step = TransformationStep(db, progress)\n    embed_step = EmbeddingStep(db, progress)\n\n    # Check if cancelled before starting\n    if progress and progress.is_cancelled():\n        return \"cancelled\", 0, 0\n\n    # Step 1: Extract\n    extract_result = extract_step.execute(file_path)\n    if not extract_result[\"success\"]:\n        if extract_result.get(\"error\") == \"cancelled\":\n            return \"cancelled\", 0, 0\n        return False, 0, 0\n\n    extraction = extract_result[\"extraction\"]\n    file_info = extract_result[\"file_info\"]\n    total_pages = extract_result.get(\"total_pages\", 0)\n\n    # Check if cancelled after extraction\n    if progress and progress.is_cancelled():\n        return \"cancelled\", 0, 0\n\n    # Step 2: Load document\n    doc_id = load_step.create_document(file_info)\n\n    # Step 3: Store pages (PDFs and multi-page documents)\n    page_map = load_step.store_pages(doc_id, extraction)\n\n    # Step 4: Vision analysis (PDFs only)\n    if file_info[\"type\"] == \".pdf\":\n        transform_step.execute_vision(doc_id, file_path, file_info[\"type\"])\n\n        # Check if cancelled after vision\n        if progress and progress.is_cancelled():\n            return \"cancelled\", 0, 0\n\n    # Step 5: Chunking\n    chunks = transform_step.execute_chunking(extraction, total_pages)\n\n    # Step 6: Store chunks with page references\n    chunks = load_step.store_chunks(doc_id, chunks, page_map)\n\n    # Check if cancelled after chunking\n    if progress and progress.is_cancelled():\n        return \"cancelled\", len(chunks), 0\n\n    # Step 7: Enrichment (PDFs only)\n    if file_info[\"type\"] == \".pdf\":\n        
transform_step.execute_enrichment(doc_id, file_info[\"type\"])\n\n        # Check if cancelled after enrichment\n        if progress and progress.is_cancelled():\n            return \"cancelled\", len(chunks), 0\n\n    # Step 8: Embeddings (Layer 3 - Document becomes searchable)\n    embedded = embed_step.execute(chunks, doc_id, file_name, file_path)\n\n    # Document is now searchable - update status to \"embedded\"\n    load_step.update_document_status(doc_id, \"embedded\")\n\n    if progress:\n        progress.add_log(f\"Layer 3 fertig: {file_name} ist jetzt suchbar\")\n\n    # Check if cancelled after embedding\n    if progress and progress.is_cancelled():\n        return \"cancelled\", len(chunks), embedded\n\n    # Step 9: Semantic analysis (Layer 4 - Optional\/Async)\n    semantic_step = SemanticStep(db, progress)\n    full_text = extract_step.get_full_text_from_extraction(extraction)\n\n    if SEMANTIC_SYNC:\n        # Run semantic analysis synchronously\n        try:\n            semantic_step.execute(doc_id, full_text, use_anthropic=SEMANTIC_USE_ANTHROPIC)\n            # Update to done only after semantic completes\n            load_step.update_document_status(doc_id, \"done\")\n        except Exception as e:\n            # Semantic failed but document is still searchable\n            db.log(\"WARNING\", f\"Semantic analysis failed for {file_name}: {e}\")\n            if progress:\n                progress.add_log(f\"Semantik-Fehler (Dokument bleibt suchbar): {str(e)[:50]}\")\n    elif SEMANTIC_AUTO_QUEUE:\n        # Queue for async processing\n        semantic_step.queue(doc_id, priority=5)\n        load_step.update_document_status(doc_id, \"done\")\n        if progress:\n            progress.add_log(f\"Semantik in Queue: {file_name}\")\n    else:\n        # No semantic analysis\n        load_step.update_document_status(doc_id, \"done\")\n\n    if progress:\n        progress.add_log(f\"Fertig: {file_name}\")\n\n    return True, len(chunks), embedded\n\n\ndef process_queue():\n    \"\"\"Process items from the queue.\"\"\"\n    items = db.get_pending_queue_items(limit=10)\n    db.log(\"INFO\", f\"Found {len(items)} items in queue\")\n\n    for item in items:\n        queue_id = item[\"id\"]\n        file_path = item[\"file_path\"]\n        retry_count = item[\"retry_count\"]\n\n        if retry_count >= MAX_RETRIES:\n            db.update_queue_status(queue_id, \"failed\", \"Max retries exceeded\")\n            continue\n\n        db.update_queue_status(queue_id, \"processing\")\n\n        try:\n            success = process_file(file_path)\n            if success:\n                db.update_queue_status(queue_id, \"completed\")\n            else:\n                raise Exception(\"Processing returned False\")\n        except Exception as e:\n            error_msg = str(e)\n            db.update_queue_status(queue_id, \"pending\", error_msg)\n\n            # Exponential backoff\n            wait_time = RETRY_BACKOFF_BASE ** (retry_count + 1)\n            db.log(\"INFO\", f\"Retry {retry_count + 1} in {wait_time}s: {file_path}\")\n            time.sleep(wait_time)\n\n\ndef run_scan():\n    \"\"\"Scan for new documents.\"\"\"\n    files = scan_directory()\n    print(f\"Found {len(files)} files\")\n\n    if files:\n        queued = queue_files(files)\n        print(f\"Queued {queued} files\")\n\n    return files\n\n\ndef run_full_pipeline(run_id=None, pipeline_id=None):\n    \"\"\"Run complete pipeline: scan → process → embed.\"\"\"\n    progress = PipelineProgress(run_id) if run_id 
else None\n\n    print(\"=\" * 50)\n    print(\"KI-System Pipeline - Full Run\")\n    if run_id:\n        print(f\"Run ID: {run_id}, Pipeline ID: {pipeline_id}\")\n    print(\"=\" * 50)\n\n    try:\n        # Phase 1: Scan\n        if progress:\n            progress.update_step(\"detect\")\n            progress.add_log(\"Scanne nach Dokumenten...\")\n\n        print(\"\\n[1\/3] Scanning for documents...\")\n        files = scan_directory()\n        print(f\"Found {len(files)} files\")\n\n        if progress:\n            progress.add_log(f\"{len(files)} neue Dokumente gefunden\")\n\n        if files:\n            queued = queue_files(files)\n            print(f\"Queued {queued} files\")\n\n        # Phase 2: Process queue items (includes resume of previous runs)\n        items = db.get_pending_queue_items(limit=DEFAULT_LIMIT)\n        print(f\"\\n[2\/3] Processing {len(items)} documents...\")\n\n        if items:\n            # Update total with actual queue count (may include items from previous runs)\n            if progress:\n                progress.update_progress(total=len(items))\n                progress.add_log(f\"{len(items)} Dokumente in Queue\")\n\n            total_chunks = 0\n            total_embeddings = 0\n            processed = 0\n            failed = 0\n\n            for item in items:\n                # Check if cancelled\n                if progress and progress.is_cancelled():\n                    progress.add_log(\"Pipeline abgebrochen durch Benutzer\")\n                    progress.complete(\"cancelled\")\n                    print(\"\\nPipeline cancelled by user\")\n                    return\n\n                queue_id = item[\"id\"]\n                file_path = item[\"file_path\"]\n                file_name = Path(file_path).name\n\n                # Skip already-done documents (for resume capability)\n                if db.document_is_done(file_path):\n                    db.update_queue_status(queue_id, \"completed\")\n                    processed += 1\n                    if progress:\n                        progress.add_log(f\"Übersprungen (bereits fertig): {file_name}\")\n                        progress.update_progress(processed=processed)\n                    continue\n\n                if progress:\n                    progress.update_document(file_name)\n\n                db.update_queue_status(queue_id, \"processing\")\n\n                try:\n                    result = process_file(file_path, progress)\n                    success, chunks, embedded = result if isinstance(result, tuple) else (result, 0, 0)\n\n                    # Handle cancellation during file processing\n                    if success == \"cancelled\":\n                        progress.add_log(\"Pipeline abgebrochen durch Benutzer\")\n                        progress.complete(\"cancelled\")\n                        print(\"\\nPipeline cancelled by user\")\n                        return\n\n                    if success:\n                        db.update_queue_status(queue_id, \"completed\")\n                        processed += 1\n                        total_chunks += chunks\n                        total_embeddings += embedded\n                    else:\n                        db.update_queue_status(queue_id, \"failed\", \"Processing failed\")\n                        failed += 1\n                except Exception as e:\n                ... [TRUNCATED-a052b02da8d7633a]",
            "numLines": 489,
            "startLine": 1,
            "totalLines": 489
        }
    }
}
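The process_queue section of the returned script retries failed items with exponential backoff, waiting RETRY_BACKOFF_BASE ** (retry_count + 1) seconds before the next attempt and marking an item as failed once MAX_RETRIES is reached. The following is a self-contained sketch that condenses that retry idea into one loop; MAX_RETRIES and RETRY_BACKOFF_BASE are placeholder values here, since the real constants live in the project's config module and the real pipeline re-queues items via database status rather than looping in-process.

#!/usr/bin/env python3
# Standalone sketch of the exponential-backoff retry logic from pipeline.py.
import time

MAX_RETRIES = 3          # assumed value; real value comes from config
RETRY_BACKOFF_BASE = 2   # assumed value; real value comes from config


def retry_with_backoff(process, item):
    """Try to process one queue item, backing off exponentially on failure."""
    for retry_count in range(MAX_RETRIES):
        try:
            if process(item):
                return "completed"
            raise RuntimeError("Processing returned False")
        except Exception as exc:
            wait_time = RETRY_BACKOFF_BASE ** (retry_count + 1)
            print(f"Retry {retry_count + 1} in {wait_time}s: {item} ({exc})")
            time.sleep(wait_time)
    return "failed"  # max retries exceeded


if __name__ == "__main__":
    # Example: a flaky processor that succeeds on the second attempt.
    attempts = {"n": 0}

    def flaky(_item):
        attempts["n"] += 1
        return attempts["n"] >= 2

    print(retry_with_backoff(flaky, "/var/www/scripts/pipeline/example.pdf"))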