Protokoll #18890

ID	18890
Zeitstempel	2025-12-26 22:39:48.454933
Client	root
IP	145.224.72.140
Modell	claude-sonnet-4-20250514
Status	completed
Tokens	5,373 (Input: 1,421, Output: 3,952)
Dauer	87 ms
Request-Zeit	2025-12-26 22:39:48.454933
Response-Zeit	2025-12-26 22:39:48.541970

Request

{
    "event": "PreToolUse",
    "tool_name": "Write",
    "tool_input": {
        "file_path": "\/var\/www\/scripts\/pipeline\/run_demo.py",
        "content": "#!\/usr\/bin\/env python3\n\"\"\"\nVerbose Pipeline Runner for demo.pdf\nShows every step with timing.\nUses existing db.py methods for correct schema handling.\n\"\"\"\n\nimport json\nimport os\nimport sys\nimport time\n\nimport requests\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\n\nfrom db import db\nfrom analyzers.entity_extractor import extract_entities_ollama, store_entities, _build_prompt_from_yaml\n\n\ndef log(msg):\n    print(f\"[{time.strftime('%H:%M:%S')}] {msg}\", flush=True)\n\n\ndef main():\n    log(\"=== START ===\")\n\n    # 1. DB Connect\n    log(\"1. DB verbinden...\")\n    start = time.time()\n    db.connect()\n    log(f\"   OK ({time.time()-start:.1f}s)\")\n\n    # 2. DB Reset\n    log(\"2. DB Reset (Tabellen leeren)...\")\n    start = time.time()\n    tables_order = [\n        \"entity_relations\",\n        \"chunk_entities\",\n        \"document_entities\",\n        \"chunk_semantics\",\n        \"chunk_taxonomy\",\n        \"document_taxonomy\",\n        \"document_pages\",\n        \"entities\",\n        \"chunks\",\n        \"documents\",\n    ]\n    for table in tables_order:\n        try:\n            db.execute(f\"DELETE FROM {table}\")\n            db.commit()\n            log(f\"   {table}: OK\")\n        except Exception as e:\n            log(f\"   {table}: skip ({e})\")\n    log(f\"   DB Reset done ({time.time()-start:.1f}s)\")\n\n    # 3. Qdrant Reset\n    log(\"3. Qdrant Reset...\")\n    start = time.time()\n    try:\n        resp = requests.post(\n            \"http:\/\/localhost:6333\/collections\/documents\/points\/delete\",\n            json={\"filter\": {\"must\": []}},\n            timeout=10\n        )\n        log(f\"   Qdrant: {resp.status_code} ({time.time()-start:.1f}s)\")\n    except Exception as e:\n        log(f\"   Qdrant: {e}\")\n\n    # 4. PDF laden\n    log(\"4. PDF laden...\")\n    start = time.time()\n    from extract import extract_pdf\n    file_path = \"\/var\/www\/nextcloud\/data\/root\/files\/Documents\/demo.pdf\"\n    pages = extract_pdf(file_path)\n    text = \"\\n\\n\".join(p[\"text\"] for p in pages)\n    log(f\"   OK: {len(text)} chars, {len(pages)} pages ({time.time()-start:.1f}s)\")\n\n    # 5. Document erstellen (nutze db.create_document)\n    log(\"5. Document in DB erstellen...\")\n    start = time.time()\n    doc_id = db.create_document(\n        file_path=file_path,\n        title=os.path.basename(file_path),\n        file_type=\"application\/pdf\",\n        file_size=os.path.getsize(file_path),\n        file_hash=\"demo_test\"\n    )\n    log(f\"   OK: doc_id={doc_id} ({time.time()-start:.1f}s)\")\n\n    # 6. Chunking\n    log(\"6. Text chunken...\")\n    start = time.time()\n    from chunk import chunk_pdf\n    chunks = chunk_pdf(pages)\n    log(f\"   OK: {len(chunks)} chunks ({time.time()-start:.1f}s)\")\n\n    # 7. Chunks speichern (nutze db.insert_chunk)\n    log(\"7. Chunks in DB speichern...\")\n    start = time.time()\n    chunk_ids = []\n    chunk_texts = []\n    for i, chunk in enumerate(chunks):\n        content = chunk[\"content\"]\n        heading_path = json.dumps(chunk.get(\"heading_path\", []))\n        metadata = json.dumps(chunk.get(\"metadata\", {}))\n\n        chunk_id = db.insert_chunk(\n            doc_id=doc_id,\n            chunk_index=i,\n            content=content,\n            heading_path=heading_path,\n            metadata=metadata\n        )\n        chunk_ids.append(chunk_id)\n        chunk_texts.append(content)\n        log(f\"   Chunk {i+1}: {len(content)} chars -> id={chunk_id}\")\n    log(f\"   OK: {len(chunk_ids)} chunks gespeichert ({time.time()-start:.1f}s)\")\n\n    # 8. YAML Prompt laden\n    log(\"8. YAML Prompt aus DB laden...\")\n    start = time.time()\n    prompt_data = db.get_prompt_by_use_case(\"entity_extraction\")\n    if prompt_data:\n        log(f\"   OK: Prompt geladen ({time.time()-start:.1f}s)\")\n        sample_prompt = _build_prompt_from_yaml(prompt_data[\"content\"], \"SAMPLE\")\n        log(f\"   Prompt-Preview:\\n{sample_prompt[:400]}...\")\n    else:\n        log(\"   WARNUNG: Kein Prompt gefunden!\")\n\n    # 9. Entity Extraction pro Chunk\n    log(\"9. Entity Extraction (Ollama)...\")\n    all_entities = []\n    for i, (chunk_id, chunk_text) in enumerate(zip(chunk_ids, chunk_texts)):\n        log(f\"   Chunk {i+1}\/{len(chunk_texts)}: {len(chunk_text)} chars...\")\n        start = time.time()\n        entities = extract_entities_ollama(chunk_text)\n        log(f\"      -> {len(entities)} entities ({time.time()-start:.1f}s)\")\n        for e in entities[:5]:\n            log(f\"         - {e['name']} ({e['type']})\")\n        if len(entities) > 5:\n            log(f\"         ... und {len(entities)-5} weitere\")\n        all_entities.extend(entities)\n\n    # 10. Entities speichern (nutze store_entities)\n    log(\"10. Entities in DB speichern...\")\n    start = time.time()\n    stored = store_entities(doc_id, all_entities)\n    log(f\"   OK: {stored} entities gespeichert ({time.time()-start:.1f}s)\")\n\n    # 11. Zusammenfassung\n    log(\"=== ERGEBNIS ===\")\n    cursor = db.execute(\"SELECT type, COUNT(*) as cnt FROM entities GROUP BY type ORDER BY cnt DESC\")\n    for row in cursor.fetchall():\n        log(f\"   {row['type']}: {row['cnt']}\")\n    cursor.close()\n\n    cursor = db.execute(\"SELECT COUNT(*) as cnt FROM entities\")\n    total = cursor.fetchone()[\"cnt\"]\n    cursor.close()\n    log(f\"   TOTAL: {total} entities\")\n\n    db.disconnect()\n    log(\"=== DONE ===\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
    }
}

Response

{
    "tool_response": {
        "type": "update",
        "filePath": "\/var\/www\/scripts\/pipeline\/run_demo.py",
        "content": "#!\/usr\/bin\/env python3\n\"\"\"\nVerbose Pipeline Runner for demo.pdf\nShows every step with timing.\nUses existing db.py methods for correct schema handling.\n\"\"\"\n\nimport json\nimport os\nimport sys\nimport time\n\nimport requests\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\n\nfrom db import db\nfrom analyzers.entity_extractor import extract_entities_ollama, store_entities, _build_prompt_from_yaml\n\n\ndef log(msg):\n    print(f\"[{time.strftime('%H:%M:%S')}] {msg}\", flush=True)\n\n\ndef main():\n    log(\"=== START ===\")\n\n    # 1. DB Connect\n    log(\"1. DB verbinden...\")\n    start = time.time()\n    db.connect()\n    log(f\"   OK ({time.time()-start:.1f}s)\")\n\n    # 2. DB Reset\n    log(\"2. DB Reset (Tabellen leeren)...\")\n    start = time.time()\n    tables_order = [\n        \"entity_relations\",\n        \"chunk_entities\",\n        \"document_entities\",\n        \"chunk_semantics\",\n        \"chunk_taxonomy\",\n        \"document_taxonomy\",\n        \"document_pages\",\n        \"entities\",\n        \"chunks\",\n        \"documents\",\n    ]\n    for table in tables_order:\n        try:\n            db.execute(f\"DELETE FROM {table}\")\n            db.commit()\n            log(f\"   {table}: OK\")\n        except Exception as e:\n            log(f\"   {table}: skip ({e})\")\n    log(f\"   DB Reset done ({time.time()-start:.1f}s)\")\n\n    # 3. Qdrant Reset\n    log(\"3. Qdrant Reset...\")\n    start = time.time()\n    try:\n        resp = requests.post(\n            \"http:\/\/localhost:6333\/collections\/documents\/points\/delete\",\n            json={\"filter\": {\"must\": []}},\n            timeout=10\n        )\n        log(f\"   Qdrant: {resp.status_code} ({time.time()-start:.1f}s)\")\n    except Exception as e:\n        log(f\"   Qdrant: {e}\")\n\n    # 4. PDF laden\n    log(\"4. PDF laden...\")\n    start = time.time()\n    from extract import extract_pdf\n    file_path = \"\/var\/www\/nextcloud\/data\/root\/files\/Documents\/demo.pdf\"\n    pages = extract_pdf(file_path)\n    text = \"\\n\\n\".join(p[\"text\"] for p in pages)\n    log(f\"   OK: {len(text)} chars, {len(pages)} pages ({time.time()-start:.1f}s)\")\n\n    # 5. Document erstellen (nutze db.create_document)\n    log(\"5. Document in DB erstellen...\")\n    start = time.time()\n    doc_id = db.create_document(\n        file_path=file_path,\n        title=os.path.basename(file_path),\n        file_type=\"application\/pdf\",\n        file_size=os.path.getsize(file_path),\n        file_hash=\"demo_test\"\n    )\n    log(f\"   OK: doc_id={doc_id} ({time.time()-start:.1f}s)\")\n\n    # 6. Chunking\n    log(\"6. Text chunken...\")\n    start = time.time()\n    from chunk import chunk_pdf\n    chunks = chunk_pdf(pages)\n    log(f\"   OK: {len(chunks)} chunks ({time.time()-start:.1f}s)\")\n\n    # 7. Chunks speichern (nutze db.insert_chunk)\n    log(\"7. Chunks in DB speichern...\")\n    start = time.time()\n    chunk_ids = []\n    chunk_texts = []\n    for i, chunk in enumerate(chunks):\n        content = chunk[\"content\"]\n        heading_path = json.dumps(chunk.get(\"heading_path\", []))\n        metadata = json.dumps(chunk.get(\"metadata\", {}))\n\n        chunk_id = db.insert_chunk(\n            doc_id=doc_id,\n            chunk_index=i,\n            content=content,\n            heading_path=heading_path,\n            metadata=metadata\n        )\n        chunk_ids.append(chunk_id)\n        chunk_texts.append(content)\n        log(f\"   Chunk {i+1}: {len(content)} chars -> id={chunk_id}\")\n    log(f\"   OK: {len(chunk_ids)} chunks gespeichert ({time.time()-start:.1f}s)\")\n\n    # 8. YAML Prompt laden\n    log(\"8. YAML Prompt aus DB laden...\")\n    start = time.time()\n    prompt_data = db.get_prompt_by_use_case(\"entity_extraction\")\n    if prompt_data:\n        log(f\"   OK: Prompt geladen ({time.time()-start:.1f}s)\")\n        sample_prompt = _build_prompt_from_yaml(prompt_data[\"content\"], \"SAMPLE\")\n        log(f\"   Prompt-Preview:\\n{sample_prompt[:400]}...\")\n    else:\n        log(\"   WARNUNG: Kein Prompt gefunden!\")\n\n    # 9. Entity Extraction pro Chunk\n    log(\"9. Entity Extraction (Ollama)...\")\n    all_entities = []\n    for i, (chunk_id, chunk_text) in enumerate(zip(chunk_ids, chunk_texts)):\n        log(f\"   Chunk {i+1}\/{len(chunk_texts)}: {len(chunk_text)} chars...\")\n        start = time.time()\n        entities = extract_entities_ollama(chunk_text)\n        log(f\"      -> {len(entities)} entities ({time.time()-start:.1f}s)\")\n        for e in entities[:5]:\n            log(f\"         - {e['name']} ({e['type']})\")\n        if len(entities) > 5:\n            log(f\"         ... und {len(entities)-5} weitere\")\n        all_entities.extend(entities)\n\n    # 10. Entities speichern (nutze store_entities)\n    log(\"10. Entities in DB speichern...\")\n    start = time.time()\n    stored = store_entities(doc_id, all_entities)\n    log(f\"   OK: {stored} entities gespeichert ({time.time()-start:.1f}s)\")\n\n    # 11. Zusammenfassung\n    log(\"=== ERGEBNIS ===\")\n    cursor = db.execute(\"SELECT type, COUNT(*) as cnt FROM entities GROUP BY type ORDER BY cnt DESC\")\n    for row in cursor.fetchall():\n        log(f\"   {row['type']}: {row['cnt']}\")\n    cursor.close()\n\n    cursor = db.execute(\"SELECT COUNT(*) as cnt FROM entities\")\n    total = cursor.fetchone()[\"cnt\"]\n    cursor.close()\n    log(f\"   TOTAL: {total} entities\")\n\n    db.disconnect()\n    log(\"=== DONE ===\")\n\n\nif __name__ == \"__main__\":\n    main()\n",
        "structuredPatch": [
            {
                "oldStart": 2,
                "oldLines": 8,
                "newStart": 2,
                "newLines": 10,
                "lines": [
                    " \"\"\"",
                    " Verbose Pipeline Runner for demo.pdf",
                    " Shows every step with timing.",
                    "+Uses existing db.py methods for correct schema handling.",
                    " \"\"\"",
                    " ",
                    "+import json",
                    " import os",
                    " import sys",
                    " import time"
                ]
            },
            {
                "oldStart": 57,
                "oldLines": 7,
                "newStart": 59,
                "newLines": 6,
                "lines": [
                    "     log(\"3. Qdrant Reset...\")",
                    "     start = time.time()",
                    "     try:",
                    "-        # Delete all points",
                    "         resp = requests.post(",
                    "             \"http:\/\/localhost:6333\/collections\/documents\/points\/delete\",",
                    "             json={\"filter\": {\"must\": []}},"
                ]
            },
            {
                "oldStart": 76,
                "oldLines": 16,
                "newStart": 77,
                "newLines": 16,
                "lines": [
                    "     text = \"\\n\\n\".join(p[\"text\"] for p in pages)",
                    "     log(f\"   OK: {len(text)} chars, {len(pages)} pages ({time.time()-start:.1f}s)\")",
                    " ",
                    "-    # 5. Document erstellen",
                    "+    # 5. Document erstellen (nutze db.create_document)",
                    "     log(\"5. Document in DB erstellen...\")",
                    "     start = time.time()",
                    "-    cursor = db.execute(",
                    "-        \"INSERT INTO documents (source_path, folder_path, filename, status) VALUES (%s, %s, %s, 'processing')\",",
                    "-        (file_path, os.path.dirname(file_path), os.path.basename(file_path))",
                    "+    doc_id = db.create_document(",
                    "+        file_path=file_path,",
                    "+        title=os.path.basename(file_path),",
                    "+        file_type=\"application\/pdf\",",
                    "+        file_size=os.path.getsize(file_path),",
                    "+        file_hash=\"demo_test\"",
                    "     )",
                    "-    db.commit()",
                    "-    doc_id = cursor.lastrowid",
                    "-    cursor.close()",
                    "     log(f\"   OK: doc_id={doc_id} ({time.time()-start:.1f}s)\")",
                    " ",
                    "     # 6. Chunking"
                ]
            },
            {
                "oldStart": 95,
                "oldLines": 21,
                "newStart": 96,
                "newLines": 26,
                "lines": [
                    "     chunks = chunk_pdf(pages)",
                    "     log(f\"   OK: {len(chunks)} chunks ({time.time()-start:.1f}s)\")",
                    " ",
                    "-    # 7. Chunks speichern",
                    "+    # 7. Chunks speichern (nutze db.insert_chunk)",
                    "     log(\"7. Chunks in DB speichern...\")",
                    "     start = time.time()",
                    "     chunk_ids = []",
                    "     chunk_texts = []",
                    "     for i, chunk in enumerate(chunks):",
                    "         content = chunk[\"content\"]",
                    "-        cursor = db.execute(",
                    "-            \"INSERT INTO chunks (document_id, chunk_index, content, char_count, created_at) VALUES (%s, %s, %s, %s, NOW())\",",
                    "-            (doc_id, i, content, len(content))",
                    "+        heading_path = json.dumps(chunk.get(\"heading_path\", []))",
                    "+        metadata = json.dumps(chunk.get(\"metadata\", {}))",
                    "+",
                    "+        chunk_id = db.insert_chunk(",
                    "+            doc_id=doc_id,",
                    "+            chunk_index=i,",
                    "+            content=content,",
                    "+            heading_path=heading_path,",
                    "+            metadata=metadata",
                    "         )",
                    "-        db.commit()",
                    "-        chunk_ids.append(cursor.lastrowid)",
                    "+        chunk_ids.append(chunk_id)",
                    "         chunk_texts.append(content)",
                    "-        cursor.close()",
                    "+        log(f\"   Chunk {i+1}: {len(content)} chars -> id={chunk_id}\")",
                    "     log(f\"   OK: {len(chunk_ids)} chunks gespeichert ({time.time()-start:.1f}s)\")",
                    " ",
                    "     # 8. YAML Prompt laden"
                ]
            },
            {
                "oldStart": 118,
                "oldLines": 9,
                "newStart": 124,
                "newLines": 8,
                "lines": [
                    "     prompt_data = db.get_prompt_by_use_case(\"entity_extraction\")",
                    "     if prompt_data:",
                    "         log(f\"   OK: Prompt geladen ({time.time()-start:.1f}s)\")",
                    "-        # Zeige generierten Prompt",
                    "-        sample_prompt = _build_prompt_from_yaml(prompt_data[\"content\"], \"SAMPLE TEXT\")",
                    "-        log(f\"   Prompt-Struktur:\\n{sample_prompt[:300]}...\")",
                    "+        sample_prompt = _build_prompt_from_yaml(prompt_data[\"content\"], \"SAMPLE\")",
                    "+        log(f\"   Prompt-Preview:\\n{sample_prompt[:400]}...\")",
                    "     else:",
                    "         log(\"   WARNUNG: Kein Prompt gefunden!\")",
                    " "
                ]
            },
            {
                "oldStart": 138,
                "oldLines": 19,
                "newStart": 143,
                "newLines": 24,
                "lines": [
                    "             log(f\"         ... und {len(entities)-5} weitere\")",
                    "         all_entities.extend(entities)",
                    " ",
                    "-    # 10. Entities speichern",
                    "+    # 10. Entities speichern (nutze store_entities)",
                    "     log(\"10. Entities in DB speichern...\")",
                    "     start = time.time()",
                    "     stored = store_entities(doc_id, all_entities)",
                    "     log(f\"   OK: {stored} entities gespeichert ({time.time()-start:.1f}s)\")",
                    " ",
                    "-    # 9. Zusammenfassung",
                    "+    # 11. Zusammenfassung",
                    "     log(\"=== ERGEBNIS ===\")",
                    "     cursor = db.execute(\"SELECT type, COUNT(*) as cnt FROM entities GROUP BY type ORDER BY cnt DESC\")",
                    "     for row in cursor.fetchall():",
                    "         log(f\"   {row['type']}: {row['cnt']}\")",
                    "     cursor.close()",
                    " ",
                    "+    cursor = db.execute(\"SELECT COUNT(*) as cnt FROM entities\")",
                    "+    total = cursor.fetchone()[\"cnt\"]",
                    "+    cursor.close()",
                    "+    log(f\"   TOTAL: {total} entities\")",
                    "+",
                    "     db.disconnect()",
                    "     log(\"=== DONE ===\")",
                    " "
                ]
            }
        ],
        "originalFile": "#!\/usr\/bin\/env python3\n\"\"\"\nVerbose Pipeline Runner for demo.pdf\nShows every step with timing.\n\"\"\"\n\nimport os\nimport sys\nimport time\n\nimport requests\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\n\nfrom db import db\nfrom analyzers.entity_extractor import extract_entities_ollama, store_entities, _build_prompt_from_yaml\n\n\ndef log(msg):\n    print(f\"[{time.strftime('%H:%M:%S')}] {msg}\", flush=True)\n\n\ndef main():\n    log(\"=== START ===\")\n\n    # 1. DB Connect\n    log(\"1. DB verbinden...\")\n    start = time.time()\n    db.connect()\n    log(f\"   OK ({time.time()-start:.1f}s)\")\n\n    # 2. DB Reset\n    log(\"2. DB Reset (Tabellen leeren)...\")\n    start = time.time()\n    tables_order = [\n        \"entity_relations\",\n        \"chunk_entities\",\n        \"document_entities\",\n        \"chunk_semantics\",\n        \"chunk_taxonomy\",\n        \"document_taxonomy\",\n        \"document_pages\",\n        \"entities\",\n        \"chunks\",\n        \"documents\",\n    ]\n    for table in tables_order:\n        try:\n            db.execute(f\"DELETE FROM {table}\")\n            db.commit()\n            log(f\"   {table}: OK\")\n        except Exception as e:\n            log(f\"   {table}: skip ({e})\")\n    log(f\"   DB Reset done ({time.time()-start:.1f}s)\")\n\n    # 3. Qdrant Reset\n    log(\"3. Qdrant Reset...\")\n    start = time.time()\n    try:\n        # Delete all points\n        resp = requests.post(\n            \"http:\/\/localhost:6333\/collections\/documents\/points\/delete\",\n            json={\"filter\": {\"must\": []}},\n            timeout=10\n        )\n        log(f\"   Qdrant: {resp.status_code} ({time.time()-start:.1f}s)\")\n    except Exception as e:\n        log(f\"   Qdrant: {e}\")\n\n    # 4. PDF laden\n    log(\"4. PDF laden...\")\n    start = time.time()\n    from extract import extract_pdf\n    file_path = \"\/var\/www\/nextcloud\/data\/root\/files\/Documents\/demo.pdf\"\n    pages = extract_pdf(file_path)\n    text = \"\\n\\n\".join(p[\"text\"] for p in pages)\n    log(f\"   OK: {len(text)} chars, {len(pages)} pages ({time.time()-start:.1f}s)\")\n\n    # 5. Document erstellen\n    log(\"5. Document in DB erstellen...\")\n    start = time.time()\n    cursor = db.execute(\n        \"INSERT INTO documents (source_path, folder_path, filename, status) VALUES (%s, %s, %s, 'processing')\",\n        (file_path, os.path.dirname(file_path), os.path.basename(file_path))\n    )\n    db.commit()\n    doc_id = cursor.lastrowid\n    cursor.close()\n    log(f\"   OK: doc_id={doc_id} ({time.time()-start:.1f}s)\")\n\n    # 6. Chunking\n    log(\"6. Text chunken...\")\n    start = time.time()\n    from chunk import chunk_pdf\n    chunks = chunk_pdf(pages)\n    log(f\"   OK: {len(chunks)} chunks ({time.time()-start:.1f}s)\")\n\n    # 7. Chunks speichern\n    log(\"7. Chunks in DB speichern...\")\n    start = time.time()\n    chunk_ids = []\n    chunk_texts = []\n    for i, chunk in enumerate(chunks):\n        content = chunk[\"content\"]\n        cursor = db.execute(\n            \"INSERT INTO chunks (document_id, chunk_index, content, char_count, created_at) VALUES (%s, %s, %s, %s, NOW())\",\n            (doc_id, i, content, len(content))\n        )\n        db.commit()\n        chunk_ids.append(cursor.lastrowid)\n        chunk_texts.append(content)\n        cursor.close()\n    log(f\"   OK: {len(chunk_ids)} chunks gespeichert ({time.time()-start:.1f}s)\")\n\n    # 8. YAML Prompt laden\n    log(\"8. YAML Prompt aus DB laden...\")\n    start = time.time()\n    prompt_data = db.get_prompt_by_use_case(\"entity_extraction\")\n    if prompt_data:\n        log(f\"   OK: Prompt geladen ({time.time()-start:.1f}s)\")\n        # Zeige generierten Prompt\n        sample_prompt = _build_prompt_from_yaml(prompt_data[\"content\"], \"SAMPLE TEXT\")\n        log(f\"   Prompt-Struktur:\\n{sample_prompt[:300]}...\")\n    else:\n        log(\"   WARNUNG: Kein Prompt gefunden!\")\n\n    # 9. Entity Extraction pro Chunk\n    log(\"9. Entity Extraction (Ollama)...\")\n    all_entities = []\n    for i, (chunk_id, chunk_text) in enumerate(zip(chunk_ids, chunk_texts)):\n        log(f\"   Chunk {i+1}\/{len(chunk_texts)}: {len(chunk_text)} chars...\")\n        start = time.time()\n        entities = extract_entities_ollama(chunk_text)\n        log(f\"      -> {len(entities)} entities ({time.time()-start:.1f}s)\")\n        for e in entities[:5]:\n            log(f\"         - {e['name']} ({e['type']})\")\n        if len(entities) > 5:\n            log(f\"         ... und {len(entities)-5} weitere\")\n        all_entities.extend(entities)\n\n    # 10. Entities speichern\n    log(\"10. Entities in DB speichern...\")\n    start = time.time()\n    stored = store_entities(doc_id, all_entities)\n    log(f\"   OK: {stored} entities gespeichert ({time.time()-start:.1f}s)\")\n\n    # 9. Zusammenfassung\n    log(\"=== ERGEBNIS ===\")\n    cursor = db.execute(\"SELECT type, COUNT(*) as cnt FROM entities GROUP BY type ORDER BY cnt DESC\")\n    for row in cursor.fetchall():\n        log(f\"   {row['type']}: {row['cnt']}\")\n    cursor.close()\n\n    db.disconnect()\n    log(\"=== DONE ===\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
    }
}

← Vorheriger Zur Liste Nächster →