{
"event": "PreToolUse",
"tool_name": "Edit",
"tool_input": {
"file_path": "\/var\/www\/scripts\/pipeline\/run_demo.py",
"old_string": "#!\/usr\/bin\/env python3\n\"\"\"\nVerbose Pipeline Runner for demo.pdf\nShows every step with timing.\n\"\"\"\n\nimport sys\nimport time\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\n\nfrom db import db\nfrom analyzers.entity_extractor import extract_entities_ollama, store_entities, _build_prompt_from_yaml\n\n\ndef log(msg):\n print(f\"[{time.strftime('%H:%M:%S')}] {msg}\", flush=True)\n\n\ndef main():\n log(\"=== START ===\")\n\n # 1. DB Connect\n log(\"1. DB verbinden...\")\n start = time.time()\n db.connect()\n log(f\" OK ({time.time()-start:.1f}s)\")",
"new_string": "#!\/usr\/bin\/env python3\n\"\"\"\nVerbose Pipeline Runner for demo.pdf\nShows every step with timing.\n\"\"\"\n\nimport sys\nimport time\n\nimport requests\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\n\nfrom db import db\nfrom analyzers.entity_extractor import extract_entities_ollama, store_entities, _build_prompt_from_yaml\n\n\ndef log(msg):\n print(f\"[{time.strftime('%H:%M:%S')}] {msg}\", flush=True)\n\n\ndef main():\n log(\"=== START ===\")\n\n # 1. DB Connect\n log(\"1. DB verbinden...\")\n start = time.time()\n db.connect()\n log(f\" OK ({time.time()-start:.1f}s)\")\n\n # 2. DB Reset\n log(\"2. DB Reset (Tabellen leeren)...\")\n start = time.time()\n tables_order = [\n \"entity_relations\",\n \"chunk_entities\",\n \"document_entities\",\n \"chunk_semantics\",\n \"chunk_taxonomy\",\n \"document_taxonomy\",\n \"document_pages\",\n \"entities\",\n \"chunks\",\n \"documents\",\n ]\n for table in tables_order:\n try:\n db.execute(f\"DELETE FROM {table}\")\n db.commit()\n log(f\" {table}: OK\")\n except Exception as e:\n log(f\" {table}: skip ({e})\")\n log(f\" DB Reset done ({time.time()-start:.1f}s)\")\n\n # 3. Qdrant Reset\n log(\"3. Qdrant Reset...\")\n start = time.time()\n try:\n # Delete all points\n resp = requests.post(\n \"http:\/\/localhost:6333\/collections\/documents\/points\/delete\",\n json={\"filter\": {\"must\": []}},\n timeout=10\n )\n log(f\" Qdrant: {resp.status_code} ({time.time()-start:.1f}s)\")\n except Exception as e:\n log(f\" Qdrant: {e}\")"
}
}
{
"tool_response": {
"filePath": "\/var\/www\/scripts\/pipeline\/run_demo.py",
"oldString": "#!\/usr\/bin\/env python3\n\"\"\"\nVerbose Pipeline Runner for demo.pdf\nShows every step with timing.\n\"\"\"\n\nimport sys\nimport time\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\n\nfrom db import db\nfrom analyzers.entity_extractor import extract_entities_ollama, store_entities, _build_prompt_from_yaml\n\n\ndef log(msg):\n print(f\"[{time.strftime('%H:%M:%S')}] {msg}\", flush=True)\n\n\ndef main():\n log(\"=== START ===\")\n\n # 1. DB Connect\n log(\"1. DB verbinden...\")\n start = time.time()\n db.connect()\n log(f\" OK ({time.time()-start:.1f}s)\")",
"newString": "#!\/usr\/bin\/env python3\n\"\"\"\nVerbose Pipeline Runner for demo.pdf\nShows every step with timing.\n\"\"\"\n\nimport sys\nimport time\n\nimport requests\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\n\nfrom db import db\nfrom analyzers.entity_extractor import extract_entities_ollama, store_entities, _build_prompt_from_yaml\n\n\ndef log(msg):\n print(f\"[{time.strftime('%H:%M:%S')}] {msg}\", flush=True)\n\n\ndef main():\n log(\"=== START ===\")\n\n # 1. DB Connect\n log(\"1. DB verbinden...\")\n start = time.time()\n db.connect()\n log(f\" OK ({time.time()-start:.1f}s)\")\n\n # 2. DB Reset\n log(\"2. DB Reset (Tabellen leeren)...\")\n start = time.time()\n tables_order = [\n \"entity_relations\",\n \"chunk_entities\",\n \"document_entities\",\n \"chunk_semantics\",\n \"chunk_taxonomy\",\n \"document_taxonomy\",\n \"document_pages\",\n \"entities\",\n \"chunks\",\n \"documents\",\n ]\n for table in tables_order:\n try:\n db.execute(f\"DELETE FROM {table}\")\n db.commit()\n log(f\" {table}: OK\")\n except Exception as e:\n log(f\" {table}: skip ({e})\")\n log(f\" DB Reset done ({time.time()-start:.1f}s)\")\n\n # 3. Qdrant Reset\n log(\"3. Qdrant Reset...\")\n start = time.time()\n try:\n # Delete all points\n resp = requests.post(\n \"http:\/\/localhost:6333\/collections\/documents\/points\/delete\",\n json={\"filter\": {\"must\": []}},\n timeout=10\n )\n log(f\" Qdrant: {resp.status_code} ({time.time()-start:.1f}s)\")\n except Exception as e:\n log(f\" Qdrant: {e}\")",
"originalFile": "#!\/usr\/bin\/env python3\n\"\"\"\nVerbose Pipeline Runner for demo.pdf\nShows every step with timing.\n\"\"\"\n\nimport sys\nimport time\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\n\nfrom db import db\nfrom analyzers.entity_extractor import extract_entities_ollama, store_entities, _build_prompt_from_yaml\n\n\ndef log(msg):\n print(f\"[{time.strftime('%H:%M:%S')}] {msg}\", flush=True)\n\n\ndef main():\n log(\"=== START ===\")\n\n # 1. DB Connect\n log(\"1. DB verbinden...\")\n start = time.time()\n db.connect()\n log(f\" OK ({time.time()-start:.1f}s)\")\n\n # 2. PDF laden\n log(\"2. PDF laden...\")\n start = time.time()\n from loaders.pdf_loader import load_pdf\n file_path = \"\/var\/www\/nextcloud\/data\/root\/files\/Documents\/demo.pdf\"\n text, pages = load_pdf(file_path)\n log(f\" OK: {len(text)} chars, {len(pages)} pages ({time.time()-start:.1f}s)\")\n\n # 3. Document erstellen\n log(\"3. Document in DB erstellen...\")\n start = time.time()\n cursor = db.execute(\n \"INSERT INTO documents (title, source_path, status, created_at) VALUES (%s, %s, 'processing', NOW())\",\n (\"demo.pdf\", file_path)\n )\n db.commit()\n doc_id = cursor.lastrowid\n cursor.close()\n log(f\" OK: doc_id={doc_id} ({time.time()-start:.1f}s)\")\n\n # 4. Chunking\n log(\"4. Text chunken...\")\n start = time.time()\n from chunkers.semantic_chunker import semantic_chunk\n chunks = semantic_chunk(text, max_tokens=500)\n log(f\" OK: {len(chunks)} chunks ({time.time()-start:.1f}s)\")\n\n # 5. Chunks speichern\n log(\"5. Chunks in DB speichern...\")\n start = time.time()\n chunk_ids = []\n for i, chunk in enumerate(chunks):\n cursor = db.execute(\n \"INSERT INTO chunks (document_id, chunk_index, content, char_count, created_at) VALUES (%s, %s, %s, %s, NOW())\",\n (doc_id, i, chunk, len(chunk))\n )\n db.commit()\n chunk_ids.append(cursor.lastrowid)\n cursor.close()\n log(f\" OK: {len(chunk_ids)} chunks gespeichert ({time.time()-start:.1f}s)\")\n\n # 6. YAML Prompt laden\n log(\"6. YAML Prompt aus DB laden...\")\n start = time.time()\n prompt_data = db.get_prompt_by_use_case(\"entity_extraction\")\n if prompt_data:\n log(f\" OK: Prompt geladen ({time.time()-start:.1f}s)\")\n # Zeige generierten Prompt\n sample_prompt = _build_prompt_from_yaml(prompt_data[\"content\"], \"SAMPLE TEXT\")\n log(f\" Prompt-Struktur:\\n{sample_prompt[:300]}...\")\n else:\n log(\" WARNUNG: Kein Prompt gefunden!\")\n\n # 7. Entity Extraction pro Chunk\n log(\"7. Entity Extraction (Ollama)...\")\n all_entities = []\n for i, (chunk_id, chunk_text) in enumerate(zip(chunk_ids, chunks)):\n log(f\" Chunk {i+1}\/{len(chunks)}: {len(chunk_text)} chars...\")\n start = time.time()\n entities = extract_entities_ollama(chunk_text)\n log(f\" -> {len(entities)} entities ({time.time()-start:.1f}s)\")\n for e in entities[:5]:\n log(f\" - {e['name']} ({e['type']})\")\n if len(entities) > 5:\n log(f\" ... und {len(entities)-5} weitere\")\n all_entities.extend(entities)\n\n # 8. Entities speichern\n log(\"8. Entities in DB speichern...\")\n start = time.time()\n stored = store_entities(doc_id, all_entities)\n log(f\" OK: {stored} entities gespeichert ({time.time()-start:.1f}s)\")\n\n # 9. Zusammenfassung\n log(\"=== ERGEBNIS ===\")\n cursor = db.execute(\"SELECT type, COUNT(*) as cnt FROM entities GROUP BY type ORDER BY cnt DESC\")\n for row in cursor.fetchall():\n log(f\" {row['type']}: {row['cnt']}\")\n cursor.close()\n\n db.disconnect()\n log(\"=== DONE ===\")\n\n\nif __name__ == \"__main__\":\n main()\n",
"structuredPatch": [
{
"oldStart": 7,
"oldLines": 6,
"newStart": 7,
"newLines": 8,
"lines": [
" import sys",
" import time",
" ",
"+import requests",
"+",
" sys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")",
" ",
" from db import db"
]
},
{
"oldStart": 26,
"oldLines": 6,
"newStart": 28,
"newLines": 44,
"lines": [
" db.connect()",
" log(f\" OK ({time.time()-start:.1f}s)\")",
" ",
"+ # 2. DB Reset",
"+ log(\"2. DB Reset (Tabellen leeren)...\")",
"+ start = time.time()",
"+ tables_order = [",
"+ \"entity_relations\",",
"+ \"chunk_entities\",",
"+ \"document_entities\",",
"+ \"chunk_semantics\",",
"+ \"chunk_taxonomy\",",
"+ \"document_taxonomy\",",
"+ \"document_pages\",",
"+ \"entities\",",
"+ \"chunks\",",
"+ \"documents\",",
"+ ]",
"+ for table in tables_order:",
"+ try:",
"+ db.execute(f\"DELETE FROM {table}\")",
"+ db.commit()",
"+ log(f\" {table}: OK\")",
"+ except Exception as e:",
"+ log(f\" {table}: skip ({e})\")",
"+ log(f\" DB Reset done ({time.time()-start:.1f}s)\")",
"+",
"+ # 3. Qdrant Reset",
"+ log(\"3. Qdrant Reset...\")",
"+ start = time.time()",
"+ try:",
"+ # Delete all points",
"+ resp = requests.post(",
"+ \"http:\/\/localhost:6333\/collections\/documents\/points\/delete\",",
"+ json={\"filter\": {\"must\": []}},",
"+ timeout=10",
"+ )",
"+ log(f\" Qdrant: {resp.status_code} ({time.time()-start:.1f}s)\")",
"+ except Exception as e:",
"+ log(f\" Qdrant: {e}\")",
"+",
" # 2. PDF laden",
" log(\"2. PDF laden...\")",
" start = time.time()"
]
}
],
"userModified": false,
"replaceAll": false
}
}