{
"tool_response": {
"type": "update",
"filePath": "\/var\/www\/scripts\/pipeline\/run_demo.py",
"content": "#!\/usr\/bin\/env python3\n\"\"\"\nVerbose Pipeline Runner for demo.pdf\nShows every step with timing.\nUses existing db.py methods for correct schema handling.\n\"\"\"\n\nimport json\nimport os\nimport sys\nimport time\n\nimport requests\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\n\nfrom db import db\nfrom analyzers.entity_extractor import extract_entities_ollama, store_entities, _build_prompt_from_yaml\n\n\ndef log(msg):\n print(f\"[{time.strftime('%H:%M:%S')}] {msg}\", flush=True)\n\n\ndef main():\n log(\"=== START ===\")\n\n # 1. DB Connect\n log(\"1. DB verbinden...\")\n start = time.time()\n db.connect()\n log(f\" OK ({time.time()-start:.1f}s)\")\n\n # 2. DB Reset\n log(\"2. DB Reset (Tabellen leeren)...\")\n start = time.time()\n tables_order = [\n \"entity_relations\",\n \"chunk_entities\",\n \"document_entities\",\n \"chunk_semantics\",\n \"chunk_taxonomy\",\n \"document_taxonomy\",\n \"document_pages\",\n \"entities\",\n \"chunks\",\n \"documents\",\n ]\n for table in tables_order:\n try:\n db.execute(f\"DELETE FROM {table}\")\n db.commit()\n log(f\" {table}: OK\")\n except Exception as e:\n log(f\" {table}: skip ({e})\")\n log(f\" DB Reset done ({time.time()-start:.1f}s)\")\n\n # 3. Qdrant Reset\n log(\"3. Qdrant Reset...\")\n start = time.time()\n try:\n resp = requests.post(\n \"http:\/\/localhost:6333\/collections\/documents\/points\/delete\",\n json={\"filter\": {\"must\": []}},\n timeout=10\n )\n log(f\" Qdrant: {resp.status_code} ({time.time()-start:.1f}s)\")\n except Exception as e:\n log(f\" Qdrant: {e}\")\n\n # 4. PDF laden\n log(\"4. PDF laden...\")\n start = time.time()\n from extract import extract_pdf\n file_path = \"\/var\/www\/nextcloud\/data\/root\/files\/Documents\/demo.pdf\"\n pages = extract_pdf(file_path)\n text = \"\\n\\n\".join(p[\"text\"] for p in pages)\n log(f\" OK: {len(text)} chars, {len(pages)} pages ({time.time()-start:.1f}s)\")\n\n # 5. Document erstellen (nutze db.create_document)\n log(\"5. Document in DB erstellen...\")\n start = time.time()\n doc_id = db.create_document(\n file_path=file_path,\n title=os.path.basename(file_path),\n file_type=\"application\/pdf\",\n file_size=os.path.getsize(file_path),\n file_hash=\"demo_test\"\n )\n log(f\" OK: doc_id={doc_id} ({time.time()-start:.1f}s)\")\n\n # 6. Chunking\n log(\"6. Text chunken...\")\n start = time.time()\n from chunk import chunk_pdf\n chunks = chunk_pdf(pages)\n log(f\" OK: {len(chunks)} chunks ({time.time()-start:.1f}s)\")\n\n # 7. Chunks speichern (nutze db.insert_chunk)\n log(\"7. Chunks in DB speichern...\")\n start = time.time()\n chunk_ids = []\n chunk_texts = []\n for i, chunk in enumerate(chunks):\n content = chunk[\"content\"]\n heading_path = json.dumps(chunk.get(\"heading_path\", []))\n metadata = json.dumps(chunk.get(\"metadata\", {}))\n\n chunk_id = db.insert_chunk(\n doc_id=doc_id,\n chunk_index=i,\n content=content,\n heading_path=heading_path,\n metadata=metadata\n )\n chunk_ids.append(chunk_id)\n chunk_texts.append(content)\n log(f\" Chunk {i+1}: {len(content)} chars -> id={chunk_id}\")\n log(f\" OK: {len(chunk_ids)} chunks gespeichert ({time.time()-start:.1f}s)\")\n\n # 8. YAML Prompt laden\n log(\"8. YAML Prompt aus DB laden...\")\n start = time.time()\n prompt_data = db.get_prompt_by_use_case(\"entity_extraction\")\n if prompt_data:\n log(f\" OK: Prompt geladen ({time.time()-start:.1f}s)\")\n sample_prompt = _build_prompt_from_yaml(prompt_data[\"content\"], \"SAMPLE\")\n log(f\" Prompt-Preview:\\n{sample_prompt[:400]}...\")\n else:\n log(\" WARNUNG: Kein Prompt gefunden!\")\n\n # 9. Entity Extraction pro Chunk\n log(\"9. Entity Extraction (Ollama)...\")\n all_entities = []\n for i, (chunk_id, chunk_text) in enumerate(zip(chunk_ids, chunk_texts)):\n log(f\" Chunk {i+1}\/{len(chunk_texts)}: {len(chunk_text)} chars...\")\n start = time.time()\n entities = extract_entities_ollama(chunk_text)\n log(f\" -> {len(entities)} entities ({time.time()-start:.1f}s)\")\n for e in entities[:5]:\n log(f\" - {e['name']} ({e['type']})\")\n if len(entities) > 5:\n log(f\" ... und {len(entities)-5} weitere\")\n all_entities.extend(entities)\n\n # 10. Entities speichern (nutze store_entities)\n log(\"10. Entities in DB speichern...\")\n start = time.time()\n stored = store_entities(doc_id, all_entities)\n log(f\" OK: {stored} entities gespeichert ({time.time()-start:.1f}s)\")\n\n # 11. Zusammenfassung\n log(\"=== ERGEBNIS ===\")\n cursor = db.execute(\"SELECT type, COUNT(*) as cnt FROM entities GROUP BY type ORDER BY cnt DESC\")\n for row in cursor.fetchall():\n log(f\" {row['type']}: {row['cnt']}\")\n cursor.close()\n\n cursor = db.execute(\"SELECT COUNT(*) as cnt FROM entities\")\n total = cursor.fetchone()[\"cnt\"]\n cursor.close()\n log(f\" TOTAL: {total} entities\")\n\n db.disconnect()\n log(\"=== DONE ===\")\n\n\nif __name__ == \"__main__\":\n main()\n",
"structuredPatch": [
{
"oldStart": 2,
"oldLines": 8,
"newStart": 2,
"newLines": 10,
"lines": [
" \"\"\"",
" Verbose Pipeline Runner for demo.pdf",
" Shows every step with timing.",
"+Uses existing db.py methods for correct schema handling.",
" \"\"\"",
" ",
"+import json",
" import os",
" import sys",
" import time"
]
},
{
"oldStart": 57,
"oldLines": 7,
"newStart": 59,
"newLines": 6,
"lines": [
" log(\"3. Qdrant Reset...\")",
" start = time.time()",
" try:",
"- # Delete all points",
" resp = requests.post(",
" \"http:\/\/localhost:6333\/collections\/documents\/points\/delete\",",
" json={\"filter\": {\"must\": []}},"
]
},
{
"oldStart": 76,
"oldLines": 16,
"newStart": 77,
"newLines": 16,
"lines": [
" text = \"\\n\\n\".join(p[\"text\"] for p in pages)",
" log(f\" OK: {len(text)} chars, {len(pages)} pages ({time.time()-start:.1f}s)\")",
" ",
"- # 5. Document erstellen",
"+ # 5. Document erstellen (nutze db.create_document)",
" log(\"5. Document in DB erstellen...\")",
" start = time.time()",
"- cursor = db.execute(",
"- \"INSERT INTO documents (source_path, folder_path, filename, status) VALUES (%s, %s, %s, 'processing')\",",
"- (file_path, os.path.dirname(file_path), os.path.basename(file_path))",
"+ doc_id = db.create_document(",
"+ file_path=file_path,",
"+ title=os.path.basename(file_path),",
"+ file_type=\"application\/pdf\",",
"+ file_size=os.path.getsize(file_path),",
"+ file_hash=\"demo_test\"",
" )",
"- db.commit()",
"- doc_id = cursor.lastrowid",
"- cursor.close()",
" log(f\" OK: doc_id={doc_id} ({time.time()-start:.1f}s)\")",
" ",
" # 6. Chunking"
]
},
{
"oldStart": 95,
"oldLines": 21,
"newStart": 96,
"newLines": 26,
"lines": [
" chunks = chunk_pdf(pages)",
" log(f\" OK: {len(chunks)} chunks ({time.time()-start:.1f}s)\")",
" ",
"- # 7. Chunks speichern",
"+ # 7. Chunks speichern (nutze db.insert_chunk)",
" log(\"7. Chunks in DB speichern...\")",
" start = time.time()",
" chunk_ids = []",
" chunk_texts = []",
" for i, chunk in enumerate(chunks):",
" content = chunk[\"content\"]",
"- cursor = db.execute(",
"- \"INSERT INTO chunks (document_id, chunk_index, content, char_count, created_at) VALUES (%s, %s, %s, %s, NOW())\",",
"- (doc_id, i, content, len(content))",
"+ heading_path = json.dumps(chunk.get(\"heading_path\", []))",
"+ metadata = json.dumps(chunk.get(\"metadata\", {}))",
"+",
"+ chunk_id = db.insert_chunk(",
"+ doc_id=doc_id,",
"+ chunk_index=i,",
"+ content=content,",
"+ heading_path=heading_path,",
"+ metadata=metadata",
" )",
"- db.commit()",
"- chunk_ids.append(cursor.lastrowid)",
"+ chunk_ids.append(chunk_id)",
" chunk_texts.append(content)",
"- cursor.close()",
"+ log(f\" Chunk {i+1}: {len(content)} chars -> id={chunk_id}\")",
" log(f\" OK: {len(chunk_ids)} chunks gespeichert ({time.time()-start:.1f}s)\")",
" ",
" # 8. YAML Prompt laden"
]
},
{
"oldStart": 118,
"oldLines": 9,
"newStart": 124,
"newLines": 8,
"lines": [
" prompt_data = db.get_prompt_by_use_case(\"entity_extraction\")",
" if prompt_data:",
" log(f\" OK: Prompt geladen ({time.time()-start:.1f}s)\")",
"- # Zeige generierten Prompt",
"- sample_prompt = _build_prompt_from_yaml(prompt_data[\"content\"], \"SAMPLE TEXT\")",
"- log(f\" Prompt-Struktur:\\n{sample_prompt[:300]}...\")",
"+ sample_prompt = _build_prompt_from_yaml(prompt_data[\"content\"], \"SAMPLE\")",
"+ log(f\" Prompt-Preview:\\n{sample_prompt[:400]}...\")",
" else:",
" log(\" WARNUNG: Kein Prompt gefunden!\")",
" "
]
},
{
"oldStart": 138,
"oldLines": 19,
"newStart": 143,
"newLines": 24,
"lines": [
" log(f\" ... und {len(entities)-5} weitere\")",
" all_entities.extend(entities)",
" ",
"- # 10. Entities speichern",
"+ # 10. Entities speichern (nutze store_entities)",
" log(\"10. Entities in DB speichern...\")",
" start = time.time()",
" stored = store_entities(doc_id, all_entities)",
" log(f\" OK: {stored} entities gespeichert ({time.time()-start:.1f}s)\")",
" ",
"- # 9. Zusammenfassung",
"+ # 11. Zusammenfassung",
" log(\"=== ERGEBNIS ===\")",
" cursor = db.execute(\"SELECT type, COUNT(*) as cnt FROM entities GROUP BY type ORDER BY cnt DESC\")",
" for row in cursor.fetchall():",
" log(f\" {row['type']}: {row['cnt']}\")",
" cursor.close()",
" ",
"+ cursor = db.execute(\"SELECT COUNT(*) as cnt FROM entities\")",
"+ total = cursor.fetchone()[\"cnt\"]",
"+ cursor.close()",
"+ log(f\" TOTAL: {total} entities\")",
"+",
" db.disconnect()",
" log(\"=== DONE ===\")",
" "
]
}
],
"originalFile": "#!\/usr\/bin\/env python3\n\"\"\"\nVerbose Pipeline Runner for demo.pdf\nShows every step with timing.\n\"\"\"\n\nimport os\nimport sys\nimport time\n\nimport requests\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\n\nfrom db import db\nfrom analyzers.entity_extractor import extract_entities_ollama, store_entities, _build_prompt_from_yaml\n\n\ndef log(msg):\n print(f\"[{time.strftime('%H:%M:%S')}] {msg}\", flush=True)\n\n\ndef main():\n log(\"=== START ===\")\n\n # 1. DB Connect\n log(\"1. DB verbinden...\")\n start = time.time()\n db.connect()\n log(f\" OK ({time.time()-start:.1f}s)\")\n\n # 2. DB Reset\n log(\"2. DB Reset (Tabellen leeren)...\")\n start = time.time()\n tables_order = [\n \"entity_relations\",\n \"chunk_entities\",\n \"document_entities\",\n \"chunk_semantics\",\n \"chunk_taxonomy\",\n \"document_taxonomy\",\n \"document_pages\",\n \"entities\",\n \"chunks\",\n \"documents\",\n ]\n for table in tables_order:\n try:\n db.execute(f\"DELETE FROM {table}\")\n db.commit()\n log(f\" {table}: OK\")\n except Exception as e:\n log(f\" {table}: skip ({e})\")\n log(f\" DB Reset done ({time.time()-start:.1f}s)\")\n\n # 3. Qdrant Reset\n log(\"3. Qdrant Reset...\")\n start = time.time()\n try:\n # Delete all points\n resp = requests.post(\n \"http:\/\/localhost:6333\/collections\/documents\/points\/delete\",\n json={\"filter\": {\"must\": []}},\n timeout=10\n )\n log(f\" Qdrant: {resp.status_code} ({time.time()-start:.1f}s)\")\n except Exception as e:\n log(f\" Qdrant: {e}\")\n\n # 4. PDF laden\n log(\"4. PDF laden...\")\n start = time.time()\n from extract import extract_pdf\n file_path = \"\/var\/www\/nextcloud\/data\/root\/files\/Documents\/demo.pdf\"\n pages = extract_pdf(file_path)\n text = \"\\n\\n\".join(p[\"text\"] for p in pages)\n log(f\" OK: {len(text)} chars, {len(pages)} pages ({time.time()-start:.1f}s)\")\n\n # 5. Document erstellen\n log(\"5. Document in DB erstellen...\")\n start = time.time()\n cursor = db.execute(\n \"INSERT INTO documents (source_path, folder_path, filename, status) VALUES (%s, %s, %s, 'processing')\",\n (file_path, os.path.dirname(file_path), os.path.basename(file_path))\n )\n db.commit()\n doc_id = cursor.lastrowid\n cursor.close()\n log(f\" OK: doc_id={doc_id} ({time.time()-start:.1f}s)\")\n\n # 6. Chunking\n log(\"6. Text chunken...\")\n start = time.time()\n from chunk import chunk_pdf\n chunks = chunk_pdf(pages)\n log(f\" OK: {len(chunks)} chunks ({time.time()-start:.1f}s)\")\n\n # 7. Chunks speichern\n log(\"7. Chunks in DB speichern...\")\n start = time.time()\n chunk_ids = []\n chunk_texts = []\n for i, chunk in enumerate(chunks):\n content = chunk[\"content\"]\n cursor = db.execute(\n \"INSERT INTO chunks (document_id, chunk_index, content, char_count, created_at) VALUES (%s, %s, %s, %s, NOW())\",\n (doc_id, i, content, len(content))\n )\n db.commit()\n chunk_ids.append(cursor.lastrowid)\n chunk_texts.append(content)\n cursor.close()\n log(f\" OK: {len(chunk_ids)} chunks gespeichert ({time.time()-start:.1f}s)\")\n\n # 8. YAML Prompt laden\n log(\"8. YAML Prompt aus DB laden...\")\n start = time.time()\n prompt_data = db.get_prompt_by_use_case(\"entity_extraction\")\n if prompt_data:\n log(f\" OK: Prompt geladen ({time.time()-start:.1f}s)\")\n # Zeige generierten Prompt\n sample_prompt = _build_prompt_from_yaml(prompt_data[\"content\"], \"SAMPLE TEXT\")\n log(f\" Prompt-Struktur:\\n{sample_prompt[:300]}...\")\n else:\n log(\" WARNUNG: Kein Prompt gefunden!\")\n\n # 9. Entity Extraction pro Chunk\n log(\"9. Entity Extraction (Ollama)...\")\n all_entities = []\n for i, (chunk_id, chunk_text) in enumerate(zip(chunk_ids, chunk_texts)):\n log(f\" Chunk {i+1}\/{len(chunk_texts)}: {len(chunk_text)} chars...\")\n start = time.time()\n entities = extract_entities_ollama(chunk_text)\n log(f\" -> {len(entities)} entities ({time.time()-start:.1f}s)\")\n for e in entities[:5]:\n log(f\" - {e['name']} ({e['type']})\")\n if len(entities) > 5:\n log(f\" ... und {len(entities)-5} weitere\")\n all_entities.extend(entities)\n\n # 10. Entities speichern\n log(\"10. Entities in DB speichern...\")\n start = time.time()\n stored = store_entities(doc_id, all_entities)\n log(f\" OK: {stored} entities gespeichert ({time.time()-start:.1f}s)\")\n\n # 9. Zusammenfassung\n log(\"=== ERGEBNIS ===\")\n cursor = db.execute(\"SELECT type, COUNT(*) as cnt FROM entities GROUP BY type ORDER BY cnt DESC\")\n for row in cursor.fetchall():\n log(f\" {row['type']}: {row['cnt']}\")\n cursor.close()\n\n db.disconnect()\n log(\"=== DONE ===\")\n\n\nif __name__ == \"__main__\":\n main()\n"
}
}