{
"tool_response": [
{
"type": "text",
"text": "{\n \"success\": true,\n \"doc\": {\n \"id\": 88,\n \"parent_id\": 87,\n \"slug\": \"import\",\n \"path\": \"\/anwendungen\/content-pipeline\/import\",\n \"title\": \"Import Pipeline\",\n \"description\": \"Planung und Analyse des Import-Prozesses\",\n \"content\": \"\\n<p class=\\\"doc-meta\\\"><strong>Erstellt:<\/strong> 2025-12-22 | <strong>Aktualisiert:<\/strong> 2025-12-31<\/p>\\n<h2>Import Pipeline - Planungsdokument<\/h2>\\n\\n<h3>1. Bestehendes System (IST-Analyse)<\/h3>\\n\\n<h4>1.1 Python-Skripte unter \/var\/www\/scripts\/pipeline\/<\/h4>\\n<table>\\n<tr><th>Datei<\/th><th>Funktion<\/th><th>Kernlogik<\/th><\/tr>\\n<tr><td>pipeline.py<\/td><td>Orchestrator<\/td><td>CLI mit scan, process, embed, all, file, status<\/td><\/tr>\\n<tr><td>config.py<\/td><td>Konfiguration<\/td><td>Hardcoded Pfade, Modelle, Limits<\/td><\/tr>\\n<tr><td>detect.py<\/td><td>Datei-Erkennung<\/td><td>Scan Nextcloud, Hash-Vergleich, Queue<\/td><\/tr>\\n<tr><td>extract.py<\/td><td>Text-Extraktion<\/td><td>PDF (OCR), DOCX, PPTX, MD, TXT<\/td><\/tr>\\n<tr><td>chunk.py<\/td><td>Chunking<\/td><td>Semantisch nach Typ, Heading-Pfad<\/td><\/tr>\\n<tr><td>embed.py<\/td><td>Embedding<\/td><td>Ollama → Qdrant<\/td><\/tr>\\n<tr><td>analyze.py<\/td><td>Semantische Analyse<\/td><td>Entitäten, Relationen, Taxonomie<\/td><\/tr>\\n<tr><td>db.py<\/td><td>Datenbank-Wrapper<\/td><td>CRUD für documents, chunks, queue<\/td><\/tr>\\n<\/table>\\n\\n<h4>1.2 Datenfluss<\/h4>\\n<pre>\\nNextcloud (Files)\\n ↓\\n [detect.py] Scan + Hash\\n ↓\\n documents (DB) status=pending\\n ↓\\n [extract.py] PDF\/DOCX\/... → Text\\n ↓\\n [chunk.py] Semantisches Chunking\\n ↓\\n chunks (DB) + heading_path, metadata\\n ↓\\n [embed.py] Ollama mxbai-embed-large\\n ↓\\n Qdrant (Vektoren) + chunks.qdrant_id\\n ↓\\n [analyze.py] Entity\/Relation\/Taxonomy\\n ↓\\n entities, entity_relations, chunk_entities,\\n chunk_taxonomy, chunk_semantics (DB)\\n<\/pre>\\n\\n<h4>1.3 Aktuelle Konfiguration (config.py)<\/h4>\\n<table>\\n<tr><th>Parameter<\/th><th>Wert<\/th><\/tr>\\n<tr><td>NEXTCLOUD_PATH<\/td><td>\/var\/www\/nextcloud\/data\/root\/files\/Documents<\/td><\/tr>\\n<tr><td>SUPPORTED_EXTENSIONS<\/td><td>.pdf, .pptx, .docx, .md, .txt<\/td><\/tr>\\n<tr><td>QDRANT_HOST<\/td><td>localhost:6333<\/td><\/tr>\\n<tr><td>QDRANT_COLLECTIONS<\/td><td>documents, mail, entities<\/td><\/tr>\\n<tr><td>OLLAMA_HOST<\/td><td>localhost:11434<\/td><\/tr>\\n<tr><td>EMBED_MODEL<\/td><td>mxbai-embed-large (1024 dims)<\/td><\/tr>\\n<tr><td>MIN_CHUNK_SIZE<\/td><td>100 Zeichen<\/td><\/tr>\\n<tr><td>MAX_CHUNK_SIZE<\/td><td>2000 Zeichen<\/td><\/tr>\\n<tr><td>CHUNK_OVERLAP<\/td><td>10%<\/td><\/tr>\\n<\/table>\\n\\n<h4>1.4 Datenbank-Struktur (ki_content)<\/h4>\\n\\n<h5>documents<\/h5>\\n<pre>\\nid INT PK AUTO\\nsource_path VARCHAR(500) UNIQUE\\nfolder_path VARCHAR(500)\\nfilename VARCHAR(255)\\nmime_type VARCHAR(100)\\nfile_hash VARCHAR(64) - SHA256 für Änderungserkennung\\nfile_size INT\\nlanguage VARCHAR(10) DEFAULT 'de'\\nimported_at DATETIME DEFAULT CURRENT_TIMESTAMP\\nprocessed_at DATETIME\\nstatus ENUM('pending','importing','imported','chunking','chunked',\\n 'embedding','embedded','enriching','enriched','processing','done','error')\\nsemantic_status ENUM('pending','processing','partial','complete','error','skipped')\\nerror_message TEXT\\nauthority_score FLOAT DEFAULT 0.5\\n<\/pre>\\n\\n<h5>chunks<\/h5>\\n<pre>\\nid INT PK AUTO\\ndocument_id INT FK\\npage_id INT FK\\nchunk_index INT\\ncontent TEXT\\ntoken_count INT\\nheading_path JSON - [\\\"H1\\\", \\\"H2\\\", ...]\\nmetadata JSON\\nqdrant_id VARCHAR(36) - UUID in Qdrant\\nstatus ENUM('created','embedding','embedded','error','deprecated')\\ncreated_at DATETIME DEFAULT CURRENT_TIMESTAMP\\nsection_id INT\\n<\/pre>\\n\\n<h5>entities<\/h5>\\n<pre>\\nid INT PK AUTO\\nname VARCHAR(255)\\ntype ENUM('PERSON','ORGANIZATION','LOCATION','EVENT','ROLE','TOOL',\\n 'ARTIFACT','METAPHOR','METHOD','THEORY','MODEL','PRINCIPLE',\\n 'DATE_TIME','QUANTITY_MEASURE','LAW_REGULATION','DIAGNOSIS_CONDITION',\\n 'SYMPTOM_SIGN','ASSESSMENT_INSTRUMENT','PUBLICATION_WORK',\\n 'DEMOGRAPHIC_GROUP','VALUE_NORM_RIGHT_DUTY','CONCEPT',\\n 'INTERVENTION_EXERCISE','FACILITATION_FORMAT','PROCESS_PHASE_STEP',\\n 'QUESTION_TYPE','EMOTION_FEELING','NEED_MOTIVE','TRAIT_ATTRIBUTE',\\n 'RELATIONSHIP_TYPE','SYSTEM_CONTEXT','SYSTEM_TYPE','DIMENSION_AXIS',\\n 'TYPOLOGY_CLASS','ORGANIZATIONAL_PROPERTY','FRAME_CONDITION_RESOURCE',\\n 'SOURCE_CITATION_STUDY','QUOTE_STATEMENT','COMMUNICATION_PATTERN',\\n 'RULE_SET_PROTOCOL','CONTACT_IDENTITY','OTHER') DEFAULT 'OTHER'\\ndescription TEXT\\ncanonical_name VARCHAR(255) - Deduplizierung\\nstatus ENUM('extracted','normalized','validated','deprecated','merged')\\ncreated_at DATETIME DEFAULT CURRENT_TIMESTAMP\\nupdated_at DATETIME ON UPDATE CURRENT_TIMESTAMP\\nname_lower VARCHAR(255) STORED GENERATED - Lowercase für Suche\\n<\/pre>\\n\\n<h5>entity_relations<\/h5>\\n<pre>\\nid INT PK AUTO\\nsource_entity_id INT FK\\ntarget_entity_id INT FK\\nrelation_type VARCHAR(100) - z.B. DEVELOPED_BY, RELATED_TO\\nstrength FLOAT DEFAULT 1\\ncontext TEXT\\nchunk_id INT FK - Herkunft\\ncreated_at DATETIME\\n<\/pre>\\n\\n<h5>taxonomy_terms<\/h5>\\n<pre>\\nid INT PK AUTO\\nname VARCHAR(255)\\nslug VARCHAR(255) UNIQUE\\nparent_id INT FK (self-ref)\\ndescription TEXT\\ndepth INT DEFAULT 0\\npath VARCHAR(1000) - z.B. \\\"\/Methoden\/Systemisch\\\"\\ncreated_at DATETIME\\n<\/pre>\\n\\n<h5>chunk_entities<\/h5>\\n<pre>\\nid INT PK AUTO\\nchunk_id INT FK → chunks(id)\\nentity_id INT FK\\nrelevance_score FLOAT DEFAULT 1\\nmention_count INT DEFAULT 1\\nUNIQUE KEY (chunk_id, entity_id)\\n<\/pre>\\n\\n<h5>chunk_taxonomy<\/h5>\\n<pre>\\nid INT PK AUTO\\nchunk_id INT FK → chunks(id)\\ntaxonomy_term_id INT FK → taxonomy_terms(id)\\nconfidence FLOAT DEFAULT 1\\nsource ENUM('auto','manual') DEFAULT 'auto'\\ncreated_at DATETIME DEFAULT CURRENT_TIMESTAMP\\nUNIQUE KEY (chunk_id, taxonomy_term_id)\\n<\/pre>\\n\\n<h5>chunk_semantics<\/h5>\\n<pre>\\nid INT PK AUTO\\nchunk_id INT FK → chunks(id) UNIQUE\\nsummary TEXT\\nkeywords JSON\\nsentiment ENUM('positive','negative','neutral','mixed') DEFAULT 'neutral'\\ntopics JSON\\nlanguage VARCHAR(10) DEFAULT 'de'\\nstatement_form ENUM('assertion','question','command','conditional')\\nintent ENUM('explain','argue','define','compare','exemplify','warn','instruct')\\nframe ENUM('theoretical','practical','historical','methodological','critical')\\nis_negated TINYINT(1) DEFAULT 0\\ndiscourse_role ENUM('thesis','evidence','example','counter','summary','definition')\\nanalyzed_at DATETIME\\nanalysis_model VARCHAR(100)\\nprompt_id INT\\nprompt_version VARCHAR(20)\\n<\/pre>\\n\\n<h4>1.5 Ursprünglich geplante Tabellen (ersetzt)<\/h4>\\n<p>Die ursprünglich im Code vorgesehenen Tabellen wurden durch das Pipeline-Management-System ersetzt:<\/p>\\n<table>\\n<tr><th>Ursprünglich geplant<\/th><th>Ersetzt durch<\/th><th>Status<\/th><\/tr>\\n<tr><td>processing_queue<\/td><td>pipeline_queue<\/td><td>Implementiert<\/td><\/tr>\\n<tr><td>processing_log<\/td><td>pipeline_runs<\/td><td>Implementiert<\/td><\/tr>\\n<tr><td>(neu)<\/td><td>pipeline_configs<\/td><td>Implementiert<\/td><\/tr>\\n<tr><td>(neu)<\/td><td>pipeline_steps<\/td><td>Implementiert<\/td><\/tr>\\n<\/table>\\n<p>Alle vier Pipeline-Tabellen sind Teil des SOLL-Konzepts (Abschnitt 2) und wurden vollständig implementiert.<\/p>\\n\\n<hr>\\n\\n<h3>2. SOLL-Konzept (GUI)<\/h3>\\n\\n<h4>2.1 Anforderungen<\/h4>\\n<ul>\\n<li>Visuelle Darstellung der Pipeline-Schritte<\/li>\\n<li>Konfigurierbare Parameter pro Schritt<\/li>\\n<li>Unterstützung mehrerer Pipeline-Definitionen<\/li>\\n<li>Status-Übersicht für Dokumente<\/li>\\n<li>Manuelle Trigger-Möglichkeit<\/li>\\n<\/ul>\\n\\n<h4>2.2 Tabelle: pipeline_configs (ki_content) ✓<\/h4>\\n<pre>\\nid INT PK AUTO\\nname VARCHAR(100) UNIQUE - z.B. \\\"Standard\\\", \\\"Nur-Embedding\\\"\\ndescription TEXT\\nis_default BOOLEAN DEFAULT FALSE\\nsource_path VARCHAR(500) - Nextcloud-Ordner\\nextensions JSON - [\\\".pdf\\\", \\\".docx\\\", ...]\\nsteps JSON - Aktivierte Steps + Reihenfolge\\ncreated_at, updated_at DATETIME\\n\\nBeispiel steps:\\n[\\n {\\\"step\\\": \\\"detect\\\", \\\"enabled\\\": true, \\\"order\\\": 1},\\n {\\\"step\\\": \\\"extract\\\", \\\"enabled\\\": true, \\\"order\\\": 2, \\\"config\\\": {\\\"ocr\\\": true}},\\n {\\\"step\\\": \\\"chunk\\\", \\\"enabled\\\": true, \\\"order\\\": 3, \\\"config\\\": {\\\"min\\\": 100, \\\"max\\\": 2000, \\\"overlap\\\": 0.1}},\\n {\\\"step\\\": \\\"embed\\\", \\\"enabled\\\": true, \\\"order\\\": 4, \\\"config\\\": {\\\"model\\\": \\\"mxbai-embed-large\\\", \\\"collection\\\": \\\"documents\\\"}},\\n {\\\"step\\\": \\\"analyze\\\", \\\"enabled\\\": false, \\\"order\\\": 5}\\n]\\n<\/pre>\\n\\n<h4>2.3 Tabelle: pipeline_steps (ki_content) ✓<\/h4>\\n<pre>\\nid INT PK AUTO\\npipeline_id INT FK\\nstep_type ENUM('detect','extract','chunk','embed','analyze')\\nconfig JSON - Step-spezifische Einstellungen\\nsort_order INT\\nenabled BOOLEAN DEFAULT TRUE\\ncreated_at, updated_at DATETIME\\n<\/pre>\\n\\n<h4>2.4 Tabelle: pipeline_runs (ki_content) ✓<\/h4>\\n<pre>\\nid INT PK AUTO\\npipeline_id INT FK\\nstatus ENUM('pending','running','completed','failed','cancelled')\\nstarted_at DATETIME\\ncompleted_at DATETIME\\ndocuments_processed INT DEFAULT 0\\ndocuments_failed INT DEFAULT 0\\nerror_log TEXT\\ncreated_at DATETIME\\n<\/pre>\\n\\n<h4>2.5 Tabelle: pipeline_queue (ki_content) ✓<\/h4>\\n<pre>\\nid INT PK AUTO\\npipeline_run_id INT FK\\ndocument_id INT FK\\nstatus ENUM('pending','processing','done','error')\\nstep_index INT\\nerror_message TEXT\\nstarted_at, completed_at DATETIME\\n<\/pre>\\n\\n<h4>2.6 URL-Struktur<\/h4>\\n<pre>\\n\/content-pipeline - Übersicht aller Pipelines\\n\/content-pipeline\/import - Import-Konfiguration (erste Seite)\\n\/content-pipeline\/{id} - Pipeline-Detail\\n\/content-pipeline\/{id}\/run - Pipeline starten (POST)\\n\/content-pipeline\/{id}\/status - Laufender Status\\n\/content-pipeline\/new - Neue Pipeline erstellen\\n<\/pre>\\n\\n<h4>2.7 View-Komponenten<\/h4>\\n<pre>\\n┌─────────────────────────────────────────────────────────┐\\n│ Content Pipeline: Standard [Run] │\\n├─────────────────────────────────────────────────────────┤\\n│ │\\n│ ┌──────┐ ┌─────────┐ ┌───────┐ ┌───────┐ ┌────┐│\\n│ │Detect│ → │ Extrac... [TRUNCATED-ad6986c32e5ce26b]"
}
]
}