repository = new PipelineRepository(); }

    /**
     * GET /content-pipeline
     *
     * Renders the overview with every pipeline and the repository statistics.
     */
    public function index(): void
    {
        $pipelines = $this->repository->findAll();
        $stats = $this->repository->getStatistics();

        $this->view('content-pipeline.index', [
            'title' => 'Content Pipeline',
            'pipelines' => $pipelines,
            'stats' => $stats,
        ]);
    }

    /**
     * GET /content-pipeline/import
     *
     * Renders the import page for the default pipeline together with its
     * latest run. Both values are null when no pipeline exists.
     */
    public function import(): void
    {
        $pipeline = $this->repository->findDefault();

        if ($pipeline === null) {
            // No default configured: fall back to the first entry of findAll(1)
            // (the argument is presumably a row limit — confirm in the repository).
            $pipelines = $this->repository->findAll(1);
            $pipeline = $pipelines[0] ?? null;
        }

        $latestRun = $pipeline !== null
            ? $this->repository->findLatestRun((int) $pipeline['id'])
            : null;

        $this->view('content-pipeline.import', [
            'title' => 'Import Pipeline',
            'pipeline' => $pipeline,
            'latestRun' => $latestRun,
        ]);
    }

    /**
     * GET /content-pipeline/new
     *
     * Renders the empty pipeline form.
     */
    public function pipelineNew(): void
    {
        $this->view('content-pipeline.form', [
            'title' => 'Neue Pipeline',
            'pipeline' => null,
            'stepTypes' => $this->getStepTypes(),
        ]);
    }

    /**
     * GET /content-pipeline/{id}
     *
     * Renders the detail page of one pipeline with its ten most recent runs,
     * the step type catalogue and the selectable models.
     *
     * @param string $id Route parameter; cast to int before the lookup.
     */
    public function show(string $id): void
    {
        $pipelineId = (int) $id;
        $pipeline = $this->repository->findById($pipelineId);

        if ($pipeline === null) {
            // NOTE(review): the code below assumes notFound() ends the request — confirm.
            $this->notFound('Pipeline nicht gefunden');
        }

        $runs = $this->repository->findRuns($pipelineId, 10);

        $this->view('content-pipeline.show', [
            'title' => 'Pipeline: ' . $pipeline['name'],
            'pipeline' => $pipeline,
            'runs' => $runs,
            'stepTypes' => $this->getStepTypes(),
            'models' => ModelConfig::getAll(),
            'defaultModel' => ModelConfig::DEFAULT_MODEL,
        ]);
    }

    /**
     * GET /content-pipeline/{id}/edit
     *
     * Renders the pipeline form pre-filled with an existing pipeline.
     *
     * @param string $id Route parameter; cast to int before the lookup.
     */
    public function edit(string $id): void
    {
        $pipeline = $this->repository->findById((int) $id);

        if ($pipeline === null) {
            // NOTE(review): the code below assumes notFound() ends the request — confirm.
            $this->notFound('Pipeline nicht gefunden');
        }

        $this->view('content-pipeline.form', [
            'title' => 'Pipeline bearbeiten: ' . $pipeline['name'],
            'pipeline' => $pipeline,
            'stepTypes' => $this->getStepTypes(),
        ]);
    }

    /**
     * POST /content-pipeline
     *
     * Creates a pipeline from the submitted form, attaches the default steps
     * and redirects to the new pipeline. A missing name redirects back to the
     * form with a session error.
     */
    public function store(): void
    {
        $this->requireCsrf();

        $defaultSourcePath = '/var/www/nextcloud/data/root/files/Documents';

        $name = $this->postedString('name');
        $description = $this->postedString('description');
        $sourcePath = $this->postedString('source_path', $defaultSourcePath);
        $extensions = $this->parseExtensions($this->postedString('extensions'));
        $isDefault = isset($_POST['is_default']) ? 1 : 0;

        // A blank form field must not bypass the default and persist an empty path.
        if ($sourcePath === '') {
            $sourcePath = $defaultSourcePath;
        }

        if ($name === '') {
            $_SESSION['error'] = 'Name ist erforderlich.';
            header('Location: /content-pipeline/new');
            exit;
        }

        $pipelineId = $this->repository->create([
            'name' => $name,
            'description' => $description,
            'source_path' => $sourcePath,
            'extensions' => $extensions,
            'is_default' => $isDefault,
        ]);

        // Add the default steps.
        $this->createDefaultSteps($pipelineId);

        $_SESSION['success'] = 'Pipeline erfolgreich erstellt.';
        header('Location: /content-pipeline/' . $pipelineId);
        exit;
    }

    /**
     * POST /content-pipeline/{id}
     *
     * Updates an existing pipeline from the submitted form and redirects to
     * its detail page. A missing name redirects back to the edit form.
     *
     * @param string $id Route parameter; cast to int for the lookup and for
     *                   the redirect target.
     */
    public function update(string $id): void
    {
        $this->requireCsrf();

        $pipelineId = (int) $id;
        $pipeline = $this->repository->findById($pipelineId);

        if ($pipeline === null) {
            // NOTE(review): the code below assumes notFound() ends the request — confirm.
            $this->notFound('Pipeline nicht gefunden');
        }

        $name = $this->postedString('name');
        $description = $this->postedString('description');
        $sourcePath = $this->postedString('source_path');
        $extensions = $this->parseExtensions($this->postedString('extensions'));
        $isDefault = isset($_POST['is_default']) ? 1 : 0;

        if ($name === '') {
            $_SESSION['error'] = 'Name ist erforderlich.';
            header('Location: /content-pipeline/' . $pipelineId . '/edit');
            exit;
        }

        $this->repository->update($pipelineId, [
            'name' => $name,
            'description' => $description,
            'source_path' => $sourcePath,
            'extensions' => $extensions,
            'is_default' => $isDefault,
        ]);

        $_SESSION['success'] = 'Pipeline aktualisiert.';
        header('Location: /content-pipeline/' . $pipelineId);
        exit;
    }

    /**
     * POST /content-pipeline/{id}/run
     *
     * Records a new run and launches the Python pipeline script as a detached
     * background process whose output goes to /tmp/pipeline_run_{runId}.log.
     *
     * @param string $id Route parameter; cast to int before use.
     */
    public function run(string $id): void
    {
        $this->requireCsrf();

        $pipelineId = (int) $id;
        $pipeline = $this->repository->findById($pipelineId);

        if ($pipeline === null) {
            // NOTE(review): the code below assumes notFound() ends the request — confirm.
            $this->notFound('Pipeline nicht gefunden');
        }

        // Create the new run.
        $runId = $this->repository->createRun($pipelineId);

        // Start the pipeline in the background.
        $pipelineScript = '/opt/scripts/pipeline/pipeline.py';
        $venvPython = '/opt/scripts/pipeline/venv/bin/python';
        $logFile = '/tmp/pipeline_run_' . $runId . '.log';

        // nohup + trailing "&" detach the process, so exec() does not wait for it.
        $cmd = sprintf(
            'nohup %s %s all --pipeline-id=%d --run-id=%d > %s 2>&1 &',
            escapeshellarg($venvPython),
            escapeshellarg($pipelineScript),
            $pipelineId,
            $runId,
            escapeshellarg($logFile)
        );

        exec($cmd);

        $_SESSION['success'] = 'Pipeline gestartet (Run #' . $runId . ')';
        header('Location: /content-pipeline/' . $pipelineId);
        exit;
    }

    /**
     * GET /content-pipeline/{id}/status
     *
     * AJAX endpoint: responds with the latest run of the pipeline as JSON, or
     * with a 404 JSON error when the pipeline does not exist.
     *
     * @param string $id Route parameter; cast to int before use.
     */
    public function status(string $id): void
    {
        $pipelineId = (int) $id;
        $pipeline = $this->repository->findById($pipelineId);

        if ($pipeline === null) {
            $this->json(['error' => 'Pipeline nicht gefunden'], 404);

            return;
        }

        $latestRun = $this->repository->findLatestRun($pipelineId);

        $this->json([
            'pipeline_id' => $pipelineId,
            'run' => $latestRun,
        ]);
    }

    /**
     * POST /content-pipeline/{id}/steps/{stepId}/toggle
     *
     * Flips the `enabled` flag of one step of the pipeline and redirects to
     * the pipeline page. An unknown step id changes nothing.
     *
     * @param string $id     Route parameter; cast to int before use.
     * @param string $stepId Route parameter; cast to int before use.
     */
    public function toggleStep(string $id, string $stepId): void
    {
        $this->requireCsrf();

        $pipelineId = (int) $id;
        $targetStepId = (int) $stepId;
        $pipeline = $this->repository->findById($pipelineId);

        if ($pipeline === null) {
            // NOTE(review): the code below assumes notFound() ends the request — confirm.
            $this->notFound('Pipeline nicht gefunden');
        }

        // Only steps listed on this pipeline may be toggled; a pipeline row
        // without a `steps` key is treated as having no steps.
        $steps = $pipeline['steps'] ?? [];

        foreach ($steps as $step) {
            if ((int) $step['id'] === $targetStepId) {
                $this->repository->updateStep($targetStepId, [
                    'enabled' => $step['enabled'] ? 0 : 1,
                ]);
                break;
            }
        }

        header('Location: /content-pipeline/' . $pipelineId);
        exit;
    }

    /**
     * POST /content-pipeline/{id}/steps/{stepId}/model (AJAX)
     *
     * Stores the chosen model and its provider in the step's config and
     * responds with JSON. Responds 404 for an unknown pipeline or step and
     * 400 for a model rejected by ModelConfig::isValid().
     *
     * NOTE(review): unlike the other POST handlers in this class, this one
     * does not call requireCsrf(). It is left unchanged because the AJAX
     * client's token handling is not visible here — confirm and add the check.
     *
     * @param string $id     Route parameter; cast to int before use.
     * @param string $stepId Route parameter; cast to int before use.
     */
    public function updateStepModel(string $id, string $stepId): void
    {
        $pipeline = $this->repository->findById((int) $id);

        if ($pipeline === null) {
            $this->json(['error' => 'Pipeline nicht gefunden'], 404);

            return;
        }

        $model = $this->postedString('model');

        if ($model === '' || !ModelConfig::isValid($model)) {
            $this->json(['error' => 'Ungültiges Modell'], 400);

            return;
        }

        // Locate the step among the steps of this pipeline.
        $targetStepId = (int) $stepId;
        $matchedStep = null;

        foreach ($pipeline['steps'] ?? [] as $step) {
            if ((int) $step['id'] === $targetStepId) {
                $matchedStep = $step;
                break;
            }
        }

        if ($matchedStep === null) {
            $this->json(['error' => 'Schritt nicht gefunden'], 404);

            return;
        }

        $config = $matchedStep['config'] ?? [];

        // The provider follows from the model; local models run on Ollama.
        $isLocal = ModelConfig::isLocal($model);
        $localPrefix = 'ollama:';

        // Strip the 'ollama:' prefix only when it is actually present, so a
        // local model name without the prefix is not truncated.
        $hasLocalPrefix = strncmp($model, $localPrefix, strlen($localPrefix)) === 0;

        $config['model'] = ($isLocal && $hasLocalPrefix)
            ? substr($model, strlen($localPrefix))
            : $model;
        $config['provider'] = $isLocal ? 'ollama' : 'anthropic';

        $this->repository->updateStep($targetStepId, [
            'config' => $config,
        ]);

        $this->json([
            'success' => true,
            'model' => $model,
            'label' => ModelConfig::getLabel($model),
        ]);
    }

    /**
     * POST /content-pipeline/{id}/delete
     *
     * Deletes the pipeline and redirects to the overview.
     *
     * @param string $id Route parameter; cast to int before use.
     */
    public function delete(string $id): void
    {
        $this->requireCsrf();

        $pipelineId = (int) $id;
        $pipeline = $this->repository->findById($pipelineId);

        if ($pipeline === null) {
            // NOTE(review): the code below assumes notFound() ends the request — confirm.
            $this->notFound('Pipeline nicht gefunden');
        }

        $this->repository->delete($pipelineId);

        $_SESSION['success'] = 'Pipeline geloescht.';
        header('Location: /content-pipeline');
        exit;
    }

    /**
     * Read one POST field as a trimmed string.
     *
     * Non-string values (for example an array sent as `name[]=x`) are treated
     * as absent, so trim() and string-typed helpers never receive an array.
     *
     * @param string $key     Name of the POST field.
     * @param string $default Value returned when the field is missing or not a string.
     */
    private function postedString(string $key, string $default = ''): string
    {
        $value = $_POST[$key] ?? null;

        return is_string($value) ? trim($value) : $default;
    }

    /**
     * Catalogue of the known step types, keyed by step type identifier.
     *
     * Every entry carries `label`, `description`, `phase` and `storage`
     * (null when the entry names no storage target). Some entries also carry
     * `uses_llm`, `uses_vision` or `fixed_model`.
     *
     * @return array<string, array<string, string|bool|null>>
     */
    private function getStepTypes(): array
    {
        return [
            // Phase 1: preprocessing
            'detect' => [
                'label' => 'Erkennung',
                'description' => 'Dateien scannen und Format prüfen',
                'phase' => 'Vorverarbeitung',
                'storage' => null,
            ],
            'validate' => [
                'label' => 'Validierung',
                'description' => 'Datei-Prüfung auf Lesbarkeit und Korruption',
                'phase' => 'Vorverarbeitung',
                'storage' => null,
            ],
            'page_split' => [
                'label' => 'Seitenzerlegung',
                'description' => 'PDF in Einzelseiten zerlegen für Referenz und Vision-Analyse',
                'phase' => 'Vorverarbeitung',
                'storage' => 'ki_content.document_pages',
            ],
            'vision_analyze' => [
                'label' => 'Bildanalyse',
                'description' => 'Seiten via Vision-Modell analysieren, Bilder und Grafiken erkennen',
                'phase' => 'Vorverarbeitung',
                'storage' => 'ki_content.document_pages (vision_analysis)',
                'uses_vision' => true,
            ],
            'extract' => [
                'label' => 'Textextraktion',
                'description' => 'Text extrahieren, OCR für Bilder mit Text',
                'phase' => 'Vorverarbeitung',
                'storage' => null,
            ],
            'structure' => [
                'label' => 'Strukturerkennung',
                'description' => 'Überschriften, Listen und Hierarchie erkennen',
                'phase' => 'Vorverarbeitung',
                'storage' => 'ki_content.document_sections',
            ],
            'segment' => [
                'label' => 'Abschnitte',
                'description' => 'Logische Dokumentgliederung nach Struktur',
                'phase' => 'Vorverarbeitung',
                'storage' => 'ki_content.document_sections',
            ],
            'chunk' => [
                'label' => 'Textbausteine',
                'description' => 'Chunks erstellen (max 800 Token) mit Seitenreferenz',
                'phase' => 'Vorverarbeitung',
                'storage' => 'ki_content.chunks',
            ],

            // Phase 2: storage and vectorisation
            'metadata_store' => [
                'label' => 'DB-Speicherung',
                'description' => 'Dokument, Seiten und Chunks in MariaDB speichern',
                'phase' => 'Speicherung',
                'storage' => 'ki_content.documents, .document_pages, .chunks',
            ],
            'embed' => [
                'label' => 'Vektorisierung',
                'description' => 'Embeddings erstellen für Vektor-Suche',
                'phase' => 'Speicherung',
                'storage' => null,
                'fixed_model' => 'mxbai-embed-large (1024-dim)',
            ],
            'collection_setup' => [
                'label' => 'Collection',
                'description' => 'Qdrant-Collection einrichten falls nötig',
                'phase' => 'Speicherung',
                'storage' => 'Qdrant: {collection}',
            ],
            'vector_store' => [
                'label' => 'Vektorspeicherung',
                'description' => 'Vektoren in Qdrant mit MariaDB-ID als Referenz',
                'phase' => 'Speicherung',
                'storage' => 'Qdrant: {collection}',
            ],
            'index_optimize' => [
                'label' => 'Index-Optimierung',
                'description' => 'HNSW-Index für schnelle Suche optimieren',
                'phase' => 'Speicherung',
                'storage' => 'Qdrant: {collection}',
            ],

            // Phase 3: knowledge extraction (3 levels)
            'knowledge_page' => [
                'label' => 'Seiten-Wissen',
                'description' => 'Pro Seite: Entitäten → Semantik → Ontologie → Taxonomie',
                'phase' => 'Wissen',
                'storage' => 'ki_content.page_knowledge, .entities, .entity_semantics',
                'uses_llm' => true,
            ],
            'knowledge_section' => [
                'label' => 'Abschnitt-Wissen',
                'description' => 'Pro Kapitel: Aggregierte Wissensrepräsentation',
                'phase' => 'Wissen',
                'storage' => 'ki_content.section_knowledge',
                'uses_llm' => true,
            ],
            'knowledge_document' => [
                'label' => 'Dokument-Wissen',
                'description' => 'Konsolidierte Gesamtsicht des Dokuments',
                'phase' => 'Wissen',
                'storage' => 'ki_content.document_knowledge',
                'uses_llm' => true,
            ],
            'knowledge_validate' => [
                'label' => 'Wissens-Validierung',
                'description' => 'Abgleich mit DB, Duplikate zusammenführen, neue validieren',
                'phase' => 'Wissen',
                'storage' => 'ki_content.entities (merged)',
            ],

            // Legacy analysis steps
            'entity_extract' => [
                'label' => 'Entitäten (Legacy)',
                'description' => 'Personen, Organisationen, Konzepte, Methoden erkennen',
                'phase' => 'Analyse',
                'storage' => 'ki_content.chunk_entities',
                'uses_llm' => true,
            ],
            'relation_extract' => [
                'label' => 'Beziehungen (Legacy)',
                'description' => 'Relationen zwischen Entitäten extrahieren',
                'phase' => 'Analyse',
                'storage' => 'ki_content.entity_relations',
                'uses_llm' => true,
            ],
            'taxonomy_build' => [
                'label' => 'Taxonomie (Legacy)',
                'description' => 'Hierarchische Kategorisierung aufbauen',
                'phase' => 'Analyse',
                'storage' => 'ki_content.chunk_taxonomy, .taxonomy_terms',
                'uses_llm' => true,
            ],
            'semantic_analyze' => [
                'label' => 'Semantik (Legacy)',
                'description' => 'Bedeutungs-Analyse, Konzepte und Definitionen',
                'phase' => 'Analyse',
                'storage' => 'ki_content.chunk_semantics',
                'uses_llm' => true,
            ],
            'summarize' => [
                'label' => 'Zusammenfassung',
                'description' => 'Dokument- und Seiten-Zusammenfassungen erstellen',
                'phase' => 'Analyse',
                'storage' => 'ki_content.documents (summary), .document_pages',
                'uses_llm' => true,
            ],
            'question_generate' => [
                'label' => 'Fragengenerierung',
                'description' => 'Beispielfragen für RAG-Chat erstellen',
                'phase' => 'Analyse',
                'storage' => 'ki_content.generated_questions',
                'uses_llm' => true,
            ],
            'finalize' => [
                'label' => 'Abschluss',
                'description' => 'Status finalisieren und Job beenden',
                'phase' => 'Analyse',
                'storage' => 'ki_content.documents (status)',
            ],

            // Legacy
            'analyze' => [
                'label' => 'Analyse (Legacy)',
                'description' => 'Kombinierte Analyse (veraltet)',
                'phase' => 'Analyse',
                'storage' => 'ki_content.chunk_entities, .chunk_semantics',
                'uses_llm' => true,
            ],
        ];
    }

    /**
     * Turn a user-supplied extension list into normalised extensions.
     *
     * Entries are separated by whitespace, commas or semicolons. Each entry is
     * lower-cased and given exactly one leading dot; duplicates are dropped
     * while keeping first-seen order. An input that yields no extension
     * returns the default list.
     *
     * @param string $input Raw list, e.g. "pdf, .DOCX; md".
     *
     * @return list<string>
     */
    private function parseExtensions(string $input): array
    {
        $defaults = ['.pdf', '.docx', '.pptx', '.md', '.txt'];

        $parts = preg_split('/[\s,;]+/', $input, -1, PREG_SPLIT_NO_EMPTY);

        if ($parts === false) {
            return $defaults;
        }

        // Used as a set: every key starts with ".", so PHP keeps the keys as
        // strings and array_keys() returns them in first-seen order.
        $seen = [];

        foreach ($parts as $part) {
            $bare = ltrim($part, '.');

            if ($bare === '') {
                // A lone "." carries no extension.
                continue;
            }

            $seen['.' . strtolower($bare)] = true;
        }

        return $seen !== [] ? array_keys($seen) : $defaults;
    }

    /**
     * Attach the default step set to a freshly created pipeline.
     *
     * Adds detect, extract, chunk and embed as enabled steps and the legacy
     * analyze step as disabled, in that sort order.
     *
     * @param int $pipelineId Identifier of the pipeline receiving the steps.
     */
    private function createDefaultSteps(int $pipelineId): void
    {
        $defaultSteps = [
            [
                'step_type' => 'detect',
                'config' => ['hash_algorithm' => 'sha256'],
                'sort_order' => 1,
                'enabled' => 1,
            ],
            [
                'step_type' => 'extract',
                'config' => ['ocr_enabled' => true, 'ocr_language' => 'deu'],
                'sort_order' => 2,
                'enabled' => 1,
            ],
            [
                'step_type' => 'chunk',
                'config' => ['min_size' => 100, 'max_size' => 2000, 'overlap' => 0.1],
                'sort_order' => 3,
                'enabled' => 1,
            ],
            [
                'step_type' => 'embed',
                'config' => ['model' => 'mxbai-embed-large', 'collection' => 'documents', 'dimensions' => 1024],
                'sort_order' => 4,
                'enabled' => 1,
            ],
            [
                'step_type' => 'analyze',
                'config' => ['extract_entities' => true, 'extract_relations' => true, 'classify_taxonomy' => true],
                'sort_order' => 5,
                'enabled' => 0,
            ],
        ];

        foreach ($defaultSteps as $step) {
            $this->repository->addStep($pipelineId, $step);
        }
    }
}