Backup #427

ID: 427
Dateipfad: /var/www/dev.campus.systemische-tools.de/src/Controller/ContentPipelineController.php
Version: 5
Typ: modified
Größe: 14.8 KB
Hash: cb790bac21ed685af7ac26531d27f12a738ec85fd62461d6bfeefe58d61b2f92
Datum: 2025-12-22 10:01:55
Geändert von: claude-code-hook
Grund: Claude Code Pre-Hook Backup vor Edit-Operation
Datei existiert: Ja

Dateiinhalt

<?php

namespace Controller;

use Framework\Controller;
use Infrastructure\AI\ModelConfig;
use Infrastructure\Persistence\PipelineRepository;

/**
 * HTTP controller for managing content pipelines.
 *
 * Lists, creates, edits, deletes and runs pipelines through the
 * PipelineRepository, and starts the external Python pipeline script as a
 * background process. Flash messages are passed via $_SESSION['success'] and
 * $_SESSION['error'].
 */
class ContentPipelineController extends Controller
{
    /** Source directory used when a new pipeline is created without a path. */
    private const DEFAULT_SOURCE_PATH = '/var/www/nextcloud/data/root/files/Documents';

    /** Extension list used when the submitted extension field yields nothing usable. */
    private const DEFAULT_EXTENSIONS = ['.pdf', '.docx', '.pptx', '.md', '.txt'];

    private PipelineRepository $repository;

    public function __construct()
    {
        $this->repository = new PipelineRepository();
    }

    /**
     * GET /content-pipeline
     *
     * Renders the overview with all pipelines and the repository statistics.
     */
    public function index(): void
    {
        $this->view('content-pipeline.index', [
            'title' => 'Content Pipeline',
            'pipelines' => $this->repository->findAll(),
            'stats' => $this->repository->getStatistics(),
        ]);
    }

    /**
     * GET /content-pipeline/import
     *
     * Shows the import page for the default pipeline. When no default exists,
     * the first entry of findAll(1) is used; when there is no pipeline at all,
     * both view variables are null.
     */
    public function import(): void
    {
        $pipeline = $this->repository->findDefault();

        if ($pipeline === null) {
            $candidates = $this->repository->findAll(1);
            $pipeline = $candidates[0] ?? null;
        }

        $latestRun = null;
        if ($pipeline !== null) {
            $latestRun = $this->repository->findLatestRun((int) $pipeline['id']);
        }

        $this->view('content-pipeline.import', [
            'title' => 'Import Pipeline',
            'pipeline' => $pipeline,
            'latestRun' => $latestRun,
        ]);
    }

    /**
     * GET /content-pipeline/new
     *
     * Renders the empty pipeline form.
     */
    public function pipelineNew(): void
    {
        $this->view('content-pipeline.form', [
            'title' => 'Neue Pipeline',
            'pipeline' => null,
            'stepTypes' => $this->getStepTypes(),
        ]);
    }

    /**
     * GET /content-pipeline/{id}
     *
     * Renders a single pipeline together with its 10 most recent runs.
     *
     * @param string $id Pipeline id from the route; cast to int before use.
     */
    public function show(string $id): void
    {
        $pipelineId = (int) $id;
        $pipeline = $this->repository->findById($pipelineId);

        if ($pipeline === null) {
            $this->notFound('Pipeline nicht gefunden');

            // Guard: notFound() is defined outside this file; do not touch the
            // null pipeline should it return instead of terminating.
            return;
        }

        $this->view('content-pipeline.show', [
            'title' => 'Pipeline: ' . $pipeline['name'],
            'pipeline' => $pipeline,
            'runs' => $this->repository->findRuns($pipelineId, 10),
            'stepTypes' => $this->getStepTypes(),
        ]);
    }

    /**
     * GET /content-pipeline/{id}/edit
     *
     * Renders the pipeline form pre-filled with an existing pipeline.
     *
     * @param string $id Pipeline id from the route; cast to int before use.
     */
    public function edit(string $id): void
    {
        $pipeline = $this->repository->findById((int) $id);

        if ($pipeline === null) {
            $this->notFound('Pipeline nicht gefunden');

            // Guard in case notFound() returns (its definition is not visible here).
            return;
        }

        $this->view('content-pipeline.form', [
            'title' => 'Pipeline bearbeiten: ' . $pipeline['name'],
            'pipeline' => $pipeline,
            'stepTypes' => $this->getStepTypes(),
        ]);
    }

    /**
     * POST /content-pipeline
     *
     * Creates a pipeline from the submitted form, attaches the default steps
     * and redirects to the new pipeline. A missing name redirects back to the
     * form with an error flash message.
     */
    public function store(): void
    {
        $this->requireCsrf();

        $form = $this->readPipelineForm();

        if ($form['name'] === '') {
            $_SESSION['error'] = 'Name ist erforderlich.';
            header('Location: /content-pipeline/new');
            exit;
        }

        // A form field that is present but blank must also fall back to the
        // default; `??` alone only covers a field that is absent.
        if ($form['source_path'] === '') {
            $form['source_path'] = self::DEFAULT_SOURCE_PATH;
        }

        $pipelineId = $this->repository->create($form);

        // Attach the standard steps to the freshly created pipeline.
        $this->createDefaultSteps($pipelineId);

        $_SESSION['success'] = 'Pipeline erfolgreich erstellt.';
        header('Location: /content-pipeline/' . $pipelineId);
        exit;
    }

    /**
     * POST /content-pipeline/{id}
     *
     * Updates an existing pipeline from the submitted form. The submitted
     * source path is stored as given, including an empty value.
     *
     * @param string $id Pipeline id from the route; cast to int before use.
     */
    public function update(string $id): void
    {
        $this->requireCsrf();

        $pipelineId = (int) $id;

        if ($this->repository->findById($pipelineId) === null) {
            $this->notFound('Pipeline nicht gefunden');

            // Guard in case notFound() returns (its definition is not visible here).
            return;
        }

        $form = $this->readPipelineForm();

        if ($form['name'] === '') {
            $_SESSION['error'] = 'Name ist erforderlich.';
            // Redirect with the normalised integer id, not the raw route string.
            header('Location: /content-pipeline/' . $pipelineId . '/edit');
            exit;
        }

        $this->repository->update($pipelineId, $form);

        $_SESSION['success'] = 'Pipeline aktualisiert.';
        header('Location: /content-pipeline/' . $pipelineId);
        exit;
    }

    /**
     * POST /content-pipeline/{id}/run
     *
     * Records a new run and launches the Python pipeline script in the
     * background, with its output redirected to a per-run log file in /tmp.
     *
     * @param string $id Pipeline id from the route; cast to int before use.
     */
    public function run(string $id): void
    {
        $this->requireCsrf();

        $pipelineId = (int) $id;

        if ($this->repository->findById($pipelineId) === null) {
            $this->notFound('Pipeline nicht gefunden');

            // Guard in case notFound() returns (its definition is not visible here).
            return;
        }

        // Create the run record first so its id can be handed to the script.
        $runId = $this->repository->createRun($pipelineId);

        $pipelineScript = '/opt/scripts/pipeline/pipeline.py';
        $venvPython = '/opt/scripts/pipeline/venv/bin/python';
        $logFile = '/tmp/pipeline_run_' . $runId . '.log';

        // nohup + trailing `&` detach the process so the request returns
        // immediately; every string argument is shell-escaped, ids go via %d.
        $cmd = sprintf(
            'nohup %s %s all --pipeline-id=%d --run-id=%d > %s 2>&1 &',
            escapeshellarg($venvPython),
            escapeshellarg($pipelineScript),
            $pipelineId,
            $runId,
            escapeshellarg($logFile)
        );

        // exec() returns false when the command could not be launched; report
        // that instead of claiming the pipeline was started.
        if (exec($cmd) === false) {
            $_SESSION['error'] = 'Pipeline konnte nicht gestartet werden (Run #' . $runId . ')';
        } else {
            $_SESSION['success'] = 'Pipeline gestartet (Run #' . $runId . ')';
        }

        header('Location: /content-pipeline/' . $pipelineId);
        exit;
    }

    /**
     * GET /content-pipeline/{id}/status
     *
     * AJAX endpoint: responds with the latest run of a pipeline as JSON, or a
     * 404 JSON error when the pipeline does not exist.
     *
     * @param string $id Pipeline id from the route; cast to int before use.
     */
    public function status(string $id): void
    {
        $pipelineId = (int) $id;

        if ($this->repository->findById($pipelineId) === null) {
            $this->json(['error' => 'Pipeline nicht gefunden'], 404);

            return;
        }

        $this->json([
            'pipeline_id' => $pipelineId,
            'run' => $this->repository->findLatestRun($pipelineId),
        ]);
    }

    /**
     * POST /content-pipeline/{id}/steps/{stepId}/toggle
     *
     * Flips the `enabled` flag of one step of the pipeline. A step id that
     * does not belong to the pipeline is ignored.
     *
     * @param string $id     Pipeline id from the route; cast to int before use.
     * @param string $stepId Step id from the route; cast to int before use.
     */
    public function toggleStep(string $id, string $stepId): void
    {
        $this->requireCsrf();

        $pipelineId = (int) $id;
        $targetStepId = (int) $stepId;
        $pipeline = $this->repository->findById($pipelineId);

        if ($pipeline === null) {
            $this->notFound('Pipeline nicht gefunden');

            // Guard in case notFound() returns (its definition is not visible here).
            return;
        }

        // Only toggle a step that is listed on this pipeline; a missing
        // `steps` key is treated as "no steps".
        foreach ($pipeline['steps'] ?? [] as $step) {
            if ((int) $step['id'] !== $targetStepId) {
                continue;
            }

            $this->repository->updateStep($targetStepId, [
                'enabled' => $step['enabled'] ? 0 : 1,
            ]);
            break;
        }

        header('Location: /content-pipeline/' . $pipelineId);
        exit;
    }

    /**
     * POST /content-pipeline/{id}/delete
     *
     * Deletes a pipeline and redirects to the overview.
     *
     * @param string $id Pipeline id from the route; cast to int before use.
     */
    public function delete(string $id): void
    {
        $this->requireCsrf();

        $pipelineId = (int) $id;

        if ($this->repository->findById($pipelineId) === null) {
            $this->notFound('Pipeline nicht gefunden');

            // Guard in case notFound() returns (its definition is not visible here).
            return;
        }

        $this->repository->delete($pipelineId);

        $_SESSION['success'] = 'Pipeline geloescht.';
        header('Location: /content-pipeline');
        exit;
    }

    /**
     * Catalogue of known step types, keyed by step type identifier.
     *
     * Each entry carries the display label, a description and the phase name
     * handed to the views.
     *
     * @return array<string, array<string, mixed>>
     */
    private function getStepTypes(): array
    {
        return [
            // Phase 1: preprocessing
            'detect' => [
                'label' => 'Erkennung',
                'description' => 'Dateien scannen und Format prüfen',
                'phase' => 'Vorverarbeitung',
            ],
            'validate' => [
                'label' => 'Validierung',
                'description' => 'Datei-Prüfung auf Lesbarkeit und Korruption',
                'phase' => 'Vorverarbeitung',
            ],
            'page_split' => [
                'label' => 'Seitenzerlegung',
                'description' => 'PDF in Einzelseiten zerlegen für Referenz und Vision-Analyse',
                'phase' => 'Vorverarbeitung',
            ],
            'vision_analyze' => [
                'label' => 'Bildanalyse',
                'description' => 'Seiten via Vision-Modell analysieren, Bilder und Grafiken erkennen',
                'phase' => 'Vorverarbeitung',
            ],
            'extract' => [
                'label' => 'Textextraktion',
                'description' => 'Text extrahieren, OCR für Bilder mit Text',
                'phase' => 'Vorverarbeitung',
            ],
            'structure' => [
                'label' => 'Strukturerkennung',
                'description' => 'Überschriften, Listen und Hierarchie erkennen',
                'phase' => 'Vorverarbeitung',
            ],
            'segment' => [
                'label' => 'Abschnitte',
                'description' => 'Logische Dokumentgliederung nach Struktur',
                'phase' => 'Vorverarbeitung',
            ],
            // NOTE(review): the text below says "max 800 Token" while
            // createDefaultSteps() configures max_size 2000 — confirm which
            // value (and unit) is intended.
            'chunk' => [
                'label' => 'Textbausteine',
                'description' => 'Chunks erstellen (max 800 Token) mit Seitenreferenz',
                'phase' => 'Vorverarbeitung',
            ],
            // Phase 2: storage and vectorisation
            'metadata_store' => [
                'label' => 'DB-Speicherung',
                'description' => 'Dokument, Seiten und Chunks in MariaDB speichern',
                'phase' => 'Speicherung',
            ],
            'embed' => [
                'label' => 'Vektorisierung',
                'description' => 'Embeddings mit mxbai-embed-large (1024-dim)',
                'phase' => 'Speicherung',
            ],
            'collection_setup' => [
                'label' => 'Collection',
                'description' => 'Qdrant-Collection einrichten falls nötig',
                'phase' => 'Speicherung',
            ],
            'vector_store' => [
                'label' => 'Vektorspeicherung',
                'description' => 'Vektoren in Qdrant mit MariaDB-ID als Referenz',
                'phase' => 'Speicherung',
            ],
            'index_optimize' => [
                'label' => 'Index-Optimierung',
                'description' => 'HNSW-Index für schnelle Suche optimieren',
                'phase' => 'Speicherung',
            ],
            // Phase 3: knowledge extraction (3 levels)
            'knowledge_page' => [
                'label' => 'Seiten-Wissen',
                'description' => 'Pro Seite: Entitäten → Semantik → Ontologie → Taxonomie',
                'phase' => 'Wissen',
            ],
            'knowledge_section' => [
                'label' => 'Abschnitt-Wissen',
                'description' => 'Pro Kapitel: Aggregierte Wissensrepräsentation',
                'phase' => 'Wissen',
            ],
            'knowledge_document' => [
                'label' => 'Dokument-Wissen',
                'description' => 'Konsolidierte Gesamtsicht des Dokuments',
                'phase' => 'Wissen',
            ],
            'knowledge_validate' => [
                'label' => 'Wissens-Validierung',
                'description' => 'Abgleich mit DB, Duplikate zusammenführen, neue validieren',
                'phase' => 'Wissen',
            ],
            // Legacy analysis steps
            'entity_extract' => [
                'label' => 'Entitäten (Legacy)',
                'description' => 'Personen, Organisationen, Konzepte, Methoden erkennen',
                'phase' => 'Analyse',
            ],
            'relation_extract' => [
                'label' => 'Beziehungen (Legacy)',
                'description' => 'Relationen zwischen Entitäten extrahieren',
                'phase' => 'Analyse',
            ],
            'taxonomy_build' => [
                'label' => 'Taxonomie (Legacy)',
                'description' => 'Hierarchische Kategorisierung aufbauen',
                'phase' => 'Analyse',
            ],
            'semantic_analyze' => [
                'label' => 'Semantik (Legacy)',
                'description' => 'Bedeutungs-Analyse, Konzepte und Definitionen',
                'phase' => 'Analyse',
            ],
            'summarize' => [
                'label' => 'Zusammenfassung',
                'description' => 'Dokument- und Seiten-Zusammenfassungen erstellen',
                'phase' => 'Analyse',
            ],
            'question_generate' => [
                'label' => 'Fragengenerierung',
                'description' => 'Beispielfragen für RAG-Chat erstellen',
                'phase' => 'Analyse',
            ],
            'finalize' => [
                'label' => 'Abschluss',
                'description' => 'Status finalisieren und Job beenden',
                'phase' => 'Analyse',
            ],
            // Legacy
            'analyze' => [
                'label' => 'Analyse (Legacy)',
                'description' => 'Kombinierte Analyse (veraltet)',
                'phase' => 'Analyse',
            ],
        ];
    }

    /**
     * Reads one POST field as a trimmed string.
     *
     * Non-string values (for example an array sent as `name[]=x`) are treated
     * as absent so they cannot raise a TypeError in the string functions
     * applied afterwards.
     *
     * @param string $key     POST field name.
     * @param string $default Value returned when the field is absent or not a string.
     */
    private function readPostedString(string $key, string $default = ''): string
    {
        $value = $_POST[$key] ?? $default;

        return is_string($value) ? trim($value) : $default;
    }

    /**
     * Collects the pipeline form fields from $_POST in the shape expected by
     * the repository's create() and update() calls.
     *
     * @return array{name: string, description: string, source_path: string, extensions: array<string>, is_default: int}
     */
    private function readPipelineForm(): array
    {
        return [
            'name' => $this->readPostedString('name'),
            'description' => $this->readPostedString('description'),
            'source_path' => $this->readPostedString('source_path'),
            'extensions' => $this->parseExtensions($this->readPostedString('extensions')),
            // Checkbox semantics: the mere presence of the field means "on".
            'is_default' => isset($_POST['is_default']) ? 1 : 0,
        ];
    }

    /**
     * Turns a free-form extension list into normalised extensions.
     *
     * Entries may be separated by whitespace, commas or semicolons. Each entry
     * is lower-cased and given exactly one leading dot; leading `*` and `.`
     * characters (as in `*.pdf`) are dropped and duplicates are removed while
     * keeping first-seen order. When nothing usable remains, the default
     * extension list is returned.
     *
     * @param string $input Raw field value, e.g. "pdf, .DOCX; *.md".
     * @return array<string>
     */
    private function parseExtensions(string $input): array
    {
        $parts = preg_split('/[\s,;]+/', $input, -1, PREG_SPLIT_NO_EMPTY);

        if ($parts === false) {
            return self::DEFAULT_EXTENSIONS;
        }

        // Collected as array keys so duplicates collapse. Keys always start
        // with '.', so PHP never converts them to integer keys.
        $seen = [];

        foreach ($parts as $part) {
            $bare = ltrim($part, '*.');

            if ($bare === '') {
                continue;
            }

            $seen['.' . strtolower($bare)] = true;
        }

        return $seen !== [] ? array_keys($seen) : self::DEFAULT_EXTENSIONS;
    }

    /**
     * Adds the standard set of steps to a newly created pipeline.
     *
     * Four steps are enabled (detect, extract, chunk, embed); the combined
     * `analyze` step is added disabled.
     *
     * @param int $pipelineId Id of the pipeline the steps are attached to.
     */
    private function createDefaultSteps(int $pipelineId): void
    {
        $defaultSteps = [
            [
                'step_type' => 'detect',
                'config' => ['hash_algorithm' => 'sha256'],
                'sort_order' => 1,
                'enabled' => 1,
            ],
            [
                'step_type' => 'extract',
                'config' => ['ocr_enabled' => true, 'ocr_language' => 'deu'],
                'sort_order' => 2,
                'enabled' => 1,
            ],
            [
                'step_type' => 'chunk',
                'config' => ['min_size' => 100, 'max_size' => 2000, 'overlap' => 0.1],
                'sort_order' => 3,
                'enabled' => 1,
            ],
            [
                'step_type' => 'embed',
                'config' => ['model' => 'mxbai-embed-large', 'collection' => 'documents', 'dimensions' => 1024],
                'sort_order' => 4,
                'enabled' => 1,
            ],
            [
                'step_type' => 'analyze',
                'config' => ['extract_entities' => true, 'extract_relations' => true, 'classify_taxonomy' => true],
                'sort_order' => 5,
                'enabled' => 0,
            ],
        ];

        foreach ($defaultSteps as $step) {
            $this->repository->addStep($pipelineId, $step);
        }
    }
}

Vollständig herunterladen

Aktionen

Herunterladen

Andere Versionen dieser Datei

ID Version Typ Größe Datum
1701 33 modified 9.9 KB 2025-12-27 12:20
1683 32 modified 9.9 KB 2025-12-27 12:02
1682 31 modified 9.9 KB 2025-12-27 12:02
1681 30 modified 9.9 KB 2025-12-27 12:02
1299 29 modified 9.8 KB 2025-12-25 13:28
1298 28 modified 10.2 KB 2025-12-25 13:28
1275 27 modified 10.2 KB 2025-12-25 12:52
680 26 modified 10.2 KB 2025-12-23 07:44
678 25 modified 10.1 KB 2025-12-23 07:39
605 24 modified 10.4 KB 2025-12-23 04:39
603 23 modified 11.2 KB 2025-12-23 04:31
602 22 modified 10.9 KB 2025-12-23 04:31
601 21 modified 11.3 KB 2025-12-23 04:30
600 20 modified 13.3 KB 2025-12-23 04:30
599 19 modified 13.4 KB 2025-12-23 04:30
598 18 modified 13.9 KB 2025-12-23 04:30
597 17 modified 13.4 KB 2025-12-23 04:29
585 16 modified 13.3 KB 2025-12-23 04:24
577 15 modified 12.1 KB 2025-12-23 04:19
573 14 modified 8.0 KB 2025-12-23 04:11
572 13 modified 8.0 KB 2025-12-23 04:10
528 12 modified 20.4 KB 2025-12-22 19:03
477 11 modified 18.6 KB 2025-12-22 15:21
476 10 modified 18.5 KB 2025-12-22 15:21
475 9 modified 18.4 KB 2025-12-22 15:20
472 8 modified 18.4 KB 2025-12-22 15:19
439 7 modified 16.5 KB 2025-12-22 10:14
428 6 modified 14.9 KB 2025-12-22 10:02
427 5 modified 14.8 KB 2025-12-22 10:01
426 4 modified 14.8 KB 2025-12-22 10:01
421 3 modified 13.8 KB 2025-12-22 09:55
419 2 modified 13.8 KB 2025-12-22 09:42
418 1 modified 10.6 KB 2025-12-22 09:35

← Zurück zur Übersicht