Doc2VectorPipeline.php

Code Hygiene Score: 100

Keine Issues gefunden.

Dependencies 5

Klassen 1

Funktionen 7

Versionen 10

Code

<?php

declare(strict_types=1);

namespace Infrastructure\Docs;

// @responsibility: Orchestrates the Doc2Vector pipeline (chunking → analysis → sync)

use Domain\Constants;

/**
 * Facade that wires the Doc2Vector stages together: chunking, chunk analysis,
 * sync and search. The batch entry points (runFull / runIncremental) print
 * their progress with echo/printf while they run.
 */
final class Doc2VectorPipeline
{
    public function __construct(
        private ChunkingService $chunking,
        private ChunkAnalysisService $analysis,
        private ChunkSyncService $sync,
        private HybridSearchService $search
    ) {
    }

    /**
     * Executes all three stages in order (chunkAll, analyzeAllPending,
     * syncAllPending) and reports each stage's counters on the output.
     *
     * The analysis and sync stages are both invoked with Constants::BATCH_LIMIT.
     * The measured duration covers the three stages, not the final summary line.
     *
     * @return array{
     *     chunking: array{documents: int, chunks: int, tokens: int, errors: array<string>},
     *     analysis: array{processed: int, failed: int, errors: array<string>},
     *     sync: array{synced: int, failed: int, errors: array<string>},
     *     duration_seconds: float
     * }
     */
    public function runFull(): array
    {
        $startedAt = microtime(true);

        echo '=== Doc2Vector Pipeline ===', PHP_EOL, PHP_EOL;

        // Stage 1: chunk every document.
        echo 'Stage 1: Chunking documents...', PHP_EOL;
        $chunkingReport = $this->chunking->chunkAll();
        printf(
            '  Completed: %d documents, %d chunks, %d tokens' . PHP_EOL,
            $chunkingReport['documents'],
            $chunkingReport['chunks'],
            $chunkingReport['tokens']
        );

        // Stage 2: analyze the pending chunks.
        echo PHP_EOL, 'Stage 2: LLM Analysis (this may take a while)...', PHP_EOL;
        $analysisReport = $this->analysis->analyzeAllPending(Constants::BATCH_LIMIT);
        printf(
            '  Completed: %d analyzed, %d failed' . PHP_EOL,
            $analysisReport['processed'],
            $analysisReport['failed']
        );

        // Stage 3: sync the pending chunks.
        echo PHP_EOL, 'Stage 3: Syncing to Qdrant...', PHP_EOL;
        $syncReport = $this->sync->syncAllPending(Constants::BATCH_LIMIT);
        printf(
            '  Completed: %d synced, %d failed' . PHP_EOL,
            $syncReport['synced'],
            $syncReport['failed']
        );

        // Stop the clock before printing the summary so the output is not timed.
        $elapsed = microtime(true) - $startedAt;

        echo PHP_EOL;
        printf('Pipeline completed in %.1f seconds' . PHP_EOL, $elapsed);

        return [
            'chunking' => $chunkingReport,
            'analysis' => $analysisReport,
            'sync' => $syncReport,
            'duration_seconds' => $elapsed,
        ];
    }

    /**
     * Runs the analysis and sync stages for whatever is still pending.
     *
     * No chunking stage is executed here. Both stages are invoked with
     * Constants::DEFAULT_LIMIT. The result holds the keys 'analysis', 'sync'
     * and 'duration_seconds'.
     *
     * @return array<string, mixed>
     */
    public function runIncremental(): array
    {
        $startedAt = microtime(true);

        echo '=== Incremental Pipeline ===', PHP_EOL, PHP_EOL;

        // Analysis of pending chunks.
        echo 'Analyzing pending chunks...', PHP_EOL;
        $analysisReport = $this->analysis->analyzeAllPending(Constants::DEFAULT_LIMIT);
        printf(
            '  %d analyzed, %d failed' . PHP_EOL,
            $analysisReport['processed'],
            $analysisReport['failed']
        );

        // Sync of chunks that have not been synced yet.
        echo 'Syncing to Qdrant...', PHP_EOL;
        $syncReport = $this->sync->syncAllPending(Constants::DEFAULT_LIMIT);
        printf(
            '  %d synced, %d failed' . PHP_EOL,
            $syncReport['synced'],
            $syncReport['failed']
        );

        $elapsed = microtime(true) - $startedAt;

        return [
            'analysis' => $analysisReport,
            'sync' => $syncReport,
            'duration_seconds' => $elapsed,
        ];
    }

    /**
     * Collects the statistics of the chunking, analysis and sync services and
     * adds the taxonomy categories reported by the search service.
     *
     * @return array<string, mixed>
     */
    public function getStats(): array
    {
        // Array elements are evaluated top to bottom, so the services are
        // queried in the order chunking, analysis, sync, search.
        return [
            'chunks' => $this->chunking->getStats(),
            'analysis' => $this->analysis->getStats(),
            'qdrant' => $this->sync->getStats(),
            'taxonomy_categories' => $this->search->getTaxonomyCategories(),
        ];
    }

    /**
     * Forwards a search request unchanged to the search service.
     *
     * @param array<string, mixed> $filters
     * @return array<array<string, mixed>>
     */
    public function search(string $query, array $filters = [], int $limit = 10): array
    {
        $hits = $this->search->search($query, $filters, $limit);

        return $hits;
    }

    /**
     * Delegates to the chunking service to chunk the document with the given id.
     *
     * @return array{chunks_created: int, tokens_total: int}
     */
    public function rechunkDocument(int $docId): array
    {
        $outcome = $this->chunking->chunkDocument($docId);

        return $outcome;
    }

    /**
     * Delegates to the analysis service to analyze the chunk with the given id.
     *
     * @return array{taxonomy: array<string>, entities: array<mixed>, keywords: array<string>}
     */
    public function reanalyzeChunk(int $chunkId): array
    {
        $outcome = $this->analysis->analyzeChunk($chunkId);

        return $outcome;
    }
}
← Übersicht Graph