Doc2VectorPipeline.php
- Pfad: src/Infrastructure/Docs/Doc2VectorPipeline.php
- Namespace: Infrastructure\Docs
- Zeilen: 155 | Größe: 4,685 Bytes
- Geändert: 2025-12-27 23:46:05 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 100
- Dependencies: 100 (25%)
- LOC: 100 (20%)
- Methods: 100 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 100 (10%)
Keine Issues gefunden.
Dependencies 5
- constructor Infrastructure\Docs\ChunkingService
- constructor Infrastructure\Docs\ChunkAnalysisService
- constructor Infrastructure\Docs\ChunkSyncService
- constructor Infrastructure\Docs\HybridSearchService
- use Domain\Constants
Klassen 1
- Doc2VectorPipeline class Zeile 11
Funktionen 7
- __construct() public Zeile 13
- runFull() public Zeile 31
- runIncremental() public Zeile 82
- getStats() public Zeile 110
- search() public Zeile 130
- rechunkDocument() public Zeile 140
- reanalyzeChunk() public Zeile 150
Versionen 10
- v10
  2025-12-27 23:46 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v9
  2025-12-27 23:45 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v8
  2025-12-27 23:45 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v7
  2025-12-27 23:45 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v6
  2025-12-27 23:45 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v5
  2025-12-25 17:32 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v4
  2025-12-25 17:31 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v3
  2025-12-25 17:31 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v2
  2025-12-23 08:50 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v1
  2025-12-23 08:05 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
Code
<?php
declare(strict_types=1);
namespace Infrastructure\Docs;
// @responsibility: Orchestriert Doc2Vector-Pipeline (Chunking → Analyse → Sync)
use Domain\Constants;
/**
 * Facade over the chunking, analysis, sync and search services.
 *
 * The run* methods execute the stages in a fixed order and write progress
 * lines to standard output via echo/printf; the remaining methods are
 * direct delegations to a single collaborator.
 */
final class Doc2VectorPipeline
{
    /**
     * Stores the four collaborating services as private promoted properties.
     */
    public function __construct(
        private ChunkingService $chunking,
        private ChunkAnalysisService $analysis,
        private ChunkSyncService $sync,
        private HybridSearchService $search,
    ) {
    }

    /**
     * Executes all three stages (chunking, analysis, sync) in sequence,
     * printing a progress line before and a summary line after each stage.
     *
     * Analysis and sync are both invoked with Constants::BATCH_LIMIT
     * (value defined in Domain\Constants, not visible in this file).
     *
     * @return array{
     *     chunking: array{documents: int, chunks: int, tokens: int, errors: array<string>},
     *     analysis: array{processed: int, failed: int, errors: array<string>},
     *     sync: array{synced: int, failed: int, errors: array<string>},
     *     duration_seconds: float
     * }
     */
    public function runFull(): array
    {
        $startedAt = microtime(true);

        echo '=== Doc2Vector Pipeline ===', PHP_EOL, PHP_EOL;

        // Stage 1: split every document into chunks.
        echo 'Stage 1: Chunking documents...', PHP_EOL;
        $chunkingReport = $this->chunking->chunkAll();
        printf(
            ' Completed: %d documents, %d chunks, %d tokens' . PHP_EOL,
            $chunkingReport['documents'],
            $chunkingReport['chunks'],
            $chunkingReport['tokens'],
        );

        // Stage 2: analyze the chunks that are still pending.
        echo PHP_EOL, 'Stage 2: LLM Analysis (this may take a while)...', PHP_EOL;
        $analysisReport = $this->analysis->analyzeAllPending(Constants::BATCH_LIMIT);
        printf(
            ' Completed: %d analyzed, %d failed' . PHP_EOL,
            $analysisReport['processed'],
            $analysisReport['failed'],
        );

        // Stage 3: push the pending chunks to Qdrant.
        echo PHP_EOL, 'Stage 3: Syncing to Qdrant...', PHP_EOL;
        $syncReport = $this->sync->syncAllPending(Constants::BATCH_LIMIT);
        printf(
            ' Completed: %d synced, %d failed' . PHP_EOL,
            $syncReport['synced'],
            $syncReport['failed'],
        );

        // Measure once so the printed and the returned duration agree.
        $elapsedSeconds = microtime(true) - $startedAt;
        printf(PHP_EOL . 'Pipeline completed in %.1f seconds' . PHP_EOL, $elapsedSeconds);

        return [
            'chunking' => $chunkingReport,
            'analysis' => $analysisReport,
            'sync' => $syncReport,
            'duration_seconds' => $elapsedSeconds,
        ];
    }

    /**
     * Runs only the analysis and sync stages on pending chunks; no chunking
     * call is made here.
     *
     * Both stages are invoked with Constants::DEFAULT_LIMIT (value defined in
     * Domain\Constants, not visible in this file). Unlike runFull(), no
     * closing duration line is printed; the duration is only returned.
     *
     * @return array{analysis: array<string, mixed>, sync: array<string, mixed>, duration_seconds: float}
     */
    public function runIncremental(): array
    {
        $startedAt = microtime(true);

        echo '=== Incremental Pipeline ===', PHP_EOL, PHP_EOL;

        // Analysis pass over pending chunks.
        echo 'Analyzing pending chunks...', PHP_EOL;
        $analysisReport = $this->analysis->analyzeAllPending(Constants::DEFAULT_LIMIT);
        printf(
            ' %d analyzed, %d failed' . PHP_EOL,
            $analysisReport['processed'],
            $analysisReport['failed'],
        );

        // Sync pass over chunks that have not been synced yet.
        echo 'Syncing to Qdrant...', PHP_EOL;
        $syncReport = $this->sync->syncAllPending(Constants::DEFAULT_LIMIT);
        printf(
            ' %d synced, %d failed' . PHP_EOL,
            $syncReport['synced'],
            $syncReport['failed'],
        );

        return [
            'analysis' => $analysisReport,
            'sync' => $syncReport,
            'duration_seconds' => microtime(true) - $startedAt,
        ];
    }

    /**
     * Collects the statistics reported by each service into one array.
     *
     * Keys: 'chunks', 'analysis', 'qdrant' (each service's getStats() result)
     * and 'taxonomy_categories' (from the search service).
     *
     * @return array<string, mixed>
     */
    public function getStats(): array
    {
        // Array elements are evaluated top to bottom, so the services are
        // queried in the order chunking, analysis, sync, search.
        return [
            'chunks' => $this->chunking->getStats(),
            'analysis' => $this->analysis->getStats(),
            'qdrant' => $this->sync->getStats(),
            'taxonomy_categories' => $this->search->getTaxonomyCategories(),
        ];
    }

    /**
     * Forwards a query unchanged to the hybrid search service.
     *
     * @param array<string, mixed> $filters Passed through as-is.
     * @param int $limit Passed through as-is; defaults to 10.
     * @return array<array<string, mixed>>
     */
    public function search(string $query, array $filters = [], int $limit = 10): array
    {
        $hits = $this->search->search($query, $filters, $limit);

        return $hits;
    }

    /**
     * Delegates to the chunking service to chunk the document with the given ID.
     *
     * @return array{chunks_created: int, tokens_total: int}
     */
    public function rechunkDocument(int $docId): array
    {
        $chunkingOutcome = $this->chunking->chunkDocument($docId);

        return $chunkingOutcome;
    }

    /**
     * Delegates to the analysis service to analyze the chunk with the given ID.
     *
     * @return array{taxonomy: array<string>, entities: array<mixed>, keywords: array<string>}
     */
    public function reanalyzeChunk(int $chunkId): array
    {
        $analysisOutcome = $this->analysis->analyzeChunk($chunkId);

        return $analysisOutcome;
    }
}