},
     *   analysis: array{processed: int, failed: int, errors: array},
     *   sync: array{synced: int, failed: int, errors: array},
     *   duration_seconds: float
     * }
     */
    public function runFull(): array
    {
        $startedAt = microtime(true);

        echo '=== Doc2Vector Pipeline ===', PHP_EOL, PHP_EOL;

        // Stage 1: chunk everything via the chunking service.
        echo 'Stage 1: Chunking documents...', PHP_EOL;
        $chunkSummary = $this->chunking->chunkAll();
        printf(
            ' Completed: %d documents, %d chunks, %d tokens' . PHP_EOL,
            $chunkSummary['documents'],
            $chunkSummary['chunks'],
            $chunkSummary['tokens']
        );

        // Stage 2: analyze pending chunks, capped by Constants::BATCH_LIMIT.
        echo PHP_EOL, 'Stage 2: LLM Analysis (this may take a while)...', PHP_EOL;
        $analysisSummary = $this->analysis->analyzeAllPending(Constants::BATCH_LIMIT);
        printf(
            ' Completed: %d analyzed, %d failed' . PHP_EOL,
            $analysisSummary['processed'],
            $analysisSummary['failed']
        );

        // Stage 3: sync pending chunks to Qdrant, capped by Constants::BATCH_LIMIT.
        echo PHP_EOL, 'Stage 3: Syncing to Qdrant...', PHP_EOL;
        $syncSummary = $this->sync->syncAllPending(Constants::BATCH_LIMIT);
        printf(
            ' Completed: %d synced, %d failed' . PHP_EOL,
            $syncSummary['synced'],
            $syncSummary['failed']
        );

        // Elapsed time is taken once, after the last stage, and reused for
        // both the console line and the returned summary.
        $elapsed = microtime(true) - $startedAt;
        echo PHP_EOL;
        printf('Pipeline completed in %.1f seconds' . PHP_EOL, $elapsed);

        return [
            'chunking' => $chunkSummary,
            'analysis' => $analysisSummary,
            'sync' => $syncSummary,
            'duration_seconds' => $elapsed,
        ];
    }

    /**
     * Runs the analysis and sync stages only; no chunking stage is executed.
     *
     * Progress lines are written to standard output via echo/printf.
     *
     * NOTE(review): both stages pass a literal limit of 100, whereas runFull()
     * passes Constants::BATCH_LIMIT. Confirm whether the difference is intended.
     *
     * @return array{
     *   analysis: array{processed: int, failed: int, errors: array},
     *   sync: array{synced: int, failed: int, errors: array},
     *   duration_seconds: float
     * }
     */
    public function runIncremental(): array
    {
        $startedAt = microtime(true);

        echo '=== Incremental Pipeline ===', PHP_EOL, PHP_EOL;

        // Analyze whatever chunks are still pending.
        echo 'Analyzing pending chunks...', PHP_EOL;
        $analysisSummary = $this->analysis->analyzeAllPending(100);
        printf(
            ' %d analyzed, %d failed' . PHP_EOL,
            $analysisSummary['processed'],
            $analysisSummary['failed']
        );

        // Push whatever chunks are still pending to Qdrant.
        echo 'Syncing to Qdrant...', PHP_EOL;
        $syncSummary = $this->sync->syncAllPending(100);
        printf(
            ' %d synced, %d failed' . PHP_EOL,
            $syncSummary['synced'],
            $syncSummary['failed']
        );

        return [
            'analysis' => $analysisSummary,
            'sync' => $syncSummary,
            'duration_seconds' => microtime(true) - $startedAt,
        ];
    }

    /**
     * Collects statistics from each collaborating service into one array.
     *
     * The services are queried in the order chunking, analysis, sync, search.
     *
     * @return array<string, mixed> Keys: chunks, analysis, qdrant, taxonomy_categories.
     */
    public function getStats(): array
    {
        return [
            'chunks' => $this->chunking->getStats(),
            'analysis' => $this->analysis->getStats(),
            'qdrant' => $this->sync->getStats(),
            'taxonomy_categories' => $this->search->getTaxonomyCategories(),
        ];
    }

    /**
     * Delegates a query to the search service and returns its result unchanged.
     *
     * @param string $query   Search query, forwarded as-is.
     * @param array  $filters Filters, forwarded as-is.
     * @param int    $limit   Limit argument, forwarded as-is (defaults to 10).
     * @return array Result of the search service's search() call.
     */
    public function search(string $query, array $filters = [], int $limit = 10): array
    {
        $matches = $this->search->search($query, $filters, $limit);

        return $matches;
    }

    /**
     * Delegates re-chunking of a single document to the chunking service.
     *
     * @param int $docId Identifier passed to chunkDocument().
     * @return array{chunks_created: int, tokens_total: int}
     */
    public function rechunkDocument(int $docId): array
    {
        $outcome = $this->chunking->chunkDocument($docId);

        return $outcome;
    }

    /**
     * Delegates re-analysis of a single chunk to the analysis service.
     *
     * @param int $chunkId Identifier passed to analyzeChunk().
     * @return array{taxonomy: array, entities: array, keywords: array}
     */
    public function reanalyzeChunk(int $chunkId): array
    {
        $outcome = $this->analysis->analyzeChunk($chunkId);

        return $outcome;
    }
}