Security > Firewall) * - Ontology: Entity extraction (technologies, concepts, configurations) * - Semantics: Keywords and glossary terms */
final class ChunkAnalysisService
{
    use JsonDecodeTrait;

    /** Model name passed to the Ollama service and recorded in `analysis_model`. */
    private const string TAXONOMY_MODEL = 'gemma3:4b-it-qat';

    /** Maximum number of LLM attempts per prompt. */
    private const int MAX_RETRIES = 3;

    /** A progress line is printed every time this many chunks have been analyzed. */
    private const int BATCH_SIZE = 10;

    /** Base pause between LLM attempts in microseconds; multiplied by the attempt number. */
    private const int RETRY_DELAY_US = 500000;

    /** Maximum number of taxonomy levels taken from a document path. */
    private const int MAX_PATH_TAXONOMY_DEPTH = 3;

    /** Display names for well-known path segments (lookup is case-sensitive). */
    private const array PATH_CATEGORY_MAP = [
        'server' => 'Server',
        'modul' => 'Module',
        'anwendungen' => 'Anwendungen',
        'mcp' => 'MCP-Server',
        'ki-tasks' => 'KI-Tasks',
    ];

    private readonly PDO $pdo;

    private readonly OllamaService $ollama;

    /**
     * Both collaborators are optional so existing `new ChunkAnalysisService()` call sites
     * keep working; passing them explicitly allows substituting test doubles.
     */
    public function __construct(?OllamaService $ollama = null, ?PDO $pdo = null)
    {
        $this->ollama = $ollama ?? new OllamaService();
        $this->pdo = $pdo ?? $this->createConnection();
    }

    /**
     * Analyzes a single chunk and persists the result.
     *
     * The chunk is marked `processing` first. On success the results are stored and the
     * status becomes `completed`; on any error the status becomes `failed` (with the
     * error message) and the original exception is rethrown unchanged.
     *
     * @return array{taxonomy: list<string>, entities: list<array{name: string, type: string}>, keywords: list<string>}
     *
     * @throws RuntimeException If the chunk does not exist or the analysis fails.
     */
    public function analyzeChunk(int $chunkId): array
    {
        $chunk = $this->getChunk($chunkId)
            ?? throw new RuntimeException("Chunk #{$chunkId} not found");

        $this->updateStatus($chunkId, 'processing');

        try {
            $docContext = $this->getDocumentContext((int) $chunk['dokumentation_id']);
            $analysis = $this->performAnalysis($chunk, $docContext);
            $this->storeAnalysisResults($chunkId, $analysis);

            return $analysis;
        } catch (\Throwable $e) {
            // Catch every throwable, not just RuntimeException: getPendingChunks() only
            // selects 'pending' rows, so a chunk left in 'processing' would never be retried.
            $this->updateStatus($chunkId, 'failed', $e->getMessage());

            throw $e;
        }
    }

    /**
     * Analyzes up to `$limit` pending chunks, one after another.
     *
     * A RuntimeException from one chunk is recorded and does not stop the run.
     * Progress is echoed every BATCH_SIZE successfully analyzed chunks.
     *
     * @return array{analyzed: int, failed: int, errors: list<string>}
     */
    public function analyzeAllPending(int $limit = 100): array
    {
        $results = ['analyzed' => 0, 'failed' => 0, 'errors' => []];

        // A zero or negative limit must not reach the SQL LIMIT clause.
        if ($limit < 1) {
            return $results;
        }

        foreach ($this->getPendingChunks($limit) as $chunk) {
            try {
                $this->analyzeChunk((int) $chunk['id']);
                $results['analyzed']++;

                if ($results['analyzed'] % self::BATCH_SIZE === 0) {
                    echo "Analyzed {$results['analyzed']} chunks...\n";
                }
            } catch (RuntimeException $e) {
                $results['failed']++;
                $results['errors'][] = "Chunk #{$chunk['id']}: " . $e->getMessage();
            }
        }

        return $results;
    }

    /**
     * Builds the prompt for one chunk, queries the LLM and parses its answer.
     *
     * If the answer contains no usable taxonomy, one is derived from the document path.
     *
     * @param array<string, mixed>               $chunk      Row from `dokumentation_chunks`.
     * @param array{title: string, path: string} $docContext Title and path of the owning document.
     *
     * @return array{taxonomy: list<string>, entities: list<array{name: string, type: string}>, keywords: list<string>}
     */
    private function performAnalysis(array $chunk, array $docContext): array
    {
        // Prefer the cleaned text; fall back to the raw content.
        $content = $chunk['content_clean'] ?? $chunk['content'] ?? '';
        $headingPath = $this->decodeJsonArray($chunk['heading_path'] ?? null);

        $context = sprintf(
            "Dokument: %s\nPfad: %s\nAbschnitt: %s\n\nInhalt:\n%s",
            $docContext['title'],
            $docContext['path'],
            implode(' > ', $headingPath),
            $content,
        );

        // One combined prompt covers taxonomy, entities and keywords in a single LLM call.
        $response = $this->callLlmWithRetry($this->buildAnalysisPrompt($context), self::TAXONOMY_MODEL);
        $analysis = $this->parseAnalysisResponse($response);

        if ($analysis['taxonomy'] === []) {
            $analysis['taxonomy'] = $this->deriveTaxonomyFromPath($docContext['path']);
        }

        return $analysis;
    }

    /**
     * Builds the (German) analysis prompt around the given chunk context.
     */
    private function buildAnalysisPrompt(string $context): string
    {
        // NOTE(review): in the reviewed source everything between the heredoc opener and
        // "Server >" was missing (it looks like a markup stripper removed it), leaving
        // invalid syntax. The lines from "- entities:" onward are original; the text above
        // them is a reconstruction. Restore the original wording from version control if
        // it is available, and confirm the entity type names.
        return <<<PROMPT
            Analysiere den folgenden Dokumentationsabschnitt und extrahiere strukturierte Metadaten.

            {$context}

            Antworte im folgenden JSON-Format:
            {
              "taxonomy": ["Hauptkategorie", "Unterkategorie", "Thema"],
              "entities": [{"name": "Bezeichnung", "type": "TECHNOLOGY|CONCEPT|CONFIG|COMMAND|SERVICE"}],
              "keywords": ["Begriff1", "Begriff2", "Begriff3"]
            }

            Regeln:
            - taxonomy: Hierarchische Einordnung (z.B. Server > Modul > Thema)
            - entities: Wichtige Technologien, Konzepte, Konfigurationen, Befehle, Dienste
            - keywords: 3-5 relevante Suchbegriffe
            - Antworte NUR mit dem JSON, keine anderen Texte
            PROMPT;
    }

    /**
     * Calls the LLM, retrying on RuntimeException with a linearly growing pause.
     *
     * @throws RuntimeException After MAX_RETRIES failed attempts; the last failure is
     *                          attached as the previous exception.
     */
    private function callLlmWithRetry(string $prompt, string $model): string
    {
        $lastError = null;

        for ($attempt = 1; $attempt <= self::MAX_RETRIES; $attempt++) {
            try {
                return $this->ollama->generate($prompt, $model);
            } catch (RuntimeException $e) {
                $lastError = $e;

                // No pause after the final attempt; we are about to give up anyway.
                if ($attempt < self::MAX_RETRIES) {
                    usleep(self::RETRY_DELAY_US * $attempt);
                }
            }
        }

        throw new RuntimeException(
            'LLM call failed after ' . self::MAX_RETRIES . ' attempts: '
                . ($lastError?->getMessage() ?? 'Unknown error'),
            previous: $lastError,
        );
    }

    /**
     * Parses the LLM response into validated, structured data.
     *
     * Returns empty lists for all three keys when no JSON can be decoded.
     *
     * @return array{taxonomy: list<string>, entities: list<array{name: string, type: string}>, keywords: list<string>}
     */
    private function parseAnalysisResponse(string $response): array
    {
        $decoded = $this->decodeJsonPayload($response);

        if ($decoded === null) {
            return ['taxonomy' => [], 'entities' => [], 'keywords' => []];
        }

        return [
            'taxonomy' => $this->validateStringList($decoded['taxonomy'] ?? []),
            'entities' => $this->validateEntities($decoded['entities'] ?? []),
            'keywords' => $this->validateStringList($decoded['keywords'] ?? []),
        ];
    }

    /**
     * Finds and decodes the JSON payload inside a free-form LLM response.
     *
     * Candidates are tried in order: the content of a Markdown code fence, the span from
     * the first `{` to the last `}`, and finally the raw response.
     *
     * @return array<array-key, mixed>|null Decoded array, or null if no candidate decodes to one.
     */
    private function decodeJsonPayload(string $response): ?array
    {
        $candidates = [];

        if (preg_match('/```(?:json)?\s*([\s\S]*?)\s*```/', $response, $fenced) === 1) {
            $candidates[] = $fenced[1];
        }

        if (preg_match('/\{[\s\S]*\}/', $response, $braced) === 1) {
            $candidates[] = $braced[0];
        }

        $candidates[] = $response;

        foreach ($candidates as $candidate) {
            $decoded = json_decode($candidate, true);

            if (is_array($decoded)) {
                return $decoded;
            }
        }

        return null;
    }

    /**
     * Reduces an arbitrary decoded value to a list of trimmed, non-empty strings.
     *
     * @return list<string>
     */
    private function validateStringList(mixed $value): array
    {
        if (!is_array($value)) {
            return [];
        }

        $strings = [];

        foreach ($value as $item) {
            if (!is_string($item)) {
                continue;
            }

            $item = trim($item);

            if ($item !== '') {
                $strings[] = $item;
            }
        }

        return $strings;
    }

    /**
     * Reduces an arbitrary decoded value to a list of well-formed entities.
     *
     * An entity needs a non-blank string `name`. `type` is trimmed and upper-cased and
     * defaults to `OTHER` when it is missing, not a string, or blank.
     *
     * @return list<array{name: string, type: string}>
     */
    private function validateEntities(mixed $entities): array
    {
        if (!is_array($entities)) {
            return [];
        }

        $result = [];

        foreach ($entities as $entity) {
            if (!is_array($entity) || !isset($entity['name']) || !is_string($entity['name'])) {
                continue;
            }

            $name = trim($entity['name']);

            if ($name === '') {
                continue;
            }

            $type = isset($entity['type']) && is_string($entity['type'])
                ? strtoupper(trim($entity['type']))
                : '';

            $result[] = [
                'name' => $name,
                'type' => $type !== '' ? $type : 'OTHER',
            ];
        }

        return $result;
    }

    /**
     * Derives a fallback taxonomy from the first segments of a document path.
     *
     * Known segments are mapped via PATH_CATEGORY_MAP; all others get their first
     * character upper-cased.
     *
     * @return list<string> At most MAX_PATH_TAXONOMY_DEPTH entries.
     */
    private function deriveTaxonomyFromPath(string $path): array
    {
        // Explicit callback: the default array_filter() would also drop a "0" segment.
        $segments = array_filter(
            explode('/', trim($path, '/')),
            static fn (string $segment): bool => $segment !== '',
        );

        $taxonomy = array_map(
            static fn (string $segment): string => self::PATH_CATEGORY_MAP[$segment] ?? ucfirst($segment),
            $segments,
        );

        return array_slice(array_values($taxonomy), 0, self::MAX_PATH_TAXONOMY_DEPTH);
    }

    /**
     * Writes the analysis results to the chunk row and marks it `completed`.
     *
     * @param array{taxonomy: list<string>, entities: list<array{name: string, type: string}>, keywords: list<string>} $analysis
     *
     * @throws RuntimeException If the results cannot be encoded as JSON.
     */
    private function storeAnalysisResults(int $chunkId, array $analysis): void
    {
        $taxonomyPath = $analysis['taxonomy'];

        try {
            $params = [
                'id' => $chunkId,
                // The first taxonomy level doubles as the category.
                'category' => $taxonomyPath[0] ?? null,
                'taxonomy' => json_encode($taxonomyPath, JSON_THROW_ON_ERROR),
                'entities' => json_encode($analysis['entities'], JSON_THROW_ON_ERROR),
                'keywords' => json_encode($analysis['keywords'], JSON_THROW_ON_ERROR),
                'model' => self::TAXONOMY_MODEL,
            ];
        } catch (\JsonException $e) {
            // Re-wrapped so callers that catch RuntimeException keep seeing this failure.
            throw new RuntimeException(
                "Failed to encode analysis results for chunk #{$chunkId}: " . $e->getMessage(),
                previous: $e,
            );
        }

        $sql = "UPDATE dokumentation_chunks SET
                    taxonomy_category = :category,
                    taxonomy_path = :taxonomy,
                    entities = :entities,
                    keywords = :keywords,
                    analysis_model = :model,
                    analysis_status = 'completed',
                    analysis_error = NULL,
                    analyzed_at = NOW()
                WHERE id = :id";

        $this->pdo->prepare($sql)->execute($params);
    }

    /**
     * Sets the analysis status (and optional error text) of a chunk.
     */
    private function updateStatus(int $chunkId, string $status, ?string $error = null): void
    {
        $stmt = $this->pdo->prepare(
            'UPDATE dokumentation_chunks SET analysis_status = :status, analysis_error = :error WHERE id = :id',
        );
        $stmt->execute(['id' => $chunkId, 'status' => $status, 'error' => $error]);
    }

    /**
     * Loads a chunk row by ID.
     *
     * @return array<string, mixed>|null The row, or null if no chunk has this ID.
     */
    private function getChunk(int $id): ?array
    {
        $stmt = $this->pdo->prepare('SELECT * FROM dokumentation_chunks WHERE id = :id');
        $stmt->execute(['id' => $id]);

        $row = $stmt->fetch(PDO::FETCH_ASSOC);

        return $row === false ? null : $row;
    }

    /**
     * Loads up to `$limit` chunks with status `pending`, ordered by document and chunk index.
     *
     * @return list<array<string, mixed>>
     */
    private function getPendingChunks(int $limit): array
    {
        $stmt = $this->pdo->prepare("
            SELECT *
            FROM dokumentation_chunks
            WHERE analysis_status = 'pending'
            ORDER BY dokumentation_id, chunk_index
            LIMIT :limit
        ");
        // LIMIT needs an integer binding; execute() with an array would bind it as a string.
        $stmt->bindValue('limit', $limit, PDO::PARAM_INT);
        $stmt->execute();

        return $stmt->fetchAll(PDO::FETCH_ASSOC);
    }

    /**
     * Loads title and path of a document, with placeholders if it is missing.
     *
     * @return array{title: string, path: string}
     */
    private function getDocumentContext(int $docId): array
    {
        $stmt = $this->pdo->prepare('SELECT title, path FROM dokumentation WHERE id = :id');
        $stmt->execute(['id' => $docId]);

        $row = $stmt->fetch(PDO::FETCH_ASSOC);

        if ($row === false) {
            $row = [];
        }

        return [
            'title' => (string) ($row['title'] ?? 'Unbekannt'),
            'path' => (string) ($row['path'] ?? '/'),
        ];
    }

    /**
     * Returns chunk counts per analysis status plus the chunk count per taxonomy category.
     *
     * @return array{pending: int, processing: int, completed: int, failed: int, by_category: list<array<string, mixed>>}
     */
    public function getStats(): array
    {
        $stmt = $this->pdo->query("
            SELECT
                SUM(CASE WHEN analysis_status = 'pending' THEN 1 ELSE 0 END) as pending,
                SUM(CASE WHEN analysis_status = 'processing' THEN 1 ELSE 0 END) as processing,
                SUM(CASE WHEN analysis_status = 'completed' THEN 1 ELSE 0 END) as completed,
                SUM(CASE WHEN analysis_status = 'failed' THEN 1 ELSE 0 END) as failed
            FROM dokumentation_chunks
        ");
        $counts = $stmt->fetch(PDO::FETCH_ASSOC);

        // SUM() over an empty table yields NULL columns; treat those (and a missing row) as 0.
        if ($counts === false) {
            $counts = [];
        }

        $stmt = $this->pdo->query('
            SELECT taxonomy_category as category, COUNT(*) as count
            FROM dokumentation_chunks
            WHERE taxonomy_category IS NOT NULL
            GROUP BY taxonomy_category
            ORDER BY count DESC
        ');
        $byCategory = $stmt->fetchAll(PDO::FETCH_ASSOC);

        return [
            'pending' => (int) ($counts['pending'] ?? 0),
            'processing' => (int) ($counts['processing'] ?? 0),
            'completed' => (int) ($counts['completed'] ?? 0),
            'failed' => (int) ($counts['failed'] ?? 0),
            'by_category' => $byCategory,
        ];
    }

    /**
     * Creates the default database connection via the project's DatabaseFactory.
     */
    private function createConnection(): PDO
    {
        return \Infrastructure\Config\DatabaseFactory::dev();
    }
}